/*
 * Copyright (c) 2003 Matteo Frigo
 * Copyright (c) 2003 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sat Apr 19 18:02:43 EDT 2003 */

#include "codelet-dft.h"

/* Generated by: /homea/athena/fftw3/genfft/gen_notw_noinline_c -simd -trivial-stores -compact -variables 4 -sign 1 -n 64 -name m1bv_64 -include n1b.h */

/*
 * This function contains 456 FP additions, 124 FP multiplications,
 * (or, 404 additions, 72 multiplications, 52 fused multiply/add),
 * 111 stack variables, and 128 memory accesses
 */
/*
 * Generator Id's : 
 * $Id: algsimp.ml,v 1.7 2003/03/15 20:29:42 stevenj Exp $
 * $Id: fft.ml,v 1.2 2003/03/15 20:29:42 stevenj Exp $
 * $Id: gen_notw_noinline_c.ml,v 1.1 2003/04/17 10:44:21 athena Exp $
 */

#include "n1b.h"

/*
 * Straight-line kernel for a single 64-point transform (generator flags
 * shown above: -n 64, -sign 1).
 *
 * Reads the 64 inputs xi[WS(is, k)], k = 0..63, through LD and writes the
 * 64 outputs xo[WS(os, k)], k = 0..63, through ST.  Every input index is
 * loaded exactly once and every output index is stored exactly once.
 * ivs and ovs are not used for arithmetic here; they are only forwarded to
 * LD and ST.
 *
 * The body has two phases:
 *   1. four load sections, one per residue class of the input index
 *      modulo 4, each producing intermediates declared at function scope;
 *   2. ten store sections that combine those intermediates and write a
 *      fixed group of output indices each.
 *
 * NOTE(review): V, LD, ST, DVK, LDK, VADD, VSUB, VMUL, VFMA, VFNMS, VBYI
 * and WS are defined in the included headers, not in this file.  They are
 * presumably the SIMD vector type, vector load/store, constant helpers,
 * element-wise arithmetic, fused multiply-add variants, multiplication by
 * the imaginary unit and strided indexing -- confirm against those headers.
 */
static void m1bv_64_0(const R *xi, R *xo, stride is, stride os, int ivs, int ovs)
{
     /*
      * Twiddle constants.  Numerically these are cosine/sine pairs of
      * multiples of pi/32:
      *   KP995184726 / KP098017140 = cos / sin of  pi/32
      *   KP980785280 / KP195090322 = cos / sin of 2pi/32
      *   KP956940335 / KP290284677 = cos / sin of 3pi/32
      *   KP923879532 / KP382683432 = cos / sin of 4pi/32
      *   KP881921264 / KP471396736 = cos / sin of 5pi/32
      *   KP831469612 / KP555570233 = cos / sin of 6pi/32
      *   KP773010453 / KP634393284 = cos / sin of 7pi/32
      *   KP707106781               = cos = sin of 8pi/32 (sqrt(1/2))
      */
     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
	  /* Intermediates that survive from the load sections to the store sections. */
	  V Tr, T2P, T7n, T83, T7s, T82, T2r, T31, T3N, T4r, T53, T6p, T61, T6B, T3S;
	  V T4u, T7Y, T7Z, TI, T2g, T7k, T7t, TZ, T2h, T3J, T3U, T5W, T6q, T3G, T3T;
	  V T5i, T6A, T7R, T7S, T7T, T5z, T6t, T75, T7v, T1s, T2S, T1B, T2T, T3u, T4k;
	  V T5u, T6s, T3r, T4j, T7U, T7V, T7W, T5Q, T6w, T7c, T7w, T23, T2V, T2c, T2W;
	  V T3B, T4n, T5L, T6v, T3y, T4m;
	  /*
	   * Load section 1: the 16 inputs whose index is a multiple of 4.
	   * Inputs are combined in pairs whose indices differ by 32
	   * (sum and difference of each pair).
	   */
	  {
	       V T3, T4T, T2p, T4U, Ta, T5Z, T2m, T5Y, T4W, T4X, Ti, T4Y, T2j, T50, T4Z;
	       V Tp, T51, T2k;
	       {
		    V T1, T2, T2n, T2o;
		    T1 = LD(&(xi[0]), ivs, &(xi[0]));
		    T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
		    T3 = VSUB(T1, T2);
		    T4T = VADD(T1, T2);
		    T2n = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
		    T2o = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
		    T2p = VSUB(T2n, T2o);
		    T4U = VADD(T2n, T2o);
	       }
	       {
		    V T4, T5, T6, T7, T8, T9;
		    T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
		    T5 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
		    T6 = VSUB(T4, T5);
		    T7 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
		    T8 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
		    T9 = VSUB(T7, T8);
		    Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
		    T5Z = VADD(T7, T8);
		    T2m = VMUL(LDK(KP707106781), VSUB(T6, T9));
		    T5Y = VADD(T4, T5);
	       }
	       {
		    V Te, Th, Tl, To;
		    {
			 V Tc, Td, Tf, Tg;
			 Tc = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
			 Td = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
			 Te = VSUB(Tc, Td);
			 T4W = VADD(Tc, Td);
			 Tf = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
			 Tg = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
			 Th = VSUB(Tf, Tg);
			 T4X = VADD(Tf, Tg);
		    }
		    Ti = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
		    T4Y = VSUB(T4W, T4X);
		    T2j = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
		    {
			 V Tj, Tk, Tm, Tn;
			 Tj = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
			 Tk = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
			 Tl = VSUB(Tj, Tk);
			 T50 = VADD(Tj, Tk);
			 Tm = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
			 Tn = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
			 To = VSUB(Tm, Tn);
			 T4Z = VADD(Tm, Tn);
		    }
		    Tp = VFNMS(LDK(KP382683432), To, VMUL(LDK(KP923879532), Tl));
		    T51 = VSUB(T4Z, T50);
		    T2k = VFMA(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
	       }
	       {
		    V Tb, Tq, T7l, T7m;
		    Tb = VSUB(T3, Ta);
		    Tq = VSUB(Ti, Tp);
		    Tr = VSUB(Tb, Tq);
		    T2P = VADD(Tb, Tq);
		    T7l = VADD(T4W, T4X);
		    T7m = VADD(T4Z, T50);
		    T7n = VSUB(T7l, T7m);
		    T83 = VADD(T7l, T7m);
	       }
	       {
		    V T7q, T7r, T2l, T2q;
		    T7q = VADD(T4T, T4U);
		    T7r = VADD(T5Y, T5Z);
		    T7s = VSUB(T7q, T7r);
		    T82 = VADD(T7q, T7r);
		    T2l = VSUB(T2j, T2k);
		    T2q = VSUB(T2m, T2p);
		    T2r = VSUB(T2l, T2q);
		    T31 = VADD(T2q, T2l);
	       }
	       {
		    V T3L, T3M, T4V, T52;
		    T3L = VADD(T2p, T2m);
		    T3M = VADD(Ti, Tp);
		    T3N = VADD(T3L, T3M);
		    T4r = VSUB(T3M, T3L);
		    T4V = VSUB(T4T, T4U);
		    T52 = VMUL(LDK(KP707106781), VADD(T4Y, T51));
		    T53 = VSUB(T4V, T52);
		    T6p = VADD(T4V, T52);
	       }
	       {
		    V T5X, T60, T3Q, T3R;
		    T5X = VMUL(LDK(KP707106781), VSUB(T4Y, T51));
		    T60 = VSUB(T5Y, T5Z);
		    T61 = VSUB(T5X, T60);
		    T6B = VADD(T60, T5X);
		    T3Q = VADD(T3, Ta);
		    T3R = VADD(T2j, T2k);
		    T3S = VADD(T3Q, T3R);
		    T4u = VSUB(T3Q, T3R);
	       }
	  }
	  /*
	   * Load section 2: the 16 inputs whose index is congruent to
	   * 2 modulo 4, again paired at a distance of 32.
	   */
	  {
	       V TB, T7e, TF, T56, TS, T7h, TW, T5g, Ty, T7f, TG, T59, TP, T7i, TX;
	       V T5d, T5U, T5V;
	       {
		    V Tz, TA, T55, TD, TE, T54;
		    Tz = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
		    TA = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
		    T55 = VADD(Tz, TA);
		    TD = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
		    TE = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
		    T54 = VADD(TD, TE);
		    TB = VSUB(Tz, TA);
		    T7e = VADD(T54, T55);
		    TF = VSUB(TD, TE);
		    T56 = VSUB(T54, T55);
	       }
	       {
		    V TQ, TR, T5f, TU, TV, T5e;
		    TQ = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
		    TR = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
		    T5f = VADD(TQ, TR);
		    TU = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
		    TV = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
		    T5e = VADD(TU, TV);
		    TS = VSUB(TQ, TR);
		    T7h = VADD(T5e, T5f);
		    TW = VSUB(TU, TV);
		    T5g = VSUB(T5e, T5f);
	       }
	       {
		    V Tu, T57, Tx, T58;
		    {
			 V Ts, Tt, Tv, Tw;
			 Ts = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
			 Tt = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
			 Tu = VSUB(Ts, Tt);
			 T57 = VADD(Ts, Tt);
			 Tv = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
			 Tw = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
			 Tx = VSUB(Tv, Tw);
			 T58 = VADD(Tv, Tw);
		    }
		    Ty = VMUL(LDK(KP707106781), VSUB(Tu, Tx));
		    T7f = VADD(T57, T58);
		    TG = VMUL(LDK(KP707106781), VADD(Tu, Tx));
		    T59 = VSUB(T57, T58);
	       }
	       {
		    V TL, T5b, TO, T5c;
		    {
			 V TJ, TK, TM, TN;
			 TJ = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
			 TK = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
			 TL = VSUB(TJ, TK);
			 T5b = VADD(TJ, TK);
			 TM = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
			 TN = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
			 TO = VSUB(TM, TN);
			 T5c = VADD(TM, TN);
		    }
		    TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
		    T7i = VADD(T5b, T5c);
		    TX = VMUL(LDK(KP707106781), VADD(TL, TO));
		    T5d = VSUB(T5b, T5c);
	       }
	       T7Y = VADD(T7e, T7f);
	       T7Z = VADD(T7h, T7i);
	       {
		    V TC, TH, T7g, T7j;
		    TC = VSUB(Ty, TB);
		    TH = VSUB(TF, TG);
		    TI = VFMA(LDK(KP831469612), TC, VMUL(LDK(KP555570233), TH));
		    T2g = VFNMS(LDK(KP555570233), TC, VMUL(LDK(KP831469612), TH));
		    T7g = VSUB(T7e, T7f);
		    T7j = VSUB(T7h, T7i);
		    T7k = VMUL(LDK(KP707106781), VSUB(T7g, T7j));
		    T7t = VMUL(LDK(KP707106781), VADD(T7g, T7j));
	       }
	       {
		    V TT, TY, T3H, T3I;
		    TT = VSUB(TP, TS);
		    TY = VSUB(TW, TX);
		    TZ = VFNMS(LDK(KP555570233), TY, VMUL(LDK(KP831469612), TT));
		    T2h = VFMA(LDK(KP555570233), TT, VMUL(LDK(KP831469612), TY));
		    T3H = VADD(TS, TP);
		    T3I = VADD(TW, TX);
		    T3J = VFNMS(LDK(KP195090322), T3I, VMUL(LDK(KP980785280), T3H));
		    T3U = VFMA(LDK(KP195090322), T3H, VMUL(LDK(KP980785280), T3I));
	       }
	       T5U = VFNMS(LDK(KP382683432), T59, VMUL(LDK(KP923879532), T56));
	       T5V = VFMA(LDK(KP923879532), T5g, VMUL(LDK(KP382683432), T5d));
	       T5W = VSUB(T5U, T5V);
	       T6q = VADD(T5U, T5V);
	       {
		    V T3E, T3F, T5a, T5h;
		    T3E = VADD(TB, Ty);
		    T3F = VADD(TF, TG);
		    T3G = VFMA(LDK(KP980785280), T3E, VMUL(LDK(KP195090322), T3F));
		    T3T = VFNMS(LDK(KP195090322), T3E, VMUL(LDK(KP980785280), T3F));
		    T5a = VFMA(LDK(KP382683432), T56, VMUL(LDK(KP923879532), T59));
		    T5h = VFNMS(LDK(KP382683432), T5g, VMUL(LDK(KP923879532), T5d));
		    T5i = VSUB(T5a, T5h);
		    T6A = VADD(T5a, T5h);
	       }
	  }
	  /*
	   * Load section 3: the 16 inputs whose index is congruent to
	   * 1 modulo 4.  For odd indices the third LD argument is
	   * &(xi[WS(is, 1)]) instead of &(xi[0]).
	   */
	  {
	       V T5w, T5v, T1q, T6Z, T1v, T5r, T5s, T1n, T70, T1w, T18, T72, T1y, T5m, T1f;
	       V T73, T1z, T5p, T1j, T1m, T5x, T5y;
	       {
		    V T1o, T1p, T1t, T1u;
		    T1o = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
		    T1p = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
		    T5w = VADD(T1o, T1p);
		    T1t = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
		    T1u = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
		    T5v = VADD(T1t, T1u);
		    T1q = VSUB(T1o, T1p);
		    T6Z = VADD(T5v, T5w);
		    T1v = VSUB(T1t, T1u);
	       }
	       {
		    V T1h, T1i, T1k, T1l;
		    T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
		    T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
		    T1j = VSUB(T1h, T1i);
		    T5r = VADD(T1h, T1i);
		    T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
		    T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
		    T1m = VSUB(T1k, T1l);
		    T5s = VADD(T1k, T1l);
	       }
	       T1n = VMUL(LDK(KP707106781), VSUB(T1j, T1m));
	       T70 = VADD(T5r, T5s);
	       T1w = VMUL(LDK(KP707106781), VADD(T1j, T1m));
	       {
		    V T14, T5k, T17, T5l;
		    {
			 V T12, T13, T15, T16;
			 T12 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
			 T13 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
			 T14 = VSUB(T12, T13);
			 T5k = VADD(T12, T13);
			 T15 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
			 T16 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
			 T17 = VSUB(T15, T16);
			 T5l = VADD(T15, T16);
		    }
		    T18 = VFNMS(LDK(KP382683432), T17, VMUL(LDK(KP923879532), T14));
		    T72 = VADD(T5k, T5l);
		    T1y = VFMA(LDK(KP382683432), T14, VMUL(LDK(KP923879532), T17));
		    T5m = VSUB(T5k, T5l);
	       }
	       {
		    V T1b, T5n, T1e, T5o;
		    {
			 V T19, T1a, T1c, T1d;
			 T19 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
			 T1a = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
			 T1b = VSUB(T19, T1a);
			 T5n = VADD(T19, T1a);
			 T1c = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
			 T1d = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
			 T1e = VSUB(T1c, T1d);
			 T5o = VADD(T1c, T1d);
		    }
		    T1f = VFMA(LDK(KP923879532), T1b, VMUL(LDK(KP382683432), T1e));
		    T73 = VADD(T5n, T5o);
		    T1z = VFNMS(LDK(KP382683432), T1b, VMUL(LDK(KP923879532), T1e));
		    T5p = VSUB(T5n, T5o);
	       }
	       T7R = VADD(T6Z, T70);
	       T7S = VADD(T72, T73);
	       T7T = VSUB(T7R, T7S);
	       T5x = VSUB(T5v, T5w);
	       T5y = VMUL(LDK(KP707106781), VADD(T5m, T5p));
	       T5z = VSUB(T5x, T5y);
	       T6t = VADD(T5x, T5y);
	       {
		    V T71, T74, T1g, T1r;
		    T71 = VSUB(T6Z, T70);
		    T74 = VSUB(T72, T73);
		    T75 = VFNMS(LDK(KP382683432), T74, VMUL(LDK(KP923879532), T71));
		    T7v = VFMA(LDK(KP382683432), T71, VMUL(LDK(KP923879532), T74));
		    T1g = VSUB(T18, T1f);
		    T1r = VSUB(T1n, T1q);
		    T1s = VSUB(T1g, T1r);
		    T2S = VADD(T1r, T1g);
	       }
	       {
		    V T1x, T1A, T3s, T3t;
		    T1x = VSUB(T1v, T1w);
		    T1A = VSUB(T1y, T1z);
		    T1B = VSUB(T1x, T1A);
		    T2T = VADD(T1x, T1A);
		    T3s = VADD(T1q, T1n);
		    T3t = VADD(T1y, T1z);
		    T3u = VADD(T3s, T3t);
		    T4k = VSUB(T3t, T3s);
	       }
	       {
		    V T5q, T5t, T3p, T3q;
		    T5q = VMUL(LDK(KP707106781), VSUB(T5m, T5p));
		    T5t = VSUB(T5r, T5s);
		    T5u = VSUB(T5q, T5t);
		    T6s = VADD(T5t, T5q);
		    T3p = VADD(T1v, T1w);
		    T3q = VADD(T18, T1f);
		    T3r = VADD(T3p, T3q);
		    T4j = VSUB(T3p, T3q);
	       }
	  }
	  /*
	   * Load section 4: the 16 inputs whose index is congruent to
	   * 3 modulo 4.  Structurally mirrors load section 3.
	   */
	  {
	       V T5N, T5M, T21, T76, T26, T5I, T5J, T1Y, T77, T27, T1J, T79, T29, T5D, T1Q;
	       V T7a, T2a, T5G, T1U, T1X, T5O, T5P;
	       {
		    V T1Z, T20, T24, T25;
		    T1Z = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
		    T20 = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
		    T5N = VADD(T1Z, T20);
		    T24 = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
		    T25 = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
		    T5M = VADD(T24, T25);
		    T21 = VSUB(T1Z, T20);
		    T76 = VADD(T5M, T5N);
		    T26 = VSUB(T24, T25);
	       }
	       {
		    V T1S, T1T, T1V, T1W;
		    T1S = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
		    T1T = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
		    T1U = VSUB(T1S, T1T);
		    T5I = VADD(T1S, T1T);
		    T1V = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
		    T1W = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
		    T1X = VSUB(T1V, T1W);
		    T5J = VADD(T1V, T1W);
	       }
	       T1Y = VMUL(LDK(KP707106781), VSUB(T1U, T1X));
	       T77 = VADD(T5I, T5J);
	       T27 = VMUL(LDK(KP707106781), VADD(T1U, T1X));
	       {
		    V T1F, T5B, T1I, T5C;
		    {
			 V T1D, T1E, T1G, T1H;
			 T1D = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
			 T1E = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
			 T1F = VSUB(T1D, T1E);
			 T5B = VADD(T1D, T1E);
			 T1G = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
			 T1H = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
			 T1I = VSUB(T1G, T1H);
			 T5C = VADD(T1G, T1H);
		    }
		    T1J = VFNMS(LDK(KP382683432), T1I, VMUL(LDK(KP923879532), T1F));
		    T79 = VADD(T5B, T5C);
		    T29 = VFMA(LDK(KP382683432), T1F, VMUL(LDK(KP923879532), T1I));
		    T5D = VSUB(T5B, T5C);
	       }
	       {
		    V T1M, T5E, T1P, T5F;
		    {
			 V T1K, T1L, T1N, T1O;
			 T1K = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
			 T1L = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
			 T1M = VSUB(T1K, T1L);
			 T5E = VADD(T1K, T1L);
			 T1N = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
			 T1O = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
			 T1P = VSUB(T1N, T1O);
			 T5F = VADD(T1N, T1O);
		    }
		    T1Q = VFMA(LDK(KP923879532), T1M, VMUL(LDK(KP382683432), T1P));
		    T7a = VADD(T5E, T5F);
		    T2a = VFNMS(LDK(KP382683432), T1M, VMUL(LDK(KP923879532), T1P));
		    T5G = VSUB(T5E, T5F);
	       }
	       T7U = VADD(T76, T77);
	       T7V = VADD(T79, T7a);
	       T7W = VSUB(T7U, T7V);
	       T5O = VSUB(T5M, T5N);
	       T5P = VMUL(LDK(KP707106781), VADD(T5D, T5G));
	       T5Q = VSUB(T5O, T5P);
	       T6w = VADD(T5O, T5P);
	       {
		    V T78, T7b, T1R, T22;
		    T78 = VSUB(T76, T77);
		    T7b = VSUB(T79, T7a);
		    T7c = VFMA(LDK(KP923879532), T78, VMUL(LDK(KP382683432), T7b));
		    T7w = VFNMS(LDK(KP382683432), T78, VMUL(LDK(KP923879532), T7b));
		    T1R = VSUB(T1J, T1Q);
		    T22 = VSUB(T1Y, T21);
		    T23 = VSUB(T1R, T22);
		    T2V = VADD(T22, T1R);
	       }
	       {
		    V T28, T2b, T3z, T3A;
		    T28 = VSUB(T26, T27);
		    T2b = VSUB(T29, T2a);
		    T2c = VSUB(T28, T2b);
		    T2W = VADD(T28, T2b);
		    T3z = VADD(T21, T1Y);
		    T3A = VADD(T29, T2a);
		    T3B = VADD(T3z, T3A);
		    T4n = VSUB(T3A, T3z);
	       }
	       {
		    V T5H, T5K, T3w, T3x;
		    T5H = VMUL(LDK(KP707106781), VSUB(T5D, T5G));
		    T5K = VSUB(T5I, T5J);
		    T5L = VSUB(T5H, T5K);
		    T6v = VADD(T5K, T5H);
		    T3w = VADD(T26, T27);
		    T3x = VADD(T1J, T1Q);
		    T3y = VADD(T3w, T3x);
		    T4m = VSUB(T3w, T3x);
	       }
	  }
	  /* Store section: outputs 8, 24, 40, 56. */
	  {
	       V T81, T89, T86, T8a;
	       {
		    V T7X, T80, T84, T85;
		    T7X = VMUL(LDK(KP707106781), VSUB(T7T, T7W));
		    T80 = VSUB(T7Y, T7Z);
		    T81 = VBYI(VSUB(T7X, T80));
		    T89 = VBYI(VADD(T80, T7X));
		    T84 = VSUB(T82, T83);
		    T85 = VMUL(LDK(KP707106781), VADD(T7T, T7W));
		    T86 = VSUB(T84, T85);
		    T8a = VADD(T84, T85);
	       }
	       {
		    V T87, T8c, T88, T8b;
		    T87 = VADD(T81, T86);
		    ST(&(xo[WS(os, 24)]), T87, ovs, &(xo[0]));
		    T8c = VSUB(T8a, T89);
		    ST(&(xo[WS(os, 56)]), T8c, ovs, &(xo[0]));
		    T88 = VSUB(T86, T81);
		    ST(&(xo[WS(os, 40)]), T88, ovs, &(xo[0]));
		    T8b = VADD(T89, T8a);
		    ST(&(xo[WS(os, 8)]), T8b, ovs, &(xo[0]));
	       }
	  }
	  /* Store section: outputs 4, 28, 36, 60. */
	  {
	       V T7H, T7N, T7K, T7O;
	       {
		    V T7F, T7G, T7I, T7J;
		    T7F = VADD(T7s, T7t);
		    T7G = VADD(T75, T7c);
		    T7H = VADD(T7F, T7G);
		    T7N = VSUB(T7F, T7G);
		    T7I = VADD(T7n, T7k);
		    T7J = VADD(T7v, T7w);
		    T7K = VBYI(VADD(T7I, T7J));
		    T7O = VBYI(VSUB(T7J, T7I));
	       }
	       {
		    V T7L, T7Q, T7M, T7P;
		    T7L = VSUB(T7H, T7K);
		    ST(&(xo[WS(os, 60)]), T7L, ovs, &(xo[0]));
		    T7Q = VADD(T7N, T7O);
		    ST(&(xo[WS(os, 28)]), T7Q, ovs, &(xo[0]));
		    T7M = VADD(T7H, T7K);
		    ST(&(xo[WS(os, 4)]), T7M, ovs, &(xo[0]));
		    T7P = VSUB(T7N, T7O);
		    ST(&(xo[WS(os, 36)]), T7P, ovs, &(xo[0]));
	       }
	  }
	  /*
	   * Store section: outputs 0, 16, 32, 48.  Output 0 (T8o) is the
	   * plain sum of the four per-residue-class totals.
	   */
	  {
	       V T8f, T8l, T8i, T8m;
	       {
		    V T8d, T8e, T8g, T8h;
		    T8d = VADD(T82, T83);
		    T8e = VADD(T7Y, T7Z);
		    T8f = VSUB(T8d, T8e);
		    T8l = VADD(T8d, T8e);
		    T8g = VADD(T7R, T7S);
		    T8h = VADD(T7U, T7V);
		    T8i = VBYI(VSUB(T8g, T8h));
		    T8m = VADD(T8g, T8h);
	       }
	       {
		    V T8j, T8o, T8k, T8n;
		    T8j = VSUB(T8f, T8i);
		    ST(&(xo[WS(os, 48)]), T8j, ovs, &(xo[0]));
		    T8o = VADD(T8l, T8m);
		    ST(&(xo[0]), T8o, ovs, &(xo[0]));
		    T8k = VADD(T8f, T8i);
		    ST(&(xo[WS(os, 16)]), T8k, ovs, &(xo[0]));
		    T8n = VSUB(T8l, T8m);
		    ST(&(xo[WS(os, 32)]), T8n, ovs, &(xo[0]));
	       }
	  }
	  /* Store section: outputs 12, 20, 44, 52. */
	  {
	       V T7p, T7B, T7y, T7C;
	       {
		    V T7d, T7o, T7u, T7x;
		    T7d = VSUB(T75, T7c);
		    T7o = VSUB(T7k, T7n);
		    T7p = VBYI(VSUB(T7d, T7o));
		    T7B = VBYI(VADD(T7o, T7d));
		    T7u = VSUB(T7s, T7t);
		    T7x = VSUB(T7v, T7w);
		    T7y = VSUB(T7u, T7x);
		    T7C = VADD(T7u, T7x);
	       }
	       {
		    V T7z, T7E, T7A, T7D;
		    T7z = VADD(T7p, T7y);
		    ST(&(xo[WS(os, 20)]), T7z, ovs, &(xo[0]));
		    T7E = VSUB(T7C, T7B);
		    ST(&(xo[WS(os, 52)]), T7E, ovs, &(xo[0]));
		    T7A = VSUB(T7y, T7p);
		    ST(&(xo[WS(os, 44)]), T7A, ovs, &(xo[0]));
		    T7D = VADD(T7B, T7C);
		    ST(&(xo[WS(os, 12)]), T7D, ovs, &(xo[0]));
	       }
	  }
	  /* Store section: outputs 6, 10, 22, 26, 38, 42, 54, 58. */
	  {
	       V T5T, T6l, T69, T6f, T66, T6m, T6a, T6i;
	       {
		    V T5j, T6e, T5S, T6d, T5A, T5R;
		    T5j = VSUB(T53, T5i);
		    T6e = VADD(T61, T5W);
		    T5A = VFMA(LDK(KP831469612), T5u, VMUL(LDK(KP555570233), T5z));
		    T5R = VFNMS(LDK(KP555570233), T5Q, VMUL(LDK(KP831469612), T5L));
		    T5S = VSUB(T5A, T5R);
		    T6d = VADD(T5A, T5R);
		    T5T = VADD(T5j, T5S);
		    T6l = VBYI(VADD(T6e, T6d));
		    T69 = VSUB(T5j, T5S);
		    T6f = VBYI(VSUB(T6d, T6e));
	       }
	       {
		    V T62, T6g, T65, T6h, T63, T64;
		    T62 = VSUB(T5W, T61);
		    T6g = VADD(T53, T5i);
		    T63 = VFNMS(LDK(KP555570233), T5u, VMUL(LDK(KP831469612), T5z));
		    T64 = VFMA(LDK(KP555570233), T5L, VMUL(LDK(KP831469612), T5Q));
		    T65 = VSUB(T63, T64);
		    T6h = VADD(T63, T64);
		    T66 = VBYI(VADD(T62, T65));
		    T6m = VADD(T6g, T6h);
		    T6a = VBYI(VSUB(T65, T62));
		    T6i = VSUB(T6g, T6h);
	       }
	       {
		    V T67, T6n, T6o, T68;
		    T67 = VSUB(T5T, T66);
		    ST(&(xo[WS(os, 54)]), T67, ovs, &(xo[0]));
		    T6n = VADD(T6l, T6m);
		    ST(&(xo[WS(os, 6)]), T6n, ovs, &(xo[0]));
		    T6o = VSUB(T6m, T6l);
		    ST(&(xo[WS(os, 58)]), T6o, ovs, &(xo[0]));
		    T68 = VADD(T5T, T66);
		    ST(&(xo[WS(os, 10)]), T68, ovs, &(xo[0]));
	       }
	       {
		    V T6b, T6j, T6k, T6c;
		    T6b = VSUB(T69, T6a);
		    ST(&(xo[WS(os, 42)]), T6b, ovs, &(xo[0]));
		    T6j = VADD(T6f, T6i);
		    ST(&(xo[WS(os, 26)]), T6j, ovs, &(xo[0]));
		    T6k = VSUB(T6i, T6f);
		    ST(&(xo[WS(os, 38)]), T6k, ovs, &(xo[0]));
		    T6c = VADD(T69, T6a);
		    ST(&(xo[WS(os, 22)]), T6c, ovs, &(xo[0]));
	       }
	  }
	  /* Store section: outputs 2, 14, 18, 30, 34, 46, 50, 62. */
	  {
	       V T6z, T6V, T6J, T6P, T6G, T6W, T6K, T6S;
	       {
		    V T6r, T6O, T6y, T6N, T6u, T6x;
		    T6r = VSUB(T6p, T6q);
		    T6O = VADD(T6B, T6A);
		    T6u = VFMA(LDK(KP980785280), T6s, VMUL(LDK(KP195090322), T6t));
		    T6x = VFNMS(LDK(KP195090322), T6w, VMUL(LDK(KP980785280), T6v));
		    T6y = VSUB(T6u, T6x);
		    T6N = VADD(T6u, T6x);
		    T6z = VADD(T6r, T6y);
		    T6V = VBYI(VADD(T6O, T6N));
		    T6J = VSUB(T6r, T6y);
		    T6P = VBYI(VSUB(T6N, T6O));
	       }
	       {
		    V T6C, T6Q, T6F, T6R, T6D, T6E;
		    T6C = VSUB(T6A, T6B);
		    T6Q = VADD(T6p, T6q);
		    T6D = VFNMS(LDK(KP195090322), T6s, VMUL(LDK(KP980785280), T6t));
		    T6E = VFMA(LDK(KP195090322), T6v, VMUL(LDK(KP980785280), T6w));
		    T6F = VSUB(T6D, T6E);
		    T6R = VADD(T6D, T6E);
		    T6G = VBYI(VADD(T6C, T6F));
		    T6W = VADD(T6Q, T6R);
		    T6K = VBYI(VSUB(T6F, T6C));
		    T6S = VSUB(T6Q, T6R);
	       }
	       {
		    V T6H, T6X, T6Y, T6I;
		    T6H = VSUB(T6z, T6G);
		    ST(&(xo[WS(os, 50)]), T6H, ovs, &(xo[0]));
		    T6X = VADD(T6V, T6W);
		    ST(&(xo[WS(os, 2)]), T6X, ovs, &(xo[0]));
		    T6Y = VSUB(T6W, T6V);
		    ST(&(xo[WS(os, 62)]), T6Y, ovs, &(xo[0]));
		    T6I = VADD(T6z, T6G);
		    ST(&(xo[WS(os, 14)]), T6I, ovs, &(xo[0]));
	       }
	       {
		    V T6L, T6T, T6U, T6M;
		    T6L = VSUB(T6J, T6K);
		    ST(&(xo[WS(os, 46)]), T6L, ovs, &(xo[0]));
		    T6T = VADD(T6P, T6S);
		    ST(&(xo[WS(os, 30)]), T6T, ovs, &(xo[0]));
		    T6U = VSUB(T6S, T6P);
		    ST(&(xo[WS(os, 34)]), T6U, ovs, &(xo[0]));
		    T6M = VADD(T6J, T6K);
		    ST(&(xo[WS(os, 18)]), T6M, ovs, &(xo[0]));
	       }
	  }
	  /*
	   * Store section: outputs 1, 15, 17, 31, 33, 47, 49, 63.  For odd
	   * output indices the last ST argument is &(xo[WS(os, 1)]).
	   */
	  {
	       V T3P, T4f, T4g, T40, T43, T49, T4c, T44;
	       {
		    V T3O, T4a, T3W, T47, T3D, T48, T3Z, T4b, T3K, T3V;
		    T3K = VADD(T3G, T3J);
		    T3O = VSUB(T3K, T3N);
		    T4a = VADD(T3N, T3K);
		    T3V = VADD(T3T, T3U);
		    T3W = VSUB(T3S, T3V);
		    T47 = VADD(T3S, T3V);
		    {
			 V T3v, T3C, T3X, T3Y;
			 T3v = VFNMS(LDK(KP098017140), T3u, VMUL(LDK(KP995184726), T3r));
			 T3C = VFMA(LDK(KP995184726), T3y, VMUL(LDK(KP098017140), T3B));
			 T3D = VSUB(T3v, T3C);
			 T48 = VADD(T3v, T3C);
			 T3X = VFMA(LDK(KP098017140), T3r, VMUL(LDK(KP995184726), T3u));
			 T3Y = VFNMS(LDK(KP098017140), T3y, VMUL(LDK(KP995184726), T3B));
			 T3Z = VSUB(T3X, T3Y);
			 T4b = VADD(T3X, T3Y);
		    }
		    T3P = VBYI(VSUB(T3D, T3O));
		    T4f = VSUB(T47, T48);
		    T4g = VBYI(VSUB(T4b, T4a));
		    T40 = VSUB(T3W, T3Z);
		    T43 = VBYI(VADD(T3O, T3D));
		    T49 = VADD(T47, T48);
		    T4c = VBYI(VADD(T4a, T4b));
		    T44 = VADD(T3W, T3Z);
	       }
	       {
		    V T41, T4h, T4i, T42;
		    T41 = VADD(T3P, T40);
		    ST(&(xo[WS(os, 17)]), T41, ovs, &(xo[WS(os, 1)]));
		    T4h = VSUB(T4f, T4g);
		    ST(&(xo[WS(os, 33)]), T4h, ovs, &(xo[WS(os, 1)]));
		    T4i = VADD(T4f, T4g);
		    ST(&(xo[WS(os, 31)]), T4i, ovs, &(xo[WS(os, 1)]));
		    T42 = VSUB(T40, T3P);
		    ST(&(xo[WS(os, 47)]), T42, ovs, &(xo[WS(os, 1)]));
	       }
	       {
		    V T45, T4d, T4e, T46;
		    T45 = VADD(T43, T44);
		    ST(&(xo[WS(os, 15)]), T45, ovs, &(xo[WS(os, 1)]));
		    T4d = VSUB(T49, T4c);
		    ST(&(xo[WS(os, 63)]), T4d, ovs, &(xo[WS(os, 1)]));
		    T4e = VADD(T49, T4c);
		    ST(&(xo[WS(os, 1)]), T4e, ovs, &(xo[WS(os, 1)]));
		    T46 = VSUB(T44, T43);
		    ST(&(xo[WS(os, 49)]), T46, ovs, &(xo[WS(os, 1)]));
	       }
	  }
	  /* Store section: outputs 5, 11, 21, 27, 37, 43, 53, 59. */
	  {
	       V T2f, T2L, T2M, T2w, T2z, T2F, T2I, T2A;
	       {
		    V T11, T2G, T2s, T2E, T2e, T2D, T2v, T2H, T10, T2i;
		    T10 = VSUB(TI, TZ);
		    T11 = VSUB(Tr, T10);
		    T2G = VADD(Tr, T10);
		    T2i = VSUB(T2g, T2h);
		    T2s = VSUB(T2i, T2r);
		    T2E = VADD(T2r, T2i);
		    {
			 V T1C, T2d, T2t, T2u;
			 T1C = VFMA(LDK(KP881921264), T1s, VMUL(LDK(KP471396736), T1B));
			 T2d = VFNMS(LDK(KP471396736), T2c, VMUL(LDK(KP881921264), T23));
			 T2e = VSUB(T1C, T2d);
			 T2D = VADD(T1C, T2d);
			 T2t = VFNMS(LDK(KP471396736), T1s, VMUL(LDK(KP881921264), T1B));
			 T2u = VFMA(LDK(KP471396736), T23, VMUL(LDK(KP881921264), T2c));
			 T2v = VSUB(T2t, T2u);
			 T2H = VADD(T2t, T2u);
		    }
		    T2f = VADD(T11, T2e);
		    T2L = VBYI(VADD(T2E, T2D));
		    T2M = VADD(T2G, T2H);
		    T2w = VBYI(VADD(T2s, T2v));
		    T2z = VSUB(T11, T2e);
		    T2F = VBYI(VSUB(T2D, T2E));
		    T2I = VSUB(T2G, T2H);
		    T2A = VBYI(VSUB(T2v, T2s));
	       }
	       {
		    V T2x, T2N, T2O, T2y;
		    T2x = VSUB(T2f, T2w);
		    ST(&(xo[WS(os, 53)]), T2x, ovs, &(xo[WS(os, 1)]));
		    T2N = VADD(T2L, T2M);
		    ST(&(xo[WS(os, 5)]), T2N, ovs, &(xo[WS(os, 1)]));
		    T2O = VSUB(T2M, T2L);
		    ST(&(xo[WS(os, 59)]), T2O, ovs, &(xo[WS(os, 1)]));
		    T2y = VADD(T2f, T2w);
		    ST(&(xo[WS(os, 11)]), T2y, ovs, &(xo[WS(os, 1)]));
	       }
	       {
		    V T2B, T2J, T2K, T2C;
		    T2B = VSUB(T2z, T2A);
		    ST(&(xo[WS(os, 43)]), T2B, ovs, &(xo[WS(os, 1)]));
		    T2J = VADD(T2F, T2I);
		    ST(&(xo[WS(os, 27)]), T2J, ovs, &(xo[WS(os, 1)]));
		    T2K = VSUB(T2I, T2F);
		    ST(&(xo[WS(os, 37)]), T2K, ovs, &(xo[WS(os, 1)]));
		    T2C = VADD(T2z, T2A);
		    ST(&(xo[WS(os, 21)]), T2C, ovs, &(xo[WS(os, 1)]));
	       }
	  }
	  /* Store section: outputs 7, 9, 23, 25, 39, 41, 55, 57. */
	  {
	       V T4t, T4P, T4Q, T4A, T4D, T4J, T4M, T4E;
	       {
		    V T4s, T4K, T4w, T4H, T4p, T4I, T4z, T4L, T4q, T4v;
		    T4q = VSUB(T3T, T3U);
		    T4s = VSUB(T4q, T4r);
		    T4K = VADD(T4r, T4q);
		    T4v = VSUB(T3G, T3J);
		    T4w = VSUB(T4u, T4v);
		    T4H = VADD(T4u, T4v);
		    {
			 V T4l, T4o, T4x, T4y;
			 T4l = VFNMS(LDK(KP634393284), T4k, VMUL(LDK(KP773010453), T4j));
			 T4o = VFMA(LDK(KP773010453), T4m, VMUL(LDK(KP634393284), T4n));
			 T4p = VSUB(T4l, T4o);
			 T4I = VADD(T4l, T4o);
			 T4x = VFMA(LDK(KP634393284), T4j, VMUL(LDK(KP773010453), T4k));
			 T4y = VFNMS(LDK(KP634393284), T4m, VMUL(LDK(KP773010453), T4n));
			 T4z = VSUB(T4x, T4y);
			 T4L = VADD(T4x, T4y);
		    }
		    T4t = VBYI(VSUB(T4p, T4s));
		    T4P = VSUB(T4H, T4I);
		    T4Q = VBYI(VSUB(T4L, T4K));
		    T4A = VSUB(T4w, T4z);
		    T4D = VBYI(VADD(T4s, T4p));
		    T4J = VADD(T4H, T4I);
		    T4M = VBYI(VADD(T4K, T4L));
		    T4E = VADD(T4w, T4z);
	       }
	       {
		    V T4B, T4R, T4S, T4C;
		    T4B = VADD(T4t, T4A);
		    ST(&(xo[WS(os, 23)]), T4B, ovs, &(xo[WS(os, 1)]));
		    T4R = VSUB(T4P, T4Q);
		    ST(&(xo[WS(os, 39)]), T4R, ovs, &(xo[WS(os, 1)]));
		    T4S = VADD(T4P, T4Q);
		    ST(&(xo[WS(os, 25)]), T4S, ovs, &(xo[WS(os, 1)]));
		    T4C = VSUB(T4A, T4t);
		    ST(&(xo[WS(os, 41)]), T4C, ovs, &(xo[WS(os, 1)]));
	       }
	       {
		    V T4F, T4N, T4O, T4G;
		    T4F = VADD(T4D, T4E);
		    ST(&(xo[WS(os, 9)]), T4F, ovs, &(xo[WS(os, 1)]));
		    T4N = VSUB(T4J, T4M);
		    ST(&(xo[WS(os, 57)]), T4N, ovs, &(xo[WS(os, 1)]));
		    T4O = VADD(T4J, T4M);
		    ST(&(xo[WS(os, 7)]), T4O, ovs, &(xo[WS(os, 1)]));
		    T4G = VSUB(T4E, T4D);
		    ST(&(xo[WS(os, 55)]), T4G, ovs, &(xo[WS(os, 1)]));
	       }
	  }
	  /* Store section: outputs 3, 13, 19, 29, 35, 45, 51, 61. */
	  {
	       V T2Z, T3l, T3m, T36, T39, T3f, T3i, T3a;
	       {
		    V T2R, T3g, T32, T3e, T2Y, T3d, T35, T3h, T2Q, T30;
		    T2Q = VADD(T2g, T2h);
		    T2R = VSUB(T2P, T2Q);
		    T3g = VADD(T2P, T2Q);
		    T30 = VADD(TI, TZ);
		    T32 = VSUB(T30, T31);
		    T3e = VADD(T31, T30);
		    {
			 V T2U, T2X, T33, T34;
			 T2U = VFMA(LDK(KP956940335), T2S, VMUL(LDK(KP290284677), T2T));
			 T2X = VFNMS(LDK(KP290284677), T2W, VMUL(LDK(KP956940335), T2V));
			 T2Y = VSUB(T2U, T2X);
			 T3d = VADD(T2U, T2X);
			 T33 = VFNMS(LDK(KP290284677), T2S, VMUL(LDK(KP956940335), T2T));
			 T34 = VFMA(LDK(KP290284677), T2V, VMUL(LDK(KP956940335), T2W));
			 T35 = VSUB(T33, T34);
			 T3h = VADD(T33, T34);
		    }
		    T2Z = VADD(T2R, T2Y);
		    T3l = VBYI(VADD(T3e, T3d));
		    T3m = VADD(T3g, T3h);
		    T36 = VBYI(VADD(T32, T35));
		    T39 = VSUB(T2R, T2Y);
		    T3f = VBYI(VSUB(T3d, T3e));
		    T3i = VSUB(T3g, T3h);
		    T3a = VBYI(VSUB(T35, T32));
	       }
	       {
		    V T37, T3n, T3o, T38;
		    T37 = VSUB(T2Z, T36);
		    ST(&(xo[WS(os, 51)]), T37, ovs, &(xo[WS(os, 1)]));
		    T3n = VADD(T3l, T3m);
		    ST(&(xo[WS(os, 3)]), T3n, ovs, &(xo[WS(os, 1)]));
		    T3o = VSUB(T3m, T3l);
		    ST(&(xo[WS(os, 61)]), T3o, ovs, &(xo[WS(os, 1)]));
		    T38 = VADD(T2Z, T36);
		    ST(&(xo[WS(os, 13)]), T38, ovs, &(xo[WS(os, 1)]));
	       }
	       {
		    V T3b, T3j, T3k, T3c;
		    T3b = VSUB(T39, T3a);
		    ST(&(xo[WS(os, 45)]), T3b, ovs, &(xo[WS(os, 1)]));
		    T3j = VADD(T3f, T3i);
		    ST(&(xo[WS(os, 29)]), T3j, ovs, &(xo[WS(os, 1)]));
		    T3k = VSUB(T3i, T3f);
		    ST(&(xo[WS(os, 35)]), T3k, ovs, &(xo[WS(os, 1)]));
		    T3c = VADD(T39, T3a);
		    ST(&(xo[WS(os, 19)]), T3c, ovs, &(xo[WS(os, 1)]));
	       }
	  }
     }
}

/*
 * Batch driver around m1bv_64_0.
 *
 * Runs the 64-point kernel once per group of VL transforms until `v`
 * transforms have been covered, stepping the input pointer by VL * ivs and
 * the output pointer by VL * ovs between calls.  The whole batch is
 * bracketed by BEGIN_SIMD() / END_SIMD().
 *
 * Only ii and io are handed to the kernel; ri and ro are accepted but never
 * referenced here.
 * NOTE(review): presumably the kernel reaches the interleaved real parts
 * through the ii/io pointers -- confirm against the LD/ST definitions.
 */
static void m1bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     int done = 0;	/* number of transforms already processed */
     BEGIN_SIMD();
     while (done < v) {
	  m1bv_64_0(ii, io, is, os, ivs, ovs);
	  /* advance to the next group of VL transforms */
	  ii += VL * ivs;
	  io += VL * ovs;
	  done += VL;
     }
     END_SIMD();
}

/*
 * Descriptor handed to the planner for this codelet.  The positional
 * fields are: transform size (64), codelet name, an operation-count
 * quadruple whose first three entries equal the 404 additions,
 * 72 multiplications and 52 fused multiply/adds reported in the generator
 * summary at the top of this file, and the address of GENUS.
 * NOTE(review): the meaning of the four trailing zeros is defined by
 * kdft_desc in the project headers -- confirm there.
 */
static const kdft_desc desc = {
     64,
     "m1bv_64",
     {404, 72, 52, 0},
     &GENUS,
     0,
     0,
     0,
     0
};

/* Registers the m1bv_64 codelet and its descriptor with planner `p`. */
void X(codelet_m1bv_64) (planner *p)
{
     X(kdft_register) (p, m1bv_64, &desc);
}
