/*
 * Copyright (c) 2003 Matteo Frigo
 * Copyright (c) 2003 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sat Apr 19 18:05:08 EDT 2003 */

#include "codelet-dft.h"

/* Generated by: /homea/athena/fftw3/genfft/gen_twiddle_c -simd -trivial-stores -compact -variables 4 -n 64 -name t1fv_64 -include t1f.h */

/*
 * This function contains 519 FP additions, 250 FP multiplications,
 * (or, 467 additions, 198 multiplications, 52 fused multiply/add),
 * 111 stack variables, and 128 memory accesses
 */
/*
 * Generator Id's : 
 * $Id: algsimp.ml,v 1.7 2003/03/15 20:29:42 stevenj Exp $
 * $Id: fft.ml,v 1.2 2003/03/15 20:29:42 stevenj Exp $
 * $Id: gen_twiddle_c.ml,v 1.7 2003/04/16 19:51:27 athena Exp $
 */

#include "t1f.h"

/*
 * t1fv_64 -- size-64 SIMD twiddle codelet (generated with "-n 64").
 *
 * Each loop iteration loads the 64 values x[WS(ios, k)], k = 0..63, passes
 * every value except k = 0 through BYTWJ() together with the twiddle entry
 * at W[TWVL * 2 * (k - 1)], combines the results with VADD / VSUB / VMUL /
 * VFMA / VFNMS / VBYI, and stores 64 results back to the same
 * x[WS(ios, k)] locations (the routine works in place).
 *
 *   ri   - base pointer of the data; copied to x and used for every load
 *          and store.
 *   ii   - not referenced anywhere in this routine.
 *   W    - twiddle table; advanced by TWVL * 126 after every iteration.
 *   ios  - stride between the 64 elements, always applied through WS().
 *   m    - loop count; the loop runs while i > 0 and subtracts VL each time.
 *   dist - step between iterations; x advances by VL * dist.
 *
 * Returns W as left by the loop, i.e. pointing just past the twiddles that
 * were consumed.
 *
 * NOTE(review): LD, ST, BYTWJ, VBYI, DVK, LDK, VL and TWVL come from the
 * included headers and are not visible here; presumably BYTWJ multiplies by
 * the conjugate twiddle and VBYI multiplies by i -- confirm in t1f.h.
 */
static const R *t1fv_64(R *ri, R *ii, const R *W, stride ios, int m, int dist)
{
     /*
      * Numeric constants: cosines and sines of multiples of pi/32, e.g.
      * KP707106781 = cos(pi/4), KP923879532 = cos(pi/8),
      * KP382683432 = sin(pi/8), KP980785280 = cos(pi/16),
      * KP195090322 = sin(pi/16), KP995184726 = cos(pi/32),
      * KP098017140 = sin(pi/32).  The digits after "KP" repeat the value.
      */
     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     int i;
     R *x;
     /* Only ri is used; every load and store below goes through x. */
     x = ri;
     BEGIN_SIMD();
     /*
      * One pass per VL units of m.  126 = 2 * 63: the body reads the 63
      * twiddle entries at even offsets 0, 2, ..., 124 (times TWVL).
      */
     for (i = m; i > 0; i = i - VL, x = x + (VL * dist), W = W + (TWVL * 126)) {
	  /* Intermediates produced by the load blocks and consumed by the store blocks. */
	  V T5W, T6X, Tg, T4q, T3o, T4R, T82, T8O, TD, T4S, T85, T91, T3h, T4r, T63;
	  V T6Y, T6i, T71, T8p, T8Q, T13, T4V, T3q, T4y, T6b, T70, T8s, T8P, T1s, T4U;
	  V T3r, T4v, T8g, T8W, T8j, T8X, T6M, T7x, T6R, T7w, T30, T5q, T5r, T3b, T3W;
	  V T4K, T4N, T3X, T89, T8T, T8c, T8U, T6v, T7u, T6A, T7t, T29, T5n, T5o, T2k;
	  V T3T, T4D, T4G, T3U;
	  /*
	   * Inputs 0, 32, 16, 48, 8, 40, 56, 24.  Input 0 is the only one
	   * used without a BYTWJ twiddle step.
	   */
	  {
	       V T1, T3, T5U, T3k, T3m, T5V, T9, T6W, Te, T6V, T2, T3j, T3l, T4, Tf;
	       T1 = LD(&(x[0]), dist, &(x[0]));
	       T2 = LD(&(x[WS(ios, 32)]), dist, &(x[0]));
	       T3 = BYTWJ(&(W[TWVL * 62]), T2);
	       T5U = VADD(T1, T3);
	       T3j = LD(&(x[WS(ios, 16)]), dist, &(x[0]));
	       T3k = BYTWJ(&(W[TWVL * 30]), T3j);
	       T3l = LD(&(x[WS(ios, 48)]), dist, &(x[0]));
	       T3m = BYTWJ(&(W[TWVL * 94]), T3l);
	       T5V = VADD(T3k, T3m);
	       {
		    V T6, T8, T5, T7;
		    T5 = LD(&(x[WS(ios, 8)]), dist, &(x[0]));
		    T6 = BYTWJ(&(W[TWVL * 14]), T5);
		    T7 = LD(&(x[WS(ios, 40)]), dist, &(x[0]));
		    T8 = BYTWJ(&(W[TWVL * 78]), T7);
		    T9 = VSUB(T6, T8);
		    T6W = VADD(T6, T8);
	       }
	       {
		    V Tb, Td, Ta, Tc;
		    Ta = LD(&(x[WS(ios, 56)]), dist, &(x[0]));
		    Tb = BYTWJ(&(W[TWVL * 110]), Ta);
		    Tc = LD(&(x[WS(ios, 24)]), dist, &(x[0]));
		    Td = BYTWJ(&(W[TWVL * 46]), Tc);
		    Te = VSUB(Tb, Td);
		    T6V = VADD(Tb, Td);
	       }
	       T5W = VSUB(T5U, T5V);
	       T6X = VSUB(T6V, T6W);
	       T4 = VSUB(T1, T3);
	       Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
	       Tg = VADD(T4, Tf);
	       T4q = VSUB(T4, Tf);
	       {
		    V T3i, T3n, T80, T81;
		    T3i = VMUL(LDK(KP707106781), VSUB(Te, T9));
		    T3n = VSUB(T3k, T3m);
		    T3o = VSUB(T3i, T3n);
		    T4R = VADD(T3n, T3i);
		    T80 = VADD(T5U, T5V);
		    T81 = VADD(T6W, T6V);
		    /* T82: plain sum of the eight values handled in this block. */
		    T82 = VADD(T80, T81);
		    T8O = VSUB(T80, T81);
	       }
	  }
	  /* Inputs 4, 36, 12, 44, 20, 52, 60, 28. */
	  {
	       V Tl, T5X, TB, T61, Tq, T5Y, Tw, T60;
	       {
		    V Ti, Tk, Th, Tj;
		    Th = LD(&(x[WS(ios, 4)]), dist, &(x[0]));
		    Ti = BYTWJ(&(W[TWVL * 6]), Th);
		    Tj = LD(&(x[WS(ios, 36)]), dist, &(x[0]));
		    Tk = BYTWJ(&(W[TWVL * 70]), Tj);
		    Tl = VSUB(Ti, Tk);
		    T5X = VADD(Ti, Tk);
	       }
	       {
		    V Ty, TA, Tx, Tz;
		    Tx = LD(&(x[WS(ios, 12)]), dist, &(x[0]));
		    Ty = BYTWJ(&(W[TWVL * 22]), Tx);
		    Tz = LD(&(x[WS(ios, 44)]), dist, &(x[0]));
		    TA = BYTWJ(&(W[TWVL * 86]), Tz);
		    TB = VSUB(Ty, TA);
		    T61 = VADD(Ty, TA);
	       }
	       {
		    V Tn, Tp, Tm, To;
		    Tm = LD(&(x[WS(ios, 20)]), dist, &(x[0]));
		    Tn = BYTWJ(&(W[TWVL * 38]), Tm);
		    To = LD(&(x[WS(ios, 52)]), dist, &(x[0]));
		    Tp = BYTWJ(&(W[TWVL * 102]), To);
		    Tq = VSUB(Tn, Tp);
		    T5Y = VADD(Tn, Tp);
	       }
	       {
		    V Tt, Tv, Ts, Tu;
		    Ts = LD(&(x[WS(ios, 60)]), dist, &(x[0]));
		    Tt = BYTWJ(&(W[TWVL * 118]), Ts);
		    Tu = LD(&(x[WS(ios, 28)]), dist, &(x[0]));
		    Tv = BYTWJ(&(W[TWVL * 54]), Tu);
		    Tw = VSUB(Tt, Tv);
		    T60 = VADD(Tt, Tv);
	       }
	       {
		    V Tr, TC, T83, T84;
		    Tr = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
		    TC = VFMA(LDK(KP923879532), Tw, VMUL(LDK(KP382683432), TB));
		    TD = VADD(Tr, TC);
		    T4S = VSUB(TC, Tr);
		    T83 = VADD(T5X, T5Y);
		    T84 = VADD(T60, T61);
		    /* T85: plain sum of the eight values handled in this block. */
		    T85 = VADD(T83, T84);
		    T91 = VSUB(T84, T83);
	       }
	       {
		    V T3f, T3g, T5Z, T62;
		    T3f = VFNMS(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
		    T3g = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
		    T3h = VSUB(T3f, T3g);
		    T4r = VADD(T3g, T3f);
		    T5Z = VSUB(T5X, T5Y);
		    T62 = VSUB(T60, T61);
		    T63 = VMUL(LDK(KP707106781), VADD(T5Z, T62));
		    T6Y = VMUL(LDK(KP707106781), VSUB(T62, T5Z));
	       }
	  }
	  /* Inputs 62, 30, 14, 46, 6, 38, 54, 22. */
	  {
	       V TJ, T6c, T11, T6d, TO, T6g, TT, T6f;
	       {
		    V TG, TI, TF, TH;
		    TF = LD(&(x[WS(ios, 62)]), dist, &(x[0]));
		    TG = BYTWJ(&(W[TWVL * 122]), TF);
		    TH = LD(&(x[WS(ios, 30)]), dist, &(x[0]));
		    TI = BYTWJ(&(W[TWVL * 58]), TH);
		    TJ = VSUB(TG, TI);
		    T6c = VADD(TG, TI);
	       }
	       {
		    V TY, T10, TX, TZ;
		    TX = LD(&(x[WS(ios, 14)]), dist, &(x[0]));
		    TY = BYTWJ(&(W[TWVL * 26]), TX);
		    TZ = LD(&(x[WS(ios, 46)]), dist, &(x[0]));
		    T10 = BYTWJ(&(W[TWVL * 90]), TZ);
		    T11 = VSUB(TY, T10);
		    T6d = VADD(TY, T10);
	       }
	       {
		    V TL, TN, TK, TM;
		    TK = LD(&(x[WS(ios, 6)]), dist, &(x[0]));
		    TL = BYTWJ(&(W[TWVL * 10]), TK);
		    TM = LD(&(x[WS(ios, 38)]), dist, &(x[0]));
		    TN = BYTWJ(&(W[TWVL * 74]), TM);
		    TO = VSUB(TL, TN);
		    T6g = VADD(TL, TN);
	       }
	       {
		    V TQ, TS, TP, TR;
		    TP = LD(&(x[WS(ios, 54)]), dist, &(x[0]));
		    TQ = BYTWJ(&(W[TWVL * 106]), TP);
		    TR = LD(&(x[WS(ios, 22)]), dist, &(x[0]));
		    TS = BYTWJ(&(W[TWVL * 42]), TR);
		    TT = VSUB(TQ, TS);
		    T6f = VADD(TQ, TS);
	       }
	       {
		    V T6e, T6h, T8n, T8o;
		    T6e = VSUB(T6c, T6d);
		    T6h = VSUB(T6f, T6g);
		    T6i = VFNMS(LDK(KP382683432), T6h, VMUL(LDK(KP923879532), T6e));
		    T71 = VFMA(LDK(KP382683432), T6e, VMUL(LDK(KP923879532), T6h));
		    T8n = VADD(T6c, T6d);
		    T8o = VADD(T6g, T6f);
		    /* T8p: plain sum of the eight values handled in this block. */
		    T8p = VADD(T8n, T8o);
		    T8Q = VSUB(T8n, T8o);
		    {
			 V TV, T4w, T12, T4x, TU, TW;
			 TU = VMUL(LDK(KP707106781), VADD(TO, TT));
			 TV = VADD(TJ, TU);
			 T4w = VSUB(TJ, TU);
			 TW = VMUL(LDK(KP707106781), VSUB(TT, TO));
			 T12 = VSUB(TW, T11);
			 T4x = VADD(T11, TW);
			 T13 = VFMA(LDK(KP195090322), TV, VMUL(LDK(KP980785280), T12));
			 T4V = VFMA(LDK(KP555570233), T4w, VMUL(LDK(KP831469612), T4x));
			 T3q = VFNMS(LDK(KP195090322), T12, VMUL(LDK(KP980785280), TV));
			 T4y = VFNMS(LDK(KP555570233), T4x, VMUL(LDK(KP831469612), T4w));
		    }
	       }
	  }
	  /* Inputs 58, 26, 2, 34, 10, 42, 18, 50. */
	  {
	       V T18, T68, T1p, T65, T1d, T69, T1j, T66;
	       {
		    V T15, T17, T14, T16;
		    T14 = LD(&(x[WS(ios, 58)]), dist, &(x[0]));
		    T15 = BYTWJ(&(W[TWVL * 114]), T14);
		    T16 = LD(&(x[WS(ios, 26)]), dist, &(x[0]));
		    T17 = BYTWJ(&(W[TWVL * 50]), T16);
		    T18 = VSUB(T15, T17);
		    T68 = VADD(T15, T17);
	       }
	       {
		    V T1m, T1o, T1l, T1n;
		    T1l = LD(&(x[WS(ios, 2)]), dist, &(x[0]));
		    T1m = BYTWJ(&(W[TWVL * 2]), T1l);
		    T1n = LD(&(x[WS(ios, 34)]), dist, &(x[0]));
		    T1o = BYTWJ(&(W[TWVL * 66]), T1n);
		    T1p = VSUB(T1m, T1o);
		    T65 = VADD(T1m, T1o);
	       }
	       {
		    V T1a, T1c, T19, T1b;
		    T19 = LD(&(x[WS(ios, 10)]), dist, &(x[0]));
		    T1a = BYTWJ(&(W[TWVL * 18]), T19);
		    T1b = LD(&(x[WS(ios, 42)]), dist, &(x[0]));
		    T1c = BYTWJ(&(W[TWVL * 82]), T1b);
		    T1d = VSUB(T1a, T1c);
		    T69 = VADD(T1a, T1c);
	       }
	       {
		    V T1g, T1i, T1f, T1h;
		    T1f = LD(&(x[WS(ios, 18)]), dist, &(x[0]));
		    T1g = BYTWJ(&(W[TWVL * 34]), T1f);
		    T1h = LD(&(x[WS(ios, 50)]), dist, &(x[0]));
		    T1i = BYTWJ(&(W[TWVL * 98]), T1h);
		    T1j = VSUB(T1g, T1i);
		    T66 = VADD(T1g, T1i);
	       }
	       {
		    V T67, T6a, T8q, T8r;
		    T67 = VSUB(T65, T66);
		    T6a = VSUB(T68, T69);
		    T6b = VFMA(LDK(KP923879532), T67, VMUL(LDK(KP382683432), T6a));
		    T70 = VFNMS(LDK(KP382683432), T67, VMUL(LDK(KP923879532), T6a));
		    T8q = VADD(T65, T66);
		    T8r = VADD(T69, T68);
		    /* T8s: plain sum of the eight values handled in this block. */
		    T8s = VADD(T8q, T8r);
		    T8P = VSUB(T8q, T8r);
		    {
			 V T1k, T4u, T1r, T4t, T1e, T1q;
			 T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
			 T1k = VSUB(T1e, T1j);
			 T4u = VADD(T1j, T1e);
			 T1q = VMUL(LDK(KP707106781), VADD(T1d, T18));
			 T1r = VADD(T1p, T1q);
			 T4t = VSUB(T1p, T1q);
			 T1s = VFNMS(LDK(KP195090322), T1r, VMUL(LDK(KP980785280), T1k));
			 T4U = VFNMS(LDK(KP555570233), T4t, VMUL(LDK(KP831469612), T4u));
			 T3r = VFMA(LDK(KP980785280), T1r, VMUL(LDK(KP195090322), T1k));
			 T4v = VFMA(LDK(KP831469612), T4t, VMUL(LDK(KP555570233), T4u));
		    }
	       }
	  }
	  /*
	   * Inputs 63, 31, 11, 43, 15, 47, 7, 39, 55, 23, 19, 51, 59, 27, 3, 35.
	   * Odd-indexed loads pass &(x[WS(ios, 1)]) as the last LD argument
	   * where even-indexed loads pass &(x[0]).
	   * NOTE(review): presumably an alignment hint -- confirm in the SIMD header.
	   */
	  {
	       V T2q, T6C, T2X, T6J, T39, T6D, T2v, T6O, T2A, T6N, T2M, T6G, T2S, T6I, T2H;
	       V T6F;
	       {
		    V T2n, T2p, T2m, T2o;
		    T2m = LD(&(x[WS(ios, 63)]), dist, &(x[WS(ios, 1)]));
		    T2n = BYTWJ(&(W[TWVL * 124]), T2m);
		    T2o = LD(&(x[WS(ios, 31)]), dist, &(x[WS(ios, 1)]));
		    T2p = BYTWJ(&(W[TWVL * 60]), T2o);
		    T2q = VSUB(T2n, T2p);
		    T6C = VADD(T2n, T2p);
	       }
	       {
		    V T2U, T2W, T2T, T2V;
		    T2T = LD(&(x[WS(ios, 11)]), dist, &(x[WS(ios, 1)]));
		    T2U = BYTWJ(&(W[TWVL * 20]), T2T);
		    T2V = LD(&(x[WS(ios, 43)]), dist, &(x[WS(ios, 1)]));
		    T2W = BYTWJ(&(W[TWVL * 84]), T2V);
		    T2X = VSUB(T2U, T2W);
		    T6J = VADD(T2U, T2W);
	       }
	       {
		    V T36, T38, T35, T37;
		    T35 = LD(&(x[WS(ios, 15)]), dist, &(x[WS(ios, 1)]));
		    T36 = BYTWJ(&(W[TWVL * 28]), T35);
		    T37 = LD(&(x[WS(ios, 47)]), dist, &(x[WS(ios, 1)]));
		    T38 = BYTWJ(&(W[TWVL * 92]), T37);
		    T39 = VSUB(T36, T38);
		    T6D = VADD(T36, T38);
	       }
	       {
		    V T2s, T2u, T2r, T2t;
		    T2r = LD(&(x[WS(ios, 7)]), dist, &(x[WS(ios, 1)]));
		    T2s = BYTWJ(&(W[TWVL * 12]), T2r);
		    T2t = LD(&(x[WS(ios, 39)]), dist, &(x[WS(ios, 1)]));
		    T2u = BYTWJ(&(W[TWVL * 76]), T2t);
		    T2v = VSUB(T2s, T2u);
		    T6O = VADD(T2s, T2u);
	       }
	       {
		    V T2x, T2z, T2w, T2y;
		    T2w = LD(&(x[WS(ios, 55)]), dist, &(x[WS(ios, 1)]));
		    T2x = BYTWJ(&(W[TWVL * 108]), T2w);
		    T2y = LD(&(x[WS(ios, 23)]), dist, &(x[WS(ios, 1)]));
		    T2z = BYTWJ(&(W[TWVL * 44]), T2y);
		    T2A = VSUB(T2x, T2z);
		    T6N = VADD(T2x, T2z);
	       }
	       {
		    V T2J, T2L, T2I, T2K;
		    T2I = LD(&(x[WS(ios, 19)]), dist, &(x[WS(ios, 1)]));
		    T2J = BYTWJ(&(W[TWVL * 36]), T2I);
		    T2K = LD(&(x[WS(ios, 51)]), dist, &(x[WS(ios, 1)]));
		    T2L = BYTWJ(&(W[TWVL * 100]), T2K);
		    T2M = VSUB(T2J, T2L);
		    T6G = VADD(T2J, T2L);
	       }
	       {
		    V T2P, T2R, T2O, T2Q;
		    T2O = LD(&(x[WS(ios, 59)]), dist, &(x[WS(ios, 1)]));
		    T2P = BYTWJ(&(W[TWVL * 116]), T2O);
		    T2Q = LD(&(x[WS(ios, 27)]), dist, &(x[WS(ios, 1)]));
		    T2R = BYTWJ(&(W[TWVL * 52]), T2Q);
		    T2S = VSUB(T2P, T2R);
		    T6I = VADD(T2P, T2R);
	       }
	       {
		    V T2E, T2G, T2D, T2F;
		    T2D = LD(&(x[WS(ios, 3)]), dist, &(x[WS(ios, 1)]));
		    T2E = BYTWJ(&(W[TWVL * 4]), T2D);
		    T2F = LD(&(x[WS(ios, 35)]), dist, &(x[WS(ios, 1)]));
		    T2G = BYTWJ(&(W[TWVL * 68]), T2F);
		    T2H = VSUB(T2E, T2G);
		    T6F = VADD(T2E, T2G);
	       }
	       {
		    V T8e, T8f, T8h, T8i;
		    T8e = VADD(T6C, T6D);
		    T8f = VADD(T6O, T6N);
		    /* T8g: sum over inputs 63, 31, 15, 47, 7, 39, 55, 23. */
		    T8g = VADD(T8e, T8f);
		    T8W = VSUB(T8e, T8f);
		    T8h = VADD(T6F, T6G);
		    T8i = VADD(T6I, T6J);
		    /* T8j: sum over inputs 3, 35, 19, 51, 59, 27, 11, 43. */
		    T8j = VADD(T8h, T8i);
		    T8X = VSUB(T8i, T8h);
	       }
	       {
		    V T6E, T6P, T6L, T6Q, T6H, T6K;
		    T6E = VSUB(T6C, T6D);
		    T6P = VSUB(T6N, T6O);
		    T6H = VSUB(T6F, T6G);
		    T6K = VSUB(T6I, T6J);
		    T6L = VMUL(LDK(KP707106781), VADD(T6H, T6K));
		    T6Q = VMUL(LDK(KP707106781), VSUB(T6K, T6H));
		    T6M = VADD(T6E, T6L);
		    T7x = VSUB(T6Q, T6P);
		    T6R = VADD(T6P, T6Q);
		    T7w = VSUB(T6E, T6L);
	       }
	       {
		    V T2C, T4I, T3a, T4L, T2Z, T4M, T33, T4J, T2B, T34;
		    T2B = VMUL(LDK(KP707106781), VADD(T2v, T2A));
		    T2C = VADD(T2q, T2B);
		    T4I = VSUB(T2q, T2B);
		    T34 = VMUL(LDK(KP707106781), VSUB(T2A, T2v));
		    T3a = VSUB(T34, T39);
		    T4L = VADD(T39, T34);
		    {
			 V T2N, T2Y, T31, T32;
			 T2N = VFNMS(LDK(KP382683432), T2M, VMUL(LDK(KP923879532), T2H));
			 T2Y = VFMA(LDK(KP923879532), T2S, VMUL(LDK(KP382683432), T2X));
			 T2Z = VADD(T2N, T2Y);
			 T4M = VSUB(T2Y, T2N);
			 T31 = VFNMS(LDK(KP923879532), T2X, VMUL(LDK(KP382683432), T2S));
			 T32 = VFMA(LDK(KP382683432), T2H, VMUL(LDK(KP923879532), T2M));
			 T33 = VSUB(T31, T32);
			 T4J = VADD(T32, T31);
		    }
		    T30 = VSUB(T2C, T2Z);
		    T5q = VSUB(T4I, T4J);
		    T5r = VSUB(T4M, T4L);
		    T3b = VSUB(T33, T3a);
		    T3W = VADD(T2C, T2Z);
		    T4K = VADD(T4I, T4J);
		    T4N = VADD(T4L, T4M);
		    T3X = VADD(T3a, T33);
	       }
	  }
	  /* Inputs 61, 29, 17, 49, 1, 33, 13, 45, 5, 37, 57, 25, 9, 41, 21, 53. */
	  {
	       V T1z, T6r, T27, T6m, T2e, T6l, T1E, T6s, T1K, T6o, T1W, T6w, T21, T6x, T1P;
	       V T6p;
	       {
		    V T1w, T1y, T1v, T1x;
		    T1v = LD(&(x[WS(ios, 61)]), dist, &(x[WS(ios, 1)]));
		    T1w = BYTWJ(&(W[TWVL * 120]), T1v);
		    T1x = LD(&(x[WS(ios, 29)]), dist, &(x[WS(ios, 1)]));
		    T1y = BYTWJ(&(W[TWVL * 56]), T1x);
		    T1z = VSUB(T1w, T1y);
		    T6r = VADD(T1w, T1y);
	       }
	       {
		    V T24, T26, T23, T25;
		    T23 = LD(&(x[WS(ios, 17)]), dist, &(x[WS(ios, 1)]));
		    T24 = BYTWJ(&(W[TWVL * 32]), T23);
		    T25 = LD(&(x[WS(ios, 49)]), dist, &(x[WS(ios, 1)]));
		    T26 = BYTWJ(&(W[TWVL * 96]), T25);
		    T27 = VSUB(T24, T26);
		    T6m = VADD(T24, T26);
	       }
	       {
		    V T2b, T2d, T2a, T2c;
		    T2a = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)]));
		    /* Input 1 uses the first twiddle entry of the current W window. */
		    T2b = BYTWJ(&(W[0]), T2a);
		    T2c = LD(&(x[WS(ios, 33)]), dist, &(x[WS(ios, 1)]));
		    T2d = BYTWJ(&(W[TWVL * 64]), T2c);
		    T2e = VSUB(T2b, T2d);
		    T6l = VADD(T2b, T2d);
	       }
	       {
		    V T1B, T1D, T1A, T1C;
		    T1A = LD(&(x[WS(ios, 13)]), dist, &(x[WS(ios, 1)]));
		    T1B = BYTWJ(&(W[TWVL * 24]), T1A);
		    T1C = LD(&(x[WS(ios, 45)]), dist, &(x[WS(ios, 1)]));
		    T1D = BYTWJ(&(W[TWVL * 88]), T1C);
		    T1E = VSUB(T1B, T1D);
		    T6s = VADD(T1B, T1D);
	       }
	       {
		    V T1H, T1J, T1G, T1I;
		    T1G = LD(&(x[WS(ios, 5)]), dist, &(x[WS(ios, 1)]));
		    T1H = BYTWJ(&(W[TWVL * 8]), T1G);
		    T1I = LD(&(x[WS(ios, 37)]), dist, &(x[WS(ios, 1)]));
		    T1J = BYTWJ(&(W[TWVL * 72]), T1I);
		    T1K = VSUB(T1H, T1J);
		    T6o = VADD(T1H, T1J);
	       }
	       {
		    V T1T, T1V, T1S, T1U;
		    T1S = LD(&(x[WS(ios, 57)]), dist, &(x[WS(ios, 1)]));
		    T1T = BYTWJ(&(W[TWVL * 112]), T1S);
		    T1U = LD(&(x[WS(ios, 25)]), dist, &(x[WS(ios, 1)]));
		    T1V = BYTWJ(&(W[TWVL * 48]), T1U);
		    T1W = VSUB(T1T, T1V);
		    T6w = VADD(T1T, T1V);
	       }
	       {
		    V T1Y, T20, T1X, T1Z;
		    T1X = LD(&(x[WS(ios, 9)]), dist, &(x[WS(ios, 1)]));
		    T1Y = BYTWJ(&(W[TWVL * 16]), T1X);
		    T1Z = LD(&(x[WS(ios, 41)]), dist, &(x[WS(ios, 1)]));
		    T20 = BYTWJ(&(W[TWVL * 80]), T1Z);
		    T21 = VSUB(T1Y, T20);
		    T6x = VADD(T1Y, T20);
	       }
	       {
		    V T1M, T1O, T1L, T1N;
		    T1L = LD(&(x[WS(ios, 21)]), dist, &(x[WS(ios, 1)]));
		    T1M = BYTWJ(&(W[TWVL * 40]), T1L);
		    T1N = LD(&(x[WS(ios, 53)]), dist, &(x[WS(ios, 1)]));
		    T1O = BYTWJ(&(W[TWVL * 104]), T1N);
		    T1P = VSUB(T1M, T1O);
		    T6p = VADD(T1M, T1O);
	       }
	       {
		    V T87, T88, T8a, T8b;
		    T87 = VADD(T6l, T6m);
		    T88 = VADD(T6x, T6w);
		    /* T89: sum over inputs 1, 33, 17, 49, 9, 41, 57, 25. */
		    T89 = VADD(T87, T88);
		    T8T = VSUB(T87, T88);
		    T8a = VADD(T6o, T6p);
		    T8b = VADD(T6r, T6s);
		    /* T8c: sum over inputs 5, 37, 21, 53, 61, 29, 13, 45. */
		    T8c = VADD(T8a, T8b);
		    T8U = VSUB(T8b, T8a);
	       }
	       {
		    V T6n, T6y, T6u, T6z, T6q, T6t;
		    T6n = VSUB(T6l, T6m);
		    T6y = VSUB(T6w, T6x);
		    T6q = VSUB(T6o, T6p);
		    T6t = VSUB(T6r, T6s);
		    T6u = VMUL(LDK(KP707106781), VADD(T6q, T6t));
		    T6z = VMUL(LDK(KP707106781), VSUB(T6t, T6q));
		    T6v = VADD(T6n, T6u);
		    T7u = VSUB(T6z, T6y);
		    T6A = VADD(T6y, T6z);
		    T7t = VSUB(T6n, T6u);
	       }
	       {
		    V T28, T4E, T2g, T4B, T1R, T4C, T2j, T4F, T22, T2f;
		    T22 = VMUL(LDK(KP707106781), VSUB(T1W, T21));
		    T28 = VSUB(T22, T27);
		    T4E = VADD(T27, T22);
		    T2f = VMUL(LDK(KP707106781), VADD(T21, T1W));
		    T2g = VADD(T2e, T2f);
		    T4B = VSUB(T2e, T2f);
		    {
			 V T1F, T1Q, T2h, T2i;
			 T1F = VFNMS(LDK(KP923879532), T1E, VMUL(LDK(KP382683432), T1z));
			 T1Q = VFMA(LDK(KP382683432), T1K, VMUL(LDK(KP923879532), T1P));
			 T1R = VSUB(T1F, T1Q);
			 T4C = VADD(T1Q, T1F);
			 T2h = VFNMS(LDK(KP382683432), T1P, VMUL(LDK(KP923879532), T1K));
			 T2i = VFMA(LDK(KP923879532), T1z, VMUL(LDK(KP382683432), T1E));
			 T2j = VADD(T2h, T2i);
			 T4F = VSUB(T2i, T2h);
		    }
		    T29 = VSUB(T1R, T28);
		    T5n = VSUB(T4B, T4C);
		    T5o = VSUB(T4F, T4E);
		    T2k = VSUB(T2g, T2j);
		    T3T = VADD(T28, T1R);
		    T4D = VADD(T4B, T4C);
		    T4G = VADD(T4E, T4F);
		    T3U = VADD(T2g, T2j);
	       }
	  }
	  /*
	   * Outputs 32, 16, 0, 48.  T8J (output 0) is the plain sum of all
	   * 64 twiddled inputs: T82 + T85 + T8s + T8p + T89 + T8c + T8g + T8j.
	   */
	  {
	       V T8E, T8K, T8H, T8L;
	       {
		    V T8C, T8D, T8F, T8G;
		    T8C = VADD(T82, T85);
		    T8D = VADD(T8s, T8p);
		    T8E = VADD(T8C, T8D);
		    T8K = VSUB(T8C, T8D);
		    T8F = VADD(T89, T8c);
		    T8G = VADD(T8g, T8j);
		    T8H = VADD(T8F, T8G);
		    T8L = VBYI(VSUB(T8G, T8F));
	       }
	       {
		    V T8I, T8N, T8J, T8M;
		    T8I = VSUB(T8E, T8H);
		    ST(&(x[WS(ios, 32)]), T8I, dist, &(x[0]));
		    T8N = VADD(T8K, T8L);
		    ST(&(x[WS(ios, 16)]), T8N, dist, &(x[0]));
		    T8J = VADD(T8E, T8H);
		    ST(&(x[0]), T8J, dist, &(x[0]));
		    T8M = VSUB(T8K, T8L);
		    ST(&(x[WS(ios, 48)]), T8M, dist, &(x[0]));
	       }
	  }
	  /* Outputs 56, 24, 8, 40. */
	  {
	       V T8m, T8z, T8v, T8y;
	       {
		    V T86, T8t, T8l, T8u, T8d, T8k;
		    T86 = VSUB(T82, T85);
		    T8t = VSUB(T8p, T8s);
		    T8d = VSUB(T89, T8c);
		    T8k = VSUB(T8g, T8j);
		    T8l = VMUL(LDK(KP707106781), VADD(T8d, T8k));
		    T8u = VMUL(LDK(KP707106781), VSUB(T8k, T8d));
		    T8m = VADD(T86, T8l);
		    T8z = VBYI(VSUB(T8u, T8t));
		    T8v = VBYI(VADD(T8t, T8u));
		    T8y = VSUB(T86, T8l);
	       }
	       {
		    V T8w, T8B, T8x, T8A;
		    T8w = VSUB(T8m, T8v);
		    ST(&(x[WS(ios, 56)]), T8w, dist, &(x[0]));
		    T8B = VADD(T8y, T8z);
		    ST(&(x[WS(ios, 24)]), T8B, dist, &(x[0]));
		    T8x = VADD(T8m, T8v);
		    ST(&(x[WS(ios, 8)]), T8x, dist, &(x[0]));
		    T8A = VSUB(T8y, T8z);
		    ST(&(x[WS(ios, 40)]), T8A, dist, &(x[0]));
	       }
	  }
	  /*
	   * Outputs 60, 12, 52, 4, 36, 20, 44, 28.  This block and the six
	   * that follow share one shape: a compute sub-block producing eight
	   * combined values, then two sub-blocks of four stores each.
	   */
	  {
	       V T90, T9m, T9n, T97, T9a, T9g, T9j, T9b;
	       {
		    V T8S, T9h, T93, T9f, T8Z, T9e, T96, T9i, T8R, T92;
		    T8R = VMUL(LDK(KP707106781), VADD(T8P, T8Q));
		    T8S = VADD(T8O, T8R);
		    T9h = VSUB(T8O, T8R);
		    T92 = VMUL(LDK(KP707106781), VSUB(T8Q, T8P));
		    T93 = VADD(T91, T92);
		    T9f = VSUB(T92, T91);
		    {
			 V T8V, T8Y, T94, T95;
			 T8V = VFMA(LDK(KP923879532), T8T, VMUL(LDK(KP382683432), T8U));
			 T8Y = VFNMS(LDK(KP382683432), T8X, VMUL(LDK(KP923879532), T8W));
			 T8Z = VADD(T8V, T8Y);
			 T9e = VSUB(T8Y, T8V);
			 T94 = VFNMS(LDK(KP382683432), T8T, VMUL(LDK(KP923879532), T8U));
			 T95 = VFMA(LDK(KP382683432), T8W, VMUL(LDK(KP923879532), T8X));
			 T96 = VADD(T94, T95);
			 T9i = VSUB(T95, T94);
		    }
		    T90 = VADD(T8S, T8Z);
		    T9m = VBYI(VADD(T9f, T9e));
		    T9n = VADD(T9h, T9i);
		    T97 = VBYI(VADD(T93, T96));
		    T9a = VSUB(T8S, T8Z);
		    T9g = VBYI(VSUB(T9e, T9f));
		    T9j = VSUB(T9h, T9i);
		    T9b = VBYI(VSUB(T96, T93));
	       }
	       {
		    V T98, T9o, T9p, T99;
		    T98 = VSUB(T90, T97);
		    ST(&(x[WS(ios, 60)]), T98, dist, &(x[0]));
		    T9o = VADD(T9m, T9n);
		    ST(&(x[WS(ios, 12)]), T9o, dist, &(x[0]));
		    T9p = VSUB(T9n, T9m);
		    ST(&(x[WS(ios, 52)]), T9p, dist, &(x[0]));
		    T99 = VADD(T90, T97);
		    ST(&(x[WS(ios, 4)]), T99, dist, &(x[0]));
	       }
	       {
		    V T9c, T9k, T9l, T9d;
		    T9c = VSUB(T9a, T9b);
		    ST(&(x[WS(ios, 36)]), T9c, dist, &(x[0]));
		    T9k = VADD(T9g, T9j);
		    ST(&(x[WS(ios, 20)]), T9k, dist, &(x[0]));
		    T9l = VSUB(T9j, T9g);
		    ST(&(x[WS(ios, 44)]), T9l, dist, &(x[0]));
		    T9d = VADD(T9a, T9b);
		    ST(&(x[WS(ios, 28)]), T9d, dist, &(x[0]));
	       }
	  }
	  /* Outputs 58, 10, 54, 6, 38, 22, 42, 26. */
	  {
	       V T7A, T7W, T7X, T7H, T7K, T7Q, T7T, T7L;
	       {
		    V T7s, T7R, T7G, T7S, T7z, T7O, T7D, T7P;
		    {
			 V T7q, T7r, T7E, T7F;
			 T7q = VSUB(T5W, T63);
			 T7r = VSUB(T71, T70);
			 T7s = VADD(T7q, T7r);
			 T7R = VSUB(T7q, T7r);
			 T7E = VFNMS(LDK(KP555570233), T7t, VMUL(LDK(KP831469612), T7u));
			 T7F = VFMA(LDK(KP555570233), T7w, VMUL(LDK(KP831469612), T7x));
			 T7G = VADD(T7E, T7F);
			 T7S = VSUB(T7F, T7E);
		    }
		    {
			 V T7v, T7y, T7B, T7C;
			 T7v = VFMA(LDK(KP831469612), T7t, VMUL(LDK(KP555570233), T7u));
			 T7y = VFNMS(LDK(KP555570233), T7x, VMUL(LDK(KP831469612), T7w));
			 T7z = VADD(T7v, T7y);
			 T7O = VSUB(T7y, T7v);
			 T7B = VSUB(T6Y, T6X);
			 T7C = VSUB(T6i, T6b);
			 T7D = VADD(T7B, T7C);
			 T7P = VSUB(T7C, T7B);
		    }
		    T7A = VADD(T7s, T7z);
		    T7W = VBYI(VADD(T7P, T7O));
		    T7X = VADD(T7R, T7S);
		    T7H = VBYI(VADD(T7D, T7G));
		    T7K = VSUB(T7s, T7z);
		    T7Q = VBYI(VSUB(T7O, T7P));
		    T7T = VSUB(T7R, T7S);
		    T7L = VBYI(VSUB(T7G, T7D));
	       }
	       {
		    V T7I, T7Y, T7Z, T7J;
		    T7I = VSUB(T7A, T7H);
		    ST(&(x[WS(ios, 58)]), T7I, dist, &(x[0]));
		    T7Y = VADD(T7W, T7X);
		    ST(&(x[WS(ios, 10)]), T7Y, dist, &(x[0]));
		    T7Z = VSUB(T7X, T7W);
		    ST(&(x[WS(ios, 54)]), T7Z, dist, &(x[0]));
		    T7J = VADD(T7A, T7H);
		    ST(&(x[WS(ios, 6)]), T7J, dist, &(x[0]));
	       }
	       {
		    V T7M, T7U, T7V, T7N;
		    T7M = VSUB(T7K, T7L);
		    ST(&(x[WS(ios, 38)]), T7M, dist, &(x[0]));
		    T7U = VADD(T7Q, T7T);
		    ST(&(x[WS(ios, 22)]), T7U, dist, &(x[0]));
		    T7V = VSUB(T7T, T7Q);
		    ST(&(x[WS(ios, 42)]), T7V, dist, &(x[0]));
		    T7N = VADD(T7K, T7L);
		    ST(&(x[WS(ios, 26)]), T7N, dist, &(x[0]));
	       }
	  }
	  /* Outputs 57, 9, 55, 7, 39, 23, 41, 25. */
	  {
	       V T3e, T3M, T3N, T3x, T3A, T3G, T3J, T3B;
	       {
		    V T1u, T3H, T3w, T3I, T3d, T3E, T3t, T3F;
		    {
			 V TE, T1t, T3u, T3v;
			 TE = VSUB(Tg, TD);
			 T1t = VSUB(T13, T1s);
			 T1u = VADD(TE, T1t);
			 T3H = VSUB(TE, T1t);
			 T3u = VFNMS(LDK(KP634393284), T2k, VMUL(LDK(KP773010453), T29));
			 T3v = VFMA(LDK(KP773010453), T3b, VMUL(LDK(KP634393284), T30));
			 T3w = VADD(T3u, T3v);
			 T3I = VSUB(T3v, T3u);
		    }
		    {
			 V T2l, T3c, T3p, T3s;
			 T2l = VFMA(LDK(KP634393284), T29, VMUL(LDK(KP773010453), T2k));
			 T3c = VFNMS(LDK(KP634393284), T3b, VMUL(LDK(KP773010453), T30));
			 T3d = VADD(T2l, T3c);
			 T3E = VSUB(T3c, T2l);
			 T3p = VSUB(T3h, T3o);
			 T3s = VSUB(T3q, T3r);
			 T3t = VADD(T3p, T3s);
			 T3F = VSUB(T3s, T3p);
		    }
		    T3e = VADD(T1u, T3d);
		    T3M = VBYI(VADD(T3F, T3E));
		    T3N = VADD(T3H, T3I);
		    T3x = VBYI(VADD(T3t, T3w));
		    T3A = VSUB(T1u, T3d);
		    T3G = VBYI(VSUB(T3E, T3F));
		    T3J = VSUB(T3H, T3I);
		    T3B = VBYI(VSUB(T3w, T3t));
	       }
	       {
		    V T3y, T3O, T3P, T3z;
		    T3y = VSUB(T3e, T3x);
		    ST(&(x[WS(ios, 57)]), T3y, dist, &(x[WS(ios, 1)]));
		    T3O = VADD(T3M, T3N);
		    ST(&(x[WS(ios, 9)]), T3O, dist, &(x[WS(ios, 1)]));
		    T3P = VSUB(T3N, T3M);
		    ST(&(x[WS(ios, 55)]), T3P, dist, &(x[WS(ios, 1)]));
		    T3z = VADD(T3e, T3x);
		    ST(&(x[WS(ios, 7)]), T3z, dist, &(x[WS(ios, 1)]));
	       }
	       {
		    V T3C, T3K, T3L, T3D;
		    T3C = VSUB(T3A, T3B);
		    ST(&(x[WS(ios, 39)]), T3C, dist, &(x[WS(ios, 1)]));
		    T3K = VADD(T3G, T3J);
		    ST(&(x[WS(ios, 23)]), T3K, dist, &(x[WS(ios, 1)]));
		    T3L = VSUB(T3J, T3G);
		    ST(&(x[WS(ios, 41)]), T3L, dist, &(x[WS(ios, 1)]));
		    T3D = VADD(T3A, T3B);
		    ST(&(x[WS(ios, 25)]), T3D, dist, &(x[WS(ios, 1)]));
	       }
	  }
	  /* Outputs 61, 13, 51, 3, 35, 19, 45, 29. */
	  {
	       V T4Q, T5g, T5h, T51, T54, T5a, T5d, T55;
	       {
		    V T4A, T5b, T50, T5c, T4P, T58, T4X, T59;
		    {
			 V T4s, T4z, T4Y, T4Z;
			 T4s = VADD(T4q, T4r);
			 T4z = VADD(T4v, T4y);
			 T4A = VADD(T4s, T4z);
			 T5b = VSUB(T4s, T4z);
			 T4Y = VFNMS(LDK(KP290284677), T4D, VMUL(LDK(KP956940335), T4G));
			 T4Z = VFMA(LDK(KP290284677), T4K, VMUL(LDK(KP956940335), T4N));
			 T50 = VADD(T4Y, T4Z);
			 T5c = VSUB(T4Z, T4Y);
		    }
		    {
			 V T4H, T4O, T4T, T4W;
			 T4H = VFMA(LDK(KP956940335), T4D, VMUL(LDK(KP290284677), T4G));
			 T4O = VFNMS(LDK(KP290284677), T4N, VMUL(LDK(KP956940335), T4K));
			 T4P = VADD(T4H, T4O);
			 T58 = VSUB(T4O, T4H);
			 T4T = VADD(T4R, T4S);
			 T4W = VADD(T4U, T4V);
			 T4X = VADD(T4T, T4W);
			 T59 = VSUB(T4W, T4T);
		    }
		    T4Q = VADD(T4A, T4P);
		    T5g = VBYI(VADD(T59, T58));
		    T5h = VADD(T5b, T5c);
		    T51 = VBYI(VADD(T4X, T50));
		    T54 = VSUB(T4A, T4P);
		    T5a = VBYI(VSUB(T58, T59));
		    T5d = VSUB(T5b, T5c);
		    T55 = VBYI(VSUB(T50, T4X));
	       }
	       {
		    V T52, T5i, T5j, T53;
		    T52 = VSUB(T4Q, T51);
		    ST(&(x[WS(ios, 61)]), T52, dist, &(x[WS(ios, 1)]));
		    T5i = VADD(T5g, T5h);
		    ST(&(x[WS(ios, 13)]), T5i, dist, &(x[WS(ios, 1)]));
		    T5j = VSUB(T5h, T5g);
		    ST(&(x[WS(ios, 51)]), T5j, dist, &(x[WS(ios, 1)]));
		    T53 = VADD(T4Q, T51);
		    ST(&(x[WS(ios, 3)]), T53, dist, &(x[WS(ios, 1)]));
	       }
	       {
		    V T56, T5e, T5f, T57;
		    T56 = VSUB(T54, T55);
		    ST(&(x[WS(ios, 35)]), T56, dist, &(x[WS(ios, 1)]));
		    T5e = VADD(T5a, T5d);
		    ST(&(x[WS(ios, 19)]), T5e, dist, &(x[WS(ios, 1)]));
		    T5f = VSUB(T5d, T5a);
		    ST(&(x[WS(ios, 45)]), T5f, dist, &(x[WS(ios, 1)]));
		    T57 = VADD(T54, T55);
		    ST(&(x[WS(ios, 29)]), T57, dist, &(x[WS(ios, 1)]));
	       }
	  }
	  /* Outputs 62, 14, 50, 2, 34, 18, 46, 30. */
	  {
	       V T6U, T7m, T7n, T77, T7a, T7g, T7j, T7b;
	       {
		    V T6k, T7h, T76, T7i, T6T, T7e, T73, T7f;
		    {
			 V T64, T6j, T74, T75;
			 T64 = VADD(T5W, T63);
			 T6j = VADD(T6b, T6i);
			 T6k = VADD(T64, T6j);
			 T7h = VSUB(T64, T6j);
			 T74 = VFNMS(LDK(KP195090322), T6v, VMUL(LDK(KP980785280), T6A));
			 T75 = VFMA(LDK(KP195090322), T6M, VMUL(LDK(KP980785280), T6R));
			 T76 = VADD(T74, T75);
			 T7i = VSUB(T75, T74);
		    }
		    {
			 V T6B, T6S, T6Z, T72;
			 T6B = VFMA(LDK(KP980785280), T6v, VMUL(LDK(KP195090322), T6A));
			 T6S = VFNMS(LDK(KP195090322), T6R, VMUL(LDK(KP980785280), T6M));
			 T6T = VADD(T6B, T6S);
			 T7e = VSUB(T6S, T6B);
			 T6Z = VADD(T6X, T6Y);
			 T72 = VADD(T70, T71);
			 T73 = VADD(T6Z, T72);
			 T7f = VSUB(T72, T6Z);
		    }
		    T6U = VADD(T6k, T6T);
		    T7m = VBYI(VADD(T7f, T7e));
		    T7n = VADD(T7h, T7i);
		    T77 = VBYI(VADD(T73, T76));
		    T7a = VSUB(T6k, T6T);
		    T7g = VBYI(VSUB(T7e, T7f));
		    T7j = VSUB(T7h, T7i);
		    T7b = VBYI(VSUB(T76, T73));
	       }
	       {
		    V T78, T7o, T7p, T79;
		    T78 = VSUB(T6U, T77);
		    ST(&(x[WS(ios, 62)]), T78, dist, &(x[0]));
		    T7o = VADD(T7m, T7n);
		    ST(&(x[WS(ios, 14)]), T7o, dist, &(x[0]));
		    T7p = VSUB(T7n, T7m);
		    ST(&(x[WS(ios, 50)]), T7p, dist, &(x[0]));
		    T79 = VADD(T6U, T77);
		    ST(&(x[WS(ios, 2)]), T79, dist, &(x[0]));
	       }
	       {
		    V T7c, T7k, T7l, T7d;
		    T7c = VSUB(T7a, T7b);
		    ST(&(x[WS(ios, 34)]), T7c, dist, &(x[0]));
		    T7k = VADD(T7g, T7j);
		    ST(&(x[WS(ios, 18)]), T7k, dist, &(x[0]));
		    T7l = VSUB(T7j, T7g);
		    ST(&(x[WS(ios, 46)]), T7l, dist, &(x[0]));
		    T7d = VADD(T7a, T7b);
		    ST(&(x[WS(ios, 30)]), T7d, dist, &(x[0]));
	       }
	  }
	  /* Outputs 63, 15, 49, 1, 33, 17, 47, 31. */
	  {
	       V T40, T4m, T4n, T47, T4a, T4g, T4j, T4b;
	       {
		    V T3S, T4h, T46, T4i, T3Z, T4e, T43, T4f;
		    {
			 V T3Q, T3R, T44, T45;
			 T3Q = VADD(Tg, TD);
			 T3R = VADD(T3r, T3q);
			 T3S = VADD(T3Q, T3R);
			 T4h = VSUB(T3Q, T3R);
			 T44 = VFNMS(LDK(KP098017140), T3U, VMUL(LDK(KP995184726), T3T));
			 T45 = VFMA(LDK(KP995184726), T3X, VMUL(LDK(KP098017140), T3W));
			 T46 = VADD(T44, T45);
			 T4i = VSUB(T45, T44);
		    }
		    {
			 V T3V, T3Y, T41, T42;
			 T3V = VFMA(LDK(KP098017140), T3T, VMUL(LDK(KP995184726), T3U));
			 T3Y = VFNMS(LDK(KP098017140), T3X, VMUL(LDK(KP995184726), T3W));
			 T3Z = VADD(T3V, T3Y);
			 T4e = VSUB(T3Y, T3V);
			 T41 = VADD(T3o, T3h);
			 T42 = VADD(T1s, T13);
			 T43 = VADD(T41, T42);
			 T4f = VSUB(T42, T41);
		    }
		    T40 = VADD(T3S, T3Z);
		    T4m = VBYI(VADD(T4f, T4e));
		    T4n = VADD(T4h, T4i);
		    T47 = VBYI(VADD(T43, T46));
		    T4a = VSUB(T3S, T3Z);
		    T4g = VBYI(VSUB(T4e, T4f));
		    T4j = VSUB(T4h, T4i);
		    T4b = VBYI(VSUB(T46, T43));
	       }
	       {
		    V T48, T4o, T4p, T49;
		    T48 = VSUB(T40, T47);
		    ST(&(x[WS(ios, 63)]), T48, dist, &(x[WS(ios, 1)]));
		    T4o = VADD(T4m, T4n);
		    ST(&(x[WS(ios, 15)]), T4o, dist, &(x[WS(ios, 1)]));
		    T4p = VSUB(T4n, T4m);
		    ST(&(x[WS(ios, 49)]), T4p, dist, &(x[WS(ios, 1)]));
		    T49 = VADD(T40, T47);
		    ST(&(x[WS(ios, 1)]), T49, dist, &(x[WS(ios, 1)]));
	       }
	       {
		    V T4c, T4k, T4l, T4d;
		    T4c = VSUB(T4a, T4b);
		    ST(&(x[WS(ios, 33)]), T4c, dist, &(x[WS(ios, 1)]));
		    T4k = VADD(T4g, T4j);
		    ST(&(x[WS(ios, 17)]), T4k, dist, &(x[WS(ios, 1)]));
		    T4l = VSUB(T4j, T4g);
		    ST(&(x[WS(ios, 47)]), T4l, dist, &(x[WS(ios, 1)]));
		    T4d = VADD(T4a, T4b);
		    ST(&(x[WS(ios, 31)]), T4d, dist, &(x[WS(ios, 1)]));
	       }
	  }
	  /* Outputs 59, 11, 53, 5, 37, 21, 43, 27. */
	  {
	       V T5u, T5Q, T5R, T5B, T5E, T5K, T5N, T5F;
	       {
		    V T5m, T5L, T5A, T5M, T5t, T5I, T5x, T5J;
		    {
			 V T5k, T5l, T5y, T5z;
			 T5k = VSUB(T4q, T4r);
			 T5l = VSUB(T4V, T4U);
			 T5m = VADD(T5k, T5l);
			 T5L = VSUB(T5k, T5l);
			 T5y = VFNMS(LDK(KP471396736), T5n, VMUL(LDK(KP881921264), T5o));
			 T5z = VFMA(LDK(KP471396736), T5q, VMUL(LDK(KP881921264), T5r));
			 T5A = VADD(T5y, T5z);
			 T5M = VSUB(T5z, T5y);
		    }
		    {
			 V T5p, T5s, T5v, T5w;
			 T5p = VFMA(LDK(KP881921264), T5n, VMUL(LDK(KP471396736), T5o));
			 T5s = VFNMS(LDK(KP471396736), T5r, VMUL(LDK(KP881921264), T5q));
			 T5t = VADD(T5p, T5s);
			 T5I = VSUB(T5s, T5p);
			 T5v = VSUB(T4S, T4R);
			 T5w = VSUB(T4y, T4v);
			 T5x = VADD(T5v, T5w);
			 T5J = VSUB(T5w, T5v);
		    }
		    T5u = VADD(T5m, T5t);
		    T5Q = VBYI(VADD(T5J, T5I));
		    T5R = VADD(T5L, T5M);
		    T5B = VBYI(VADD(T5x, T5A));
		    T5E = VSUB(T5m, T5t);
		    T5K = VBYI(VSUB(T5I, T5J));
		    T5N = VSUB(T5L, T5M);
		    T5F = VBYI(VSUB(T5A, T5x));
	       }
	       {
		    V T5C, T5S, T5T, T5D;
		    T5C = VSUB(T5u, T5B);
		    ST(&(x[WS(ios, 59)]), T5C, dist, &(x[WS(ios, 1)]));
		    T5S = VADD(T5Q, T5R);
		    ST(&(x[WS(ios, 11)]), T5S, dist, &(x[WS(ios, 1)]));
		    T5T = VSUB(T5R, T5Q);
		    ST(&(x[WS(ios, 53)]), T5T, dist, &(x[WS(ios, 1)]));
		    T5D = VADD(T5u, T5B);
		    ST(&(x[WS(ios, 5)]), T5D, dist, &(x[WS(ios, 1)]));
	       }
	       {
		    V T5G, T5O, T5P, T5H;
		    T5G = VSUB(T5E, T5F);
		    ST(&(x[WS(ios, 37)]), T5G, dist, &(x[WS(ios, 1)]));
		    T5O = VADD(T5K, T5N);
		    ST(&(x[WS(ios, 21)]), T5O, dist, &(x[WS(ios, 1)]));
		    T5P = VSUB(T5N, T5K);
		    ST(&(x[WS(ios, 43)]), T5P, dist, &(x[WS(ios, 1)]));
		    T5H = VADD(T5E, T5F);
		    ST(&(x[WS(ios, 27)]), T5H, dist, &(x[WS(ios, 1)]));
	       }
	  }
     }
     END_SIMD();
     /* W has been advanced by TWVL * 126 for every iteration executed. */
     return W;
}

/*
 * Twiddle instruction list for the size-64 codelet: one VTW entry for each
 * index 1..63 (index 0 needs none, matching the untwiddled x[0] load in
 * t1fv_64), closed by a TW_NEXT record carrying VL.
 */
static const tw_instr twinstr[] = {
     VTW(1),  VTW(2),  VTW(3),  VTW(4),  VTW(5),  VTW(6),  VTW(7),  VTW(8),
     VTW(9),  VTW(10), VTW(11), VTW(12), VTW(13), VTW(14), VTW(15), VTW(16),
     VTW(17), VTW(18), VTW(19), VTW(20), VTW(21), VTW(22), VTW(23), VTW(24),
     VTW(25), VTW(26), VTW(27), VTW(28), VTW(29), VTW(30), VTW(31), VTW(32),
     VTW(33), VTW(34), VTW(35), VTW(36), VTW(37), VTW(38), VTW(39), VTW(40),
     VTW(41), VTW(42), VTW(43), VTW(44), VTW(45), VTW(46), VTW(47), VTW(48),
     VTW(49), VTW(50), VTW(51), VTW(52), VTW(53), VTW(54), VTW(55), VTW(56),
     VTW(57), VTW(58), VTW(59), VTW(60), VTW(61), VTW(62), VTW(63),
     {TW_NEXT, VL, 0}
};

/*
 * Descriptor handed to the planner together with t1fv_64.
 * NOTE(review): the ct_desc layout is declared elsewhere; the field notes
 * below are read off the values only -- confirm against the declaration.
 */
static const ct_desc desc = {
     64,			/* transform size (generator option "-n 64") */
     "t1fv_64",			/* codelet name */
     twinstr,			/* twiddle instruction list defined above */
     {467, 198, 52, 0},		/* 467 adds, 198 muls, 52 fused multiply/adds (see file header) */
     &GENUS,
     0,
     0,
     0
};

/*
 * Registration hook: hands the t1fv_64 routine and its descriptor to the
 * planner p through kdft_dit_register.
 */
void X(codelet_t1fv_64)(planner *p)
{
     X(kdft_dit_register)(p, t1fv_64, &desc);
}
