/*
 * Copyright (c) 2003 Matteo Frigo
 * Copyright (c) 2003 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sat Apr 19 18:05:47 EDT 2003 */

#include "codelet-dft.h"

/* Generated by: /homea/athena/fftw3/genfft/gen_twiddle_c -simd -trivial-stores -compact -variables 4 -n 64 -name t1bv_64 -include t1b.h -sign 1 */

/*
 * This function contains 519 FP additions, 250 FP multiplications,
 * (or, 467 additions, 198 multiplications, 52 fused multiply/add),
 * 111 stack variables, and 128 memory accesses
 */
/*
 * Generator Id's : 
 * $Id: algsimp.ml,v 1.7 2003/03/15 20:29:42 stevenj Exp $
 * $Id: fft.ml,v 1.2 2003/03/15 20:29:42 stevenj Exp $
 * $Id: gen_twiddle_c.ml,v 1.7 2003/04/16 19:51:27 athena Exp $
 */

#include "t1b.h"

/*
 * t1bv_64: generated SIMD twiddle codelet of size 64 (generator options
 * "-n 64 -sign 1").
 *
 * The routine works in place on the array addressed through ii; ri is not
 * referenced anywhere in the body.  Every pass of the loop
 *   - loads the 64 elements x[WS(ios, k)], k = 0..63,
 *   - multiplies each element except k = 0 by a twiddle factor via BYTW
 *     (element k uses the entry at W[TWVL * 2 * (k - 1)]),
 *   - combines them through the add/sub/rotate network below, and
 *   - stores 64 results back to the same x[WS(ios, k)] slots.
 * All loads of a pass are issued before its first store.
 *
 * The loop counter starts at m and decreases by VL while it is positive;
 * per pass x advances by VL * dist and W by TWVL * 126.
 *
 * Returns the advanced W pointer.
 */
static const R *t1bv_64(R *ri, R *ii, const R *W, stride ios, int m, int dist)
{
     /*
      * Numeric constants: cos/sin values of multiples of pi/32
      * (KP707106781 = sqrt(1/2); KP923879532/KP382683432 = cos/sin(pi/8);
      * KP980785280/KP195090322 = cos/sin(pi/16);
      * KP831469612/KP555570233 = cos/sin(3*pi/16);
      * KP995184726/KP098017140 = cos/sin(pi/32);
      * KP956940335/KP290284677 = cos/sin(3*pi/32);
      * KP881921264/KP471396736 = cos/sin(5*pi/32);
      * KP773010453/KP634393284 = cos/sin(7*pi/32)).
      */
     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     int i;
     R *x;
     /* All loads and stores below go through x, which aliases ii. */
     x = ii;
     BEGIN_SIMD();
     for (i = m; i > 0; i = i - VL, x = x + (VL * dist), W = W + (TWVL * 126)) {
	  /* Intermediates that are produced by the load stages and consumed by the store stages. */
	  V T5W, T71, Tg, T4R, T3r, T4M, T8t, T93, TD, T4N, T8o, T94, T3k, T4S, T63;
	  V T6Y, T6b, T6V, T8h, T8Z, T13, T4U, T3f, T4H, T6i, T6W, T8k, T90, T1s, T4V;
	  V T3g, T4K, T89, T8V, T8c, T8W, T6M, T7x, T6R, T7w, T30, T5n, T5o, T3b, T3W;
	  V T4z, T4C, T3X, T82, T8S, T85, T8T, T6v, T7u, T6A, T7t, T29, T5k, T5l, T2k;
	  V T3T, T4s, T4v, T3U;
	  /* Load stage: inputs 0, 32, 16, 48, 8, 40, 56, 24 (input 0 is used without a twiddle multiply). */
	  {
	       V T1, T3, T5U, T3n, T3p, T5V, T9, T6Z, Te, T70, T2, T3m, T3o, T4, Tf;
	       T1 = LD(&(x[0]), dist, &(x[0]));
	       T2 = LD(&(x[WS(ios, 32)]), dist, &(x[0]));
	       T3 = BYTW(&(W[TWVL * 62]), T2);
	       T5U = VADD(T1, T3);
	       T3m = LD(&(x[WS(ios, 16)]), dist, &(x[0]));
	       T3n = BYTW(&(W[TWVL * 30]), T3m);
	       T3o = LD(&(x[WS(ios, 48)]), dist, &(x[0]));
	       T3p = BYTW(&(W[TWVL * 94]), T3o);
	       T5V = VADD(T3n, T3p);
	       {
		    V T6, T8, T5, T7;
		    T5 = LD(&(x[WS(ios, 8)]), dist, &(x[0]));
		    T6 = BYTW(&(W[TWVL * 14]), T5);
		    T7 = LD(&(x[WS(ios, 40)]), dist, &(x[0]));
		    T8 = BYTW(&(W[TWVL * 78]), T7);
		    T9 = VSUB(T6, T8);
		    T6Z = VADD(T6, T8);
	       }
	       {
		    V Tb, Td, Ta, Tc;
		    Ta = LD(&(x[WS(ios, 56)]), dist, &(x[0]));
		    Tb = BYTW(&(W[TWVL * 110]), Ta);
		    Tc = LD(&(x[WS(ios, 24)]), dist, &(x[0]));
		    Td = BYTW(&(W[TWVL * 46]), Tc);
		    Te = VSUB(Tb, Td);
		    T70 = VADD(Tb, Td);
	       }
	       T5W = VSUB(T5U, T5V);
	       T71 = VSUB(T6Z, T70);
	       T4 = VSUB(T1, T3);
	       Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
	       Tg = VSUB(T4, Tf);
	       T4R = VADD(T4, Tf);
	       {
		    V T3l, T3q, T8r, T8s;
		    T3l = VMUL(LDK(KP707106781), VSUB(T9, Te));
		    T3q = VSUB(T3n, T3p);
		    T3r = VSUB(T3l, T3q);
		    T4M = VADD(T3q, T3l);
		    T8r = VADD(T5U, T5V);
		    T8s = VADD(T6Z, T70);
		    T8t = VSUB(T8r, T8s);
		    T93 = VADD(T8r, T8s);
	       }
	  }
	  /* Load stage: inputs 4, 36, 60, 28, 20, 52, 12, 44. */
	  {
	       V Tl, T5X, TB, T60, Tq, T5Y, Tw, T61;
	       {
		    V Ti, Tk, Th, Tj;
		    Th = LD(&(x[WS(ios, 4)]), dist, &(x[0]));
		    Ti = BYTW(&(W[TWVL * 6]), Th);
		    Tj = LD(&(x[WS(ios, 36)]), dist, &(x[0]));
		    Tk = BYTW(&(W[TWVL * 70]), Tj);
		    Tl = VSUB(Ti, Tk);
		    T5X = VADD(Ti, Tk);
	       }
	       {
		    V Ty, TA, Tx, Tz;
		    Tx = LD(&(x[WS(ios, 60)]), dist, &(x[0]));
		    Ty = BYTW(&(W[TWVL * 118]), Tx);
		    Tz = LD(&(x[WS(ios, 28)]), dist, &(x[0]));
		    TA = BYTW(&(W[TWVL * 54]), Tz);
		    TB = VSUB(Ty, TA);
		    T60 = VADD(Ty, TA);
	       }
	       {
		    V Tn, Tp, Tm, To;
		    Tm = LD(&(x[WS(ios, 20)]), dist, &(x[0]));
		    Tn = BYTW(&(W[TWVL * 38]), Tm);
		    To = LD(&(x[WS(ios, 52)]), dist, &(x[0]));
		    Tp = BYTW(&(W[TWVL * 102]), To);
		    Tq = VSUB(Tn, Tp);
		    T5Y = VADD(Tn, Tp);
	       }
	       {
		    V Tt, Tv, Ts, Tu;
		    Ts = LD(&(x[WS(ios, 12)]), dist, &(x[0]));
		    Tt = BYTW(&(W[TWVL * 22]), Ts);
		    Tu = LD(&(x[WS(ios, 44)]), dist, &(x[0]));
		    Tv = BYTW(&(W[TWVL * 86]), Tu);
		    Tw = VSUB(Tt, Tv);
		    T61 = VADD(Tt, Tv);
	       }
	       {
		    V Tr, TC, T8m, T8n;
		    Tr = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
		    TC = VFNMS(LDK(KP382683432), TB, VMUL(LDK(KP923879532), Tw));
		    TD = VSUB(Tr, TC);
		    T4N = VADD(Tr, TC);
		    T8m = VADD(T5X, T5Y);
		    T8n = VADD(T60, T61);
		    T8o = VSUB(T8m, T8n);
		    T94 = VADD(T8m, T8n);
	       }
	       {
		    V T3i, T3j, T5Z, T62;
		    T3i = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
		    T3j = VFMA(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
		    T3k = VSUB(T3i, T3j);
		    T4S = VADD(T3i, T3j);
		    T5Z = VSUB(T5X, T5Y);
		    T62 = VSUB(T60, T61);
		    T63 = VMUL(LDK(KP707106781), VADD(T5Z, T62));
		    T6Y = VMUL(LDK(KP707106781), VSUB(T5Z, T62));
	       }
	  }
	  /* Load stage: inputs 10, 42, 2, 34, 58, 26, 18, 50. */
	  {
	       V TJ, T68, T10, T65, TO, T69, TU, T66;
	       {
		    V TG, TI, TF, TH;
		    TF = LD(&(x[WS(ios, 10)]), dist, &(x[0]));
		    TG = BYTW(&(W[TWVL * 18]), TF);
		    TH = LD(&(x[WS(ios, 42)]), dist, &(x[0]));
		    TI = BYTW(&(W[TWVL * 82]), TH);
		    TJ = VSUB(TG, TI);
		    T68 = VADD(TG, TI);
	       }
	       {
		    V TX, TZ, TW, TY;
		    TW = LD(&(x[WS(ios, 2)]), dist, &(x[0]));
		    TX = BYTW(&(W[TWVL * 2]), TW);
		    TY = LD(&(x[WS(ios, 34)]), dist, &(x[0]));
		    TZ = BYTW(&(W[TWVL * 66]), TY);
		    T10 = VSUB(TX, TZ);
		    T65 = VADD(TX, TZ);
	       }
	       {
		    V TL, TN, TK, TM;
		    TK = LD(&(x[WS(ios, 58)]), dist, &(x[0]));
		    TL = BYTW(&(W[TWVL * 114]), TK);
		    TM = LD(&(x[WS(ios, 26)]), dist, &(x[0]));
		    TN = BYTW(&(W[TWVL * 50]), TM);
		    TO = VSUB(TL, TN);
		    T69 = VADD(TL, TN);
	       }
	       {
		    V TR, TT, TQ, TS;
		    TQ = LD(&(x[WS(ios, 18)]), dist, &(x[0]));
		    TR = BYTW(&(W[TWVL * 34]), TQ);
		    TS = LD(&(x[WS(ios, 50)]), dist, &(x[0]));
		    TT = BYTW(&(W[TWVL * 98]), TS);
		    TU = VSUB(TR, TT);
		    T66 = VADD(TR, TT);
	       }
	       {
		    V T67, T6a, T8f, T8g;
		    T67 = VSUB(T65, T66);
		    T6a = VSUB(T68, T69);
		    T6b = VFMA(LDK(KP382683432), T67, VMUL(LDK(KP923879532), T6a));
		    T6V = VFNMS(LDK(KP382683432), T6a, VMUL(LDK(KP923879532), T67));
		    T8f = VADD(T65, T66);
		    T8g = VADD(T68, T69);
		    T8h = VSUB(T8f, T8g);
		    T8Z = VADD(T8f, T8g);
		    {
			 V TV, T4F, T12, T4G, TP, T11;
			 TP = VMUL(LDK(KP707106781), VSUB(TJ, TO));
			 TV = VSUB(TP, TU);
			 T4F = VADD(TU, TP);
			 T11 = VMUL(LDK(KP707106781), VADD(TJ, TO));
			 T12 = VSUB(T10, T11);
			 T4G = VADD(T10, T11);
			 T13 = VFMA(LDK(KP831469612), TV, VMUL(LDK(KP555570233), T12));
			 T4U = VFNMS(LDK(KP195090322), T4F, VMUL(LDK(KP980785280), T4G));
			 T3f = VFNMS(LDK(KP555570233), TV, VMUL(LDK(KP831469612), T12));
			 T4H = VFMA(LDK(KP980785280), T4F, VMUL(LDK(KP195090322), T4G));
		    }
	       }
	  }
	  /* Load stage: inputs 6, 38, 62, 30, 54, 22, 14, 46. */
	  {
	       V T18, T6c, T1p, T6f, T1d, T6d, T1j, T6g;
	       {
		    V T15, T17, T14, T16;
		    T14 = LD(&(x[WS(ios, 6)]), dist, &(x[0]));
		    T15 = BYTW(&(W[TWVL * 10]), T14);
		    T16 = LD(&(x[WS(ios, 38)]), dist, &(x[0]));
		    T17 = BYTW(&(W[TWVL * 74]), T16);
		    T18 = VSUB(T15, T17);
		    T6c = VADD(T15, T17);
	       }
	       {
		    V T1m, T1o, T1l, T1n;
		    T1l = LD(&(x[WS(ios, 62)]), dist, &(x[0]));
		    T1m = BYTW(&(W[TWVL * 122]), T1l);
		    T1n = LD(&(x[WS(ios, 30)]), dist, &(x[0]));
		    T1o = BYTW(&(W[TWVL * 58]), T1n);
		    T1p = VSUB(T1m, T1o);
		    T6f = VADD(T1m, T1o);
	       }
	       {
		    V T1a, T1c, T19, T1b;
		    T19 = LD(&(x[WS(ios, 54)]), dist, &(x[0]));
		    T1a = BYTW(&(W[TWVL * 106]), T19);
		    T1b = LD(&(x[WS(ios, 22)]), dist, &(x[0]));
		    T1c = BYTW(&(W[TWVL * 42]), T1b);
		    T1d = VSUB(T1a, T1c);
		    T6d = VADD(T1a, T1c);
	       }
	       {
		    V T1g, T1i, T1f, T1h;
		    T1f = LD(&(x[WS(ios, 14)]), dist, &(x[0]));
		    T1g = BYTW(&(W[TWVL * 26]), T1f);
		    T1h = LD(&(x[WS(ios, 46)]), dist, &(x[0]));
		    T1i = BYTW(&(W[TWVL * 90]), T1h);
		    T1j = VSUB(T1g, T1i);
		    T6g = VADD(T1g, T1i);
	       }
	       {
		    V T6e, T6h, T8i, T8j;
		    T6e = VSUB(T6c, T6d);
		    T6h = VSUB(T6f, T6g);
		    T6i = VFNMS(LDK(KP382683432), T6h, VMUL(LDK(KP923879532), T6e));
		    T6W = VFMA(LDK(KP923879532), T6h, VMUL(LDK(KP382683432), T6e));
		    T8i = VADD(T6f, T6g);
		    T8j = VADD(T6c, T6d);
		    T8k = VSUB(T8i, T8j);
		    T90 = VADD(T8i, T8j);
		    {
			 V T1k, T4I, T1r, T4J, T1e, T1q;
			 T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
			 T1k = VSUB(T1e, T1j);
			 T4I = VADD(T1j, T1e);
			 T1q = VMUL(LDK(KP707106781), VADD(T18, T1d));
			 T1r = VSUB(T1p, T1q);
			 T4J = VADD(T1p, T1q);
			 T1s = VFNMS(LDK(KP555570233), T1r, VMUL(LDK(KP831469612), T1k));
			 T4V = VFMA(LDK(KP195090322), T4I, VMUL(LDK(KP980785280), T4J));
			 T3g = VFMA(LDK(KP555570233), T1k, VMUL(LDK(KP831469612), T1r));
			 T4K = VFNMS(LDK(KP195090322), T4J, VMUL(LDK(KP980785280), T4I));
		    }
	       }
	  }
	  /*
	   * Load stage: the sixteen inputs with index congruent to 3 mod 4.
	   * From here on the third LD argument is &(x[WS(ios, 1)]) instead of
	   * &(x[0]); NOTE(review): presumably an alignment/parity hint for the
	   * SIMD load macro -- confirm against the LD definition.
	   */
	  {
	       V T2q, T6C, T2Y, T6O, T35, T6N, T2v, T6D, T2B, T6F, T2N, T6J, T2S, T6K, T2G;
	       V T6G;
	       {
		    V T2n, T2p, T2m, T2o;
		    T2m = LD(&(x[WS(ios, 3)]), dist, &(x[WS(ios, 1)]));
		    T2n = BYTW(&(W[TWVL * 4]), T2m);
		    T2o = LD(&(x[WS(ios, 35)]), dist, &(x[WS(ios, 1)]));
		    T2p = BYTW(&(W[TWVL * 68]), T2o);
		    T2q = VSUB(T2n, T2p);
		    T6C = VADD(T2n, T2p);
	       }
	       {
		    V T2V, T2X, T2U, T2W;
		    T2U = LD(&(x[WS(ios, 15)]), dist, &(x[WS(ios, 1)]));
		    T2V = BYTW(&(W[TWVL * 28]), T2U);
		    T2W = LD(&(x[WS(ios, 47)]), dist, &(x[WS(ios, 1)]));
		    T2X = BYTW(&(W[TWVL * 92]), T2W);
		    T2Y = VSUB(T2V, T2X);
		    T6O = VADD(T2V, T2X);
	       }
	       {
		    V T32, T34, T31, T33;
		    T31 = LD(&(x[WS(ios, 63)]), dist, &(x[WS(ios, 1)]));
		    T32 = BYTW(&(W[TWVL * 124]), T31);
		    T33 = LD(&(x[WS(ios, 31)]), dist, &(x[WS(ios, 1)]));
		    T34 = BYTW(&(W[TWVL * 60]), T33);
		    T35 = VSUB(T32, T34);
		    T6N = VADD(T32, T34);
	       }
	       {
		    V T2s, T2u, T2r, T2t;
		    T2r = LD(&(x[WS(ios, 19)]), dist, &(x[WS(ios, 1)]));
		    T2s = BYTW(&(W[TWVL * 36]), T2r);
		    T2t = LD(&(x[WS(ios, 51)]), dist, &(x[WS(ios, 1)]));
		    T2u = BYTW(&(W[TWVL * 100]), T2t);
		    T2v = VSUB(T2s, T2u);
		    T6D = VADD(T2s, T2u);
	       }
	       {
		    V T2y, T2A, T2x, T2z;
		    T2x = LD(&(x[WS(ios, 59)]), dist, &(x[WS(ios, 1)]));
		    T2y = BYTW(&(W[TWVL * 116]), T2x);
		    T2z = LD(&(x[WS(ios, 27)]), dist, &(x[WS(ios, 1)]));
		    T2A = BYTW(&(W[TWVL * 52]), T2z);
		    T2B = VSUB(T2y, T2A);
		    T6F = VADD(T2y, T2A);
	       }
	       {
		    V T2K, T2M, T2J, T2L;
		    T2J = LD(&(x[WS(ios, 7)]), dist, &(x[WS(ios, 1)]));
		    T2K = BYTW(&(W[TWVL * 12]), T2J);
		    T2L = LD(&(x[WS(ios, 39)]), dist, &(x[WS(ios, 1)]));
		    T2M = BYTW(&(W[TWVL * 76]), T2L);
		    T2N = VSUB(T2K, T2M);
		    T6J = VADD(T2K, T2M);
	       }
	       {
		    V T2P, T2R, T2O, T2Q;
		    T2O = LD(&(x[WS(ios, 55)]), dist, &(x[WS(ios, 1)]));
		    T2P = BYTW(&(W[TWVL * 108]), T2O);
		    T2Q = LD(&(x[WS(ios, 23)]), dist, &(x[WS(ios, 1)]));
		    T2R = BYTW(&(W[TWVL * 44]), T2Q);
		    T2S = VSUB(T2P, T2R);
		    T6K = VADD(T2P, T2R);
	       }
	       {
		    V T2D, T2F, T2C, T2E;
		    T2C = LD(&(x[WS(ios, 11)]), dist, &(x[WS(ios, 1)]));
		    T2D = BYTW(&(W[TWVL * 20]), T2C);
		    T2E = LD(&(x[WS(ios, 43)]), dist, &(x[WS(ios, 1)]));
		    T2F = BYTW(&(W[TWVL * 84]), T2E);
		    T2G = VSUB(T2D, T2F);
		    T6G = VADD(T2D, T2F);
	       }
	       {
		    V T87, T88, T8a, T8b;
		    T87 = VADD(T6N, T6O);
		    T88 = VADD(T6J, T6K);
		    T89 = VSUB(T87, T88);
		    T8V = VADD(T87, T88);
		    T8a = VADD(T6C, T6D);
		    T8b = VADD(T6F, T6G);
		    T8c = VSUB(T8a, T8b);
		    T8W = VADD(T8a, T8b);
	       }
	       {
		    V T6L, T6P, T6I, T6Q, T6E, T6H;
		    T6L = VSUB(T6J, T6K);
		    T6P = VSUB(T6N, T6O);
		    T6E = VSUB(T6C, T6D);
		    T6H = VSUB(T6F, T6G);
		    T6I = VMUL(LDK(KP707106781), VSUB(T6E, T6H));
		    T6Q = VMUL(LDK(KP707106781), VADD(T6E, T6H));
		    T6M = VSUB(T6I, T6L);
		    T7x = VADD(T6P, T6Q);
		    T6R = VSUB(T6P, T6Q);
		    T7w = VADD(T6L, T6I);
	       }
	       {
		    V T2Z, T4A, T37, T4x, T2I, T4y, T3a, T4B, T2T, T36;
		    T2T = VMUL(LDK(KP707106781), VSUB(T2N, T2S));
		    T2Z = VSUB(T2T, T2Y);
		    T4A = VADD(T2Y, T2T);
		    T36 = VMUL(LDK(KP707106781), VADD(T2N, T2S));
		    T37 = VSUB(T35, T36);
		    T4x = VADD(T35, T36);
		    {
			 V T2w, T2H, T38, T39;
			 T2w = VFNMS(LDK(KP382683432), T2v, VMUL(LDK(KP923879532), T2q));
			 T2H = VFMA(LDK(KP923879532), T2B, VMUL(LDK(KP382683432), T2G));
			 T2I = VSUB(T2w, T2H);
			 T4y = VADD(T2w, T2H);
			 T38 = VFMA(LDK(KP382683432), T2q, VMUL(LDK(KP923879532), T2v));
			 T39 = VFNMS(LDK(KP382683432), T2B, VMUL(LDK(KP923879532), T2G));
			 T3a = VSUB(T38, T39);
			 T4B = VADD(T38, T39);
		    }
		    T30 = VSUB(T2I, T2Z);
		    T5n = VSUB(T4x, T4y);
		    T5o = VSUB(T4B, T4A);
		    T3b = VSUB(T37, T3a);
		    T3W = VADD(T2Z, T2I);
		    T4z = VADD(T4x, T4y);
		    T4C = VADD(T4A, T4B);
		    T3X = VADD(T37, T3a);
	       }
	  }
	  /* Load stage: the sixteen inputs with index congruent to 1 mod 4 (same structure as the previous stage). */
	  {
	       V T1z, T6l, T27, T6x, T2e, T6w, T1E, T6m, T1K, T6o, T1W, T6s, T21, T6t, T1P;
	       V T6p;
	       {
		    V T1w, T1y, T1v, T1x;
		    T1v = LD(&(x[WS(ios, 5)]), dist, &(x[WS(ios, 1)]));
		    T1w = BYTW(&(W[TWVL * 8]), T1v);
		    T1x = LD(&(x[WS(ios, 37)]), dist, &(x[WS(ios, 1)]));
		    T1y = BYTW(&(W[TWVL * 72]), T1x);
		    T1z = VSUB(T1w, T1y);
		    T6l = VADD(T1w, T1y);
	       }
	       {
		    V T24, T26, T23, T25;
		    T23 = LD(&(x[WS(ios, 17)]), dist, &(x[WS(ios, 1)]));
		    T24 = BYTW(&(W[TWVL * 32]), T23);
		    T25 = LD(&(x[WS(ios, 49)]), dist, &(x[WS(ios, 1)]));
		    T26 = BYTW(&(W[TWVL * 96]), T25);
		    T27 = VSUB(T24, T26);
		    T6x = VADD(T24, T26);
	       }
	       {
		    V T2b, T2d, T2a, T2c;
		    T2a = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)]));
		    T2b = BYTW(&(W[0]), T2a);
		    T2c = LD(&(x[WS(ios, 33)]), dist, &(x[WS(ios, 1)]));
		    T2d = BYTW(&(W[TWVL * 64]), T2c);
		    T2e = VSUB(T2b, T2d);
		    T6w = VADD(T2b, T2d);
	       }
	       {
		    V T1B, T1D, T1A, T1C;
		    T1A = LD(&(x[WS(ios, 21)]), dist, &(x[WS(ios, 1)]));
		    T1B = BYTW(&(W[TWVL * 40]), T1A);
		    T1C = LD(&(x[WS(ios, 53)]), dist, &(x[WS(ios, 1)]));
		    T1D = BYTW(&(W[TWVL * 104]), T1C);
		    T1E = VSUB(T1B, T1D);
		    T6m = VADD(T1B, T1D);
	       }
	       {
		    V T1H, T1J, T1G, T1I;
		    T1G = LD(&(x[WS(ios, 61)]), dist, &(x[WS(ios, 1)]));
		    T1H = BYTW(&(W[TWVL * 120]), T1G);
		    T1I = LD(&(x[WS(ios, 29)]), dist, &(x[WS(ios, 1)]));
		    T1J = BYTW(&(W[TWVL * 56]), T1I);
		    T1K = VSUB(T1H, T1J);
		    T6o = VADD(T1H, T1J);
	       }
	       {
		    V T1T, T1V, T1S, T1U;
		    T1S = LD(&(x[WS(ios, 9)]), dist, &(x[WS(ios, 1)]));
		    T1T = BYTW(&(W[TWVL * 16]), T1S);
		    T1U = LD(&(x[WS(ios, 41)]), dist, &(x[WS(ios, 1)]));
		    T1V = BYTW(&(W[TWVL * 80]), T1U);
		    T1W = VSUB(T1T, T1V);
		    T6s = VADD(T1T, T1V);
	       }
	       {
		    V T1Y, T20, T1X, T1Z;
		    T1X = LD(&(x[WS(ios, 57)]), dist, &(x[WS(ios, 1)]));
		    T1Y = BYTW(&(W[TWVL * 112]), T1X);
		    T1Z = LD(&(x[WS(ios, 25)]), dist, &(x[WS(ios, 1)]));
		    T20 = BYTW(&(W[TWVL * 48]), T1Z);
		    T21 = VSUB(T1Y, T20);
		    T6t = VADD(T1Y, T20);
	       }
	       {
		    V T1M, T1O, T1L, T1N;
		    T1L = LD(&(x[WS(ios, 13)]), dist, &(x[WS(ios, 1)]));
		    T1M = BYTW(&(W[TWVL * 24]), T1L);
		    T1N = LD(&(x[WS(ios, 45)]), dist, &(x[WS(ios, 1)]));
		    T1O = BYTW(&(W[TWVL * 88]), T1N);
		    T1P = VSUB(T1M, T1O);
		    T6p = VADD(T1M, T1O);
	       }
	       {
		    V T80, T81, T83, T84;
		    T80 = VADD(T6w, T6x);
		    T81 = VADD(T6s, T6t);
		    T82 = VSUB(T80, T81);
		    T8S = VADD(T80, T81);
		    T83 = VADD(T6l, T6m);
		    T84 = VADD(T6o, T6p);
		    T85 = VSUB(T83, T84);
		    T8T = VADD(T83, T84);
	       }
	       {
		    V T6u, T6y, T6r, T6z, T6n, T6q;
		    T6u = VSUB(T6s, T6t);
		    T6y = VSUB(T6w, T6x);
		    T6n = VSUB(T6l, T6m);
		    T6q = VSUB(T6o, T6p);
		    T6r = VMUL(LDK(KP707106781), VSUB(T6n, T6q));
		    T6z = VMUL(LDK(KP707106781), VADD(T6n, T6q));
		    T6v = VSUB(T6r, T6u);
		    T7u = VADD(T6y, T6z);
		    T6A = VSUB(T6y, T6z);
		    T7t = VADD(T6u, T6r);
	       }
	       {
		    V T28, T4t, T2g, T4q, T1R, T4r, T2j, T4u, T22, T2f;
		    T22 = VMUL(LDK(KP707106781), VSUB(T1W, T21));
		    T28 = VSUB(T22, T27);
		    T4t = VADD(T27, T22);
		    T2f = VMUL(LDK(KP707106781), VADD(T1W, T21));
		    T2g = VSUB(T2e, T2f);
		    T4q = VADD(T2e, T2f);
		    {
			 V T1F, T1Q, T2h, T2i;
			 T1F = VFNMS(LDK(KP382683432), T1E, VMUL(LDK(KP923879532), T1z));
			 T1Q = VFMA(LDK(KP923879532), T1K, VMUL(LDK(KP382683432), T1P));
			 T1R = VSUB(T1F, T1Q);
			 T4r = VADD(T1F, T1Q);
			 T2h = VFMA(LDK(KP382683432), T1z, VMUL(LDK(KP923879532), T1E));
			 T2i = VFNMS(LDK(KP382683432), T1K, VMUL(LDK(KP923879532), T1P));
			 T2j = VSUB(T2h, T2i);
			 T4u = VADD(T2h, T2i);
		    }
		    T29 = VSUB(T1R, T28);
		    T5k = VSUB(T4q, T4r);
		    T5l = VSUB(T4u, T4t);
		    T2k = VSUB(T2g, T2j);
		    T3T = VADD(T28, T1R);
		    T4s = VADD(T4q, T4r);
		    T4v = VADD(T4t, T4u);
		    T3U = VADD(T2g, T2j);
	       }
	  }
	  /* Store stage: outputs 0, 16, 32, 48 (output 0 is the plain sum of all twiddled inputs). */
	  {
	       V T9g, T9m, T9j, T9n;
	       {
		    V T9e, T9f, T9h, T9i;
		    T9e = VADD(T93, T94);
		    T9f = VADD(T8Z, T90);
		    T9g = VSUB(T9e, T9f);
		    T9m = VADD(T9e, T9f);
		    T9h = VADD(T8S, T8T);
		    T9i = VADD(T8V, T8W);
		    T9j = VBYI(VSUB(T9h, T9i));
		    T9n = VADD(T9h, T9i);
	       }
	       {
		    V T9k, T9p, T9l, T9o;
		    T9k = VSUB(T9g, T9j);
		    ST(&(x[WS(ios, 48)]), T9k, dist, &(x[0]));
		    T9p = VADD(T9m, T9n);
		    ST(&(x[0]), T9p, dist, &(x[0]));
		    T9l = VADD(T9g, T9j);
		    ST(&(x[WS(ios, 16)]), T9l, dist, &(x[0]));
		    T9o = VSUB(T9m, T9n);
		    ST(&(x[WS(ios, 32)]), T9o, dist, &(x[0]));
	       }
	  }
	  /* Store stage: outputs 8, 24, 40, 56. */
	  {
	       V T92, T9b, T97, T9a;
	       {
		    V T91, T95, T8Y, T96, T8U, T8X;
		    T91 = VSUB(T8Z, T90);
		    T95 = VSUB(T93, T94);
		    T8U = VSUB(T8S, T8T);
		    T8X = VSUB(T8V, T8W);
		    T8Y = VMUL(LDK(KP707106781), VSUB(T8U, T8X));
		    T96 = VMUL(LDK(KP707106781), VADD(T8U, T8X));
		    T92 = VBYI(VSUB(T8Y, T91));
		    T9b = VADD(T95, T96);
		    T97 = VSUB(T95, T96);
		    T9a = VBYI(VADD(T91, T8Y));
	       }
	       {
		    V T98, T9d, T99, T9c;
		    T98 = VADD(T92, T97);
		    ST(&(x[WS(ios, 24)]), T98, dist, &(x[0]));
		    T9d = VSUB(T9b, T9a);
		    ST(&(x[WS(ios, 56)]), T9d, dist, &(x[0]));
		    T99 = VSUB(T97, T92);
		    ST(&(x[WS(ios, 40)]), T99, dist, &(x[0]));
		    T9c = VADD(T9a, T9b);
		    ST(&(x[WS(ios, 8)]), T9c, dist, &(x[0]));
	       }
	  }
	  /* Store stage: outputs 4, 12, 20, 28, 36, 44, 52, 60. */
	  {
	       V T8q, T8O, T8P, T8z, T8C, T8I, T8L, T8D;
	       {
		    V T8p, T8J, T8v, T8G, T8e, T8H, T8y, T8K, T8l, T8u;
		    T8l = VMUL(LDK(KP707106781), VSUB(T8h, T8k));
		    T8p = VSUB(T8l, T8o);
		    T8J = VADD(T8o, T8l);
		    T8u = VMUL(LDK(KP707106781), VADD(T8h, T8k));
		    T8v = VSUB(T8t, T8u);
		    T8G = VADD(T8t, T8u);
		    {
			 V T86, T8d, T8w, T8x;
			 T86 = VFNMS(LDK(KP382683432), T85, VMUL(LDK(KP923879532), T82));
			 T8d = VFMA(LDK(KP923879532), T89, VMUL(LDK(KP382683432), T8c));
			 T8e = VSUB(T86, T8d);
			 T8H = VADD(T86, T8d);
			 T8w = VFMA(LDK(KP382683432), T82, VMUL(LDK(KP923879532), T85));
			 T8x = VFNMS(LDK(KP382683432), T89, VMUL(LDK(KP923879532), T8c));
			 T8y = VSUB(T8w, T8x);
			 T8K = VADD(T8w, T8x);
		    }
		    T8q = VBYI(VSUB(T8e, T8p));
		    T8O = VSUB(T8G, T8H);
		    T8P = VBYI(VSUB(T8K, T8J));
		    T8z = VSUB(T8v, T8y);
		    T8C = VBYI(VADD(T8p, T8e));
		    T8I = VADD(T8G, T8H);
		    T8L = VBYI(VADD(T8J, T8K));
		    T8D = VADD(T8v, T8y);
	       }
	       {
		    V T8A, T8Q, T8R, T8B;
		    T8A = VADD(T8q, T8z);
		    ST(&(x[WS(ios, 20)]), T8A, dist, &(x[0]));
		    T8Q = VSUB(T8O, T8P);
		    ST(&(x[WS(ios, 36)]), T8Q, dist, &(x[0]));
		    T8R = VADD(T8O, T8P);
		    ST(&(x[WS(ios, 28)]), T8R, dist, &(x[0]));
		    T8B = VSUB(T8z, T8q);
		    ST(&(x[WS(ios, 44)]), T8B, dist, &(x[0]));
	       }
	       {
		    V T8E, T8M, T8N, T8F;
		    T8E = VADD(T8C, T8D);
		    ST(&(x[WS(ios, 12)]), T8E, dist, &(x[0]));
		    T8M = VSUB(T8I, T8L);
		    ST(&(x[WS(ios, 60)]), T8M, dist, &(x[0]));
		    T8N = VADD(T8I, T8L);
		    ST(&(x[WS(ios, 4)]), T8N, dist, &(x[0]));
		    T8F = VSUB(T8D, T8C);
		    ST(&(x[WS(ios, 52)]), T8F, dist, &(x[0]));
	       }
	  }
	  /* Store stage: outputs 2, 14, 18, 30, 34, 46, 50, 62. */
	  {
	       V T7A, T7W, T7X, T7H, T7K, T7Q, T7T, T7L;
	       {
		    V T7s, T7R, T7G, T7S, T7z, T7O, T7D, T7P;
		    {
			 V T7q, T7r, T7E, T7F;
			 T7q = VADD(T5W, T63);
			 T7r = VADD(T6V, T6W);
			 T7s = VSUB(T7q, T7r);
			 T7R = VADD(T7q, T7r);
			 T7E = VFNMS(LDK(KP195090322), T7t, VMUL(LDK(KP980785280), T7u));
			 T7F = VFMA(LDK(KP195090322), T7w, VMUL(LDK(KP980785280), T7x));
			 T7G = VSUB(T7E, T7F);
			 T7S = VADD(T7E, T7F);
		    }
		    {
			 V T7v, T7y, T7B, T7C;
			 T7v = VFMA(LDK(KP980785280), T7t, VMUL(LDK(KP195090322), T7u));
			 T7y = VFNMS(LDK(KP195090322), T7x, VMUL(LDK(KP980785280), T7w));
			 T7z = VSUB(T7v, T7y);
			 T7O = VADD(T7v, T7y);
			 T7B = VADD(T6b, T6i);
			 T7C = VADD(T71, T6Y);
			 T7D = VSUB(T7B, T7C);
			 T7P = VADD(T7C, T7B);
		    }
		    T7A = VADD(T7s, T7z);
		    T7W = VBYI(VADD(T7P, T7O));
		    T7X = VADD(T7R, T7S);
		    T7H = VBYI(VADD(T7D, T7G));
		    T7K = VSUB(T7s, T7z);
		    T7Q = VBYI(VSUB(T7O, T7P));
		    T7T = VSUB(T7R, T7S);
		    T7L = VBYI(VSUB(T7G, T7D));
	       }
	       {
		    V T7I, T7Y, T7Z, T7J;
		    T7I = VSUB(T7A, T7H);
		    ST(&(x[WS(ios, 50)]), T7I, dist, &(x[0]));
		    T7Y = VADD(T7W, T7X);
		    ST(&(x[WS(ios, 2)]), T7Y, dist, &(x[0]));
		    T7Z = VSUB(T7X, T7W);
		    ST(&(x[WS(ios, 62)]), T7Z, dist, &(x[0]));
		    T7J = VADD(T7A, T7H);
		    ST(&(x[WS(ios, 14)]), T7J, dist, &(x[0]));
	       }
	       {
		    V T7M, T7U, T7V, T7N;
		    T7M = VSUB(T7K, T7L);
		    ST(&(x[WS(ios, 46)]), T7M, dist, &(x[0]));
		    T7U = VADD(T7Q, T7T);
		    ST(&(x[WS(ios, 30)]), T7U, dist, &(x[0]));
		    T7V = VSUB(T7T, T7Q);
		    ST(&(x[WS(ios, 34)]), T7V, dist, &(x[0]));
		    T7N = VADD(T7K, T7L);
		    ST(&(x[WS(ios, 18)]), T7N, dist, &(x[0]));
	       }
	  }
	  /* Store stage: outputs 5, 11, 21, 27, 37, 43, 53, 59. */
	  {
	       V T3e, T3M, T3N, T3x, T3A, T3G, T3J, T3B;
	       {
		    V T1u, T3H, T3w, T3I, T3d, T3E, T3t, T3F;
		    {
			 V TE, T1t, T3u, T3v;
			 TE = VSUB(Tg, TD);
			 T1t = VSUB(T13, T1s);
			 T1u = VSUB(TE, T1t);
			 T3H = VADD(TE, T1t);
			 T3u = VFNMS(LDK(KP471396736), T29, VMUL(LDK(KP881921264), T2k));
			 T3v = VFMA(LDK(KP471396736), T30, VMUL(LDK(KP881921264), T3b));
			 T3w = VSUB(T3u, T3v);
			 T3I = VADD(T3u, T3v);
		    }
		    {
			 V T2l, T3c, T3h, T3s;
			 T2l = VFMA(LDK(KP881921264), T29, VMUL(LDK(KP471396736), T2k));
			 T3c = VFNMS(LDK(KP471396736), T3b, VMUL(LDK(KP881921264), T30));
			 T3d = VSUB(T2l, T3c);
			 T3E = VADD(T2l, T3c);
			 T3h = VSUB(T3f, T3g);
			 T3s = VSUB(T3k, T3r);
			 T3t = VSUB(T3h, T3s);
			 T3F = VADD(T3s, T3h);
		    }
		    T3e = VADD(T1u, T3d);
		    T3M = VBYI(VADD(T3F, T3E));
		    T3N = VADD(T3H, T3I);
		    T3x = VBYI(VADD(T3t, T3w));
		    T3A = VSUB(T1u, T3d);
		    T3G = VBYI(VSUB(T3E, T3F));
		    T3J = VSUB(T3H, T3I);
		    T3B = VBYI(VSUB(T3w, T3t));
	       }
	       {
		    V T3y, T3O, T3P, T3z;
		    T3y = VSUB(T3e, T3x);
		    ST(&(x[WS(ios, 53)]), T3y, dist, &(x[WS(ios, 1)]));
		    T3O = VADD(T3M, T3N);
		    ST(&(x[WS(ios, 5)]), T3O, dist, &(x[WS(ios, 1)]));
		    T3P = VSUB(T3N, T3M);
		    ST(&(x[WS(ios, 59)]), T3P, dist, &(x[WS(ios, 1)]));
		    T3z = VADD(T3e, T3x);
		    ST(&(x[WS(ios, 11)]), T3z, dist, &(x[WS(ios, 1)]));
	       }
	       {
		    V T3C, T3K, T3L, T3D;
		    T3C = VSUB(T3A, T3B);
		    ST(&(x[WS(ios, 43)]), T3C, dist, &(x[WS(ios, 1)]));
		    T3K = VADD(T3G, T3J);
		    ST(&(x[WS(ios, 27)]), T3K, dist, &(x[WS(ios, 1)]));
		    T3L = VSUB(T3J, T3G);
		    ST(&(x[WS(ios, 37)]), T3L, dist, &(x[WS(ios, 1)]));
		    T3D = VADD(T3A, T3B);
		    ST(&(x[WS(ios, 21)]), T3D, dist, &(x[WS(ios, 1)]));
	       }
	  }
	  /* Store stage: outputs 1, 15, 17, 31, 33, 47, 49, 63. */
	  {
	       V T4Q, T5g, T5h, T51, T54, T5a, T5d, T55;
	       {
		    V T4E, T59, T50, T5c, T4P, T5b, T4X, T58;
		    {
			 V T4w, T4D, T4Y, T4Z;
			 T4w = VFNMS(LDK(KP098017140), T4v, VMUL(LDK(KP995184726), T4s));
			 T4D = VFMA(LDK(KP995184726), T4z, VMUL(LDK(KP098017140), T4C));
			 T4E = VSUB(T4w, T4D);
			 T59 = VADD(T4w, T4D);
			 T4Y = VFMA(LDK(KP098017140), T4s, VMUL(LDK(KP995184726), T4v));
			 T4Z = VFNMS(LDK(KP098017140), T4z, VMUL(LDK(KP995184726), T4C));
			 T50 = VSUB(T4Y, T4Z);
			 T5c = VADD(T4Y, T4Z);
		    }
		    {
			 V T4L, T4O, T4T, T4W;
			 T4L = VADD(T4H, T4K);
			 T4O = VADD(T4M, T4N);
			 T4P = VSUB(T4L, T4O);
			 T5b = VADD(T4O, T4L);
			 T4T = VADD(T4R, T4S);
			 T4W = VADD(T4U, T4V);
			 T4X = VSUB(T4T, T4W);
			 T58 = VADD(T4T, T4W);
		    }
		    T4Q = VBYI(VSUB(T4E, T4P));
		    T5g = VSUB(T58, T59);
		    T5h = VBYI(VSUB(T5c, T5b));
		    T51 = VSUB(T4X, T50);
		    T54 = VBYI(VADD(T4P, T4E));
		    T5a = VADD(T58, T59);
		    T5d = VBYI(VADD(T5b, T5c));
		    T55 = VADD(T4X, T50);
	       }
	       {
		    V T52, T5i, T5j, T53;
		    T52 = VADD(T4Q, T51);
		    ST(&(x[WS(ios, 17)]), T52, dist, &(x[WS(ios, 1)]));
		    T5i = VSUB(T5g, T5h);
		    ST(&(x[WS(ios, 33)]), T5i, dist, &(x[WS(ios, 1)]));
		    T5j = VADD(T5g, T5h);
		    ST(&(x[WS(ios, 31)]), T5j, dist, &(x[WS(ios, 1)]));
		    T53 = VSUB(T51, T4Q);
		    ST(&(x[WS(ios, 47)]), T53, dist, &(x[WS(ios, 1)]));
	       }
	       {
		    V T56, T5e, T5f, T57;
		    T56 = VADD(T54, T55);
		    ST(&(x[WS(ios, 15)]), T56, dist, &(x[WS(ios, 1)]));
		    T5e = VSUB(T5a, T5d);
		    ST(&(x[WS(ios, 63)]), T5e, dist, &(x[WS(ios, 1)]));
		    T5f = VADD(T5a, T5d);
		    ST(&(x[WS(ios, 1)]), T5f, dist, &(x[WS(ios, 1)]));
		    T57 = VSUB(T55, T54);
		    ST(&(x[WS(ios, 49)]), T57, dist, &(x[WS(ios, 1)]));
	       }
	  }
	  /* Store stage: outputs 6, 10, 22, 26, 38, 42, 54, 58. */
	  {
	       V T6U, T7m, T7n, T77, T7a, T7g, T7j, T7b;
	       {
		    V T6k, T7h, T76, T7i, T6T, T7e, T73, T7f;
		    {
			 V T64, T6j, T74, T75;
			 T64 = VSUB(T5W, T63);
			 T6j = VSUB(T6b, T6i);
			 T6k = VSUB(T64, T6j);
			 T7h = VADD(T64, T6j);
			 T74 = VFNMS(LDK(KP555570233), T6v, VMUL(LDK(KP831469612), T6A));
			 T75 = VFMA(LDK(KP555570233), T6M, VMUL(LDK(KP831469612), T6R));
			 T76 = VSUB(T74, T75);
			 T7i = VADD(T74, T75);
		    }
		    {
			 V T6B, T6S, T6X, T72;
			 T6B = VFMA(LDK(KP831469612), T6v, VMUL(LDK(KP555570233), T6A));
			 T6S = VFNMS(LDK(KP555570233), T6R, VMUL(LDK(KP831469612), T6M));
			 T6T = VSUB(T6B, T6S);
			 T7e = VADD(T6B, T6S);
			 T6X = VSUB(T6V, T6W);
			 T72 = VSUB(T6Y, T71);
			 T73 = VSUB(T6X, T72);
			 T7f = VADD(T72, T6X);
		    }
		    T6U = VADD(T6k, T6T);
		    T7m = VBYI(VADD(T7f, T7e));
		    T7n = VADD(T7h, T7i);
		    T77 = VBYI(VADD(T73, T76));
		    T7a = VSUB(T6k, T6T);
		    T7g = VBYI(VSUB(T7e, T7f));
		    T7j = VSUB(T7h, T7i);
		    T7b = VBYI(VSUB(T76, T73));
	       }
	       {
		    V T78, T7o, T7p, T79;
		    T78 = VSUB(T6U, T77);
		    ST(&(x[WS(ios, 54)]), T78, dist, &(x[0]));
		    T7o = VADD(T7m, T7n);
		    ST(&(x[WS(ios, 6)]), T7o, dist, &(x[0]));
		    T7p = VSUB(T7n, T7m);
		    ST(&(x[WS(ios, 58)]), T7p, dist, &(x[0]));
		    T79 = VADD(T6U, T77);
		    ST(&(x[WS(ios, 10)]), T79, dist, &(x[0]));
	       }
	       {
		    V T7c, T7k, T7l, T7d;
		    T7c = VSUB(T7a, T7b);
		    ST(&(x[WS(ios, 42)]), T7c, dist, &(x[0]));
		    T7k = VADD(T7g, T7j);
		    ST(&(x[WS(ios, 26)]), T7k, dist, &(x[0]));
		    T7l = VSUB(T7j, T7g);
		    ST(&(x[WS(ios, 38)]), T7l, dist, &(x[0]));
		    T7d = VADD(T7a, T7b);
		    ST(&(x[WS(ios, 22)]), T7d, dist, &(x[0]));
	       }
	  }
	  /* Store stage: outputs 3, 13, 19, 29, 35, 45, 51, 61. */
	  {
	       V T40, T4m, T4n, T47, T4a, T4g, T4j, T4b;
	       {
		    V T3S, T4h, T46, T4i, T3Z, T4e, T43, T4f;
		    {
			 V T3Q, T3R, T44, T45;
			 T3Q = VADD(Tg, TD);
			 T3R = VADD(T3f, T3g);
			 T3S = VSUB(T3Q, T3R);
			 T4h = VADD(T3Q, T3R);
			 T44 = VFNMS(LDK(KP290284677), T3T, VMUL(LDK(KP956940335), T3U));
			 T45 = VFMA(LDK(KP290284677), T3W, VMUL(LDK(KP956940335), T3X));
			 T46 = VSUB(T44, T45);
			 T4i = VADD(T44, T45);
		    }
		    {
			 V T3V, T3Y, T41, T42;
			 T3V = VFMA(LDK(KP956940335), T3T, VMUL(LDK(KP290284677), T3U));
			 T3Y = VFNMS(LDK(KP290284677), T3X, VMUL(LDK(KP956940335), T3W));
			 T3Z = VSUB(T3V, T3Y);
			 T4e = VADD(T3V, T3Y);
			 T41 = VADD(T13, T1s);
			 T42 = VADD(T3r, T3k);
			 T43 = VSUB(T41, T42);
			 T4f = VADD(T42, T41);
		    }
		    T40 = VADD(T3S, T3Z);
		    T4m = VBYI(VADD(T4f, T4e));
		    T4n = VADD(T4h, T4i);
		    T47 = VBYI(VADD(T43, T46));
		    T4a = VSUB(T3S, T3Z);
		    T4g = VBYI(VSUB(T4e, T4f));
		    T4j = VSUB(T4h, T4i);
		    T4b = VBYI(VSUB(T46, T43));
	       }
	       {
		    V T48, T4o, T4p, T49;
		    T48 = VSUB(T40, T47);
		    ST(&(x[WS(ios, 51)]), T48, dist, &(x[WS(ios, 1)]));
		    T4o = VADD(T4m, T4n);
		    ST(&(x[WS(ios, 3)]), T4o, dist, &(x[WS(ios, 1)]));
		    T4p = VSUB(T4n, T4m);
		    ST(&(x[WS(ios, 61)]), T4p, dist, &(x[WS(ios, 1)]));
		    T49 = VADD(T40, T47);
		    ST(&(x[WS(ios, 13)]), T49, dist, &(x[WS(ios, 1)]));
	       }
	       {
		    V T4c, T4k, T4l, T4d;
		    T4c = VSUB(T4a, T4b);
		    ST(&(x[WS(ios, 45)]), T4c, dist, &(x[WS(ios, 1)]));
		    T4k = VADD(T4g, T4j);
		    ST(&(x[WS(ios, 29)]), T4k, dist, &(x[WS(ios, 1)]));
		    T4l = VSUB(T4j, T4g);
		    ST(&(x[WS(ios, 35)]), T4l, dist, &(x[WS(ios, 1)]));
		    T4d = VADD(T4a, T4b);
		    ST(&(x[WS(ios, 19)]), T4d, dist, &(x[WS(ios, 1)]));
	       }
	  }
	  /* Store stage: outputs 7, 9, 23, 25, 39, 41, 55, 57. */
	  {
	       V T5u, T5Q, T5R, T5B, T5E, T5K, T5N, T5F;
	       {
		    V T5q, T5J, T5A, T5M, T5t, T5L, T5x, T5I;
		    {
			 V T5m, T5p, T5y, T5z;
			 T5m = VFNMS(LDK(KP634393284), T5l, VMUL(LDK(KP773010453), T5k));
			 T5p = VFMA(LDK(KP773010453), T5n, VMUL(LDK(KP634393284), T5o));
			 T5q = VSUB(T5m, T5p);
			 T5J = VADD(T5m, T5p);
			 T5y = VFMA(LDK(KP634393284), T5k, VMUL(LDK(KP773010453), T5l));
			 T5z = VFNMS(LDK(KP634393284), T5n, VMUL(LDK(KP773010453), T5o));
			 T5A = VSUB(T5y, T5z);
			 T5M = VADD(T5y, T5z);
		    }
		    {
			 V T5r, T5s, T5v, T5w;
			 T5r = VSUB(T4U, T4V);
			 T5s = VSUB(T4N, T4M);
			 T5t = VSUB(T5r, T5s);
			 T5L = VADD(T5s, T5r);
			 T5v = VSUB(T4R, T4S);
			 T5w = VSUB(T4H, T4K);
			 T5x = VSUB(T5v, T5w);
			 T5I = VADD(T5v, T5w);
		    }
		    T5u = VBYI(VSUB(T5q, T5t));
		    T5Q = VSUB(T5I, T5J);
		    T5R = VBYI(VSUB(T5M, T5L));
		    T5B = VSUB(T5x, T5A);
		    T5E = VBYI(VADD(T5t, T5q));
		    T5K = VADD(T5I, T5J);
		    T5N = VBYI(VADD(T5L, T5M));
		    T5F = VADD(T5x, T5A);
	       }
	       {
		    V T5C, T5S, T5T, T5D;
		    T5C = VADD(T5u, T5B);
		    ST(&(x[WS(ios, 23)]), T5C, dist, &(x[WS(ios, 1)]));
		    T5S = VSUB(T5Q, T5R);
		    ST(&(x[WS(ios, 39)]), T5S, dist, &(x[WS(ios, 1)]));
		    T5T = VADD(T5Q, T5R);
		    ST(&(x[WS(ios, 25)]), T5T, dist, &(x[WS(ios, 1)]));
		    T5D = VSUB(T5B, T5u);
		    ST(&(x[WS(ios, 41)]), T5D, dist, &(x[WS(ios, 1)]));
	       }
	       {
		    V T5G, T5O, T5P, T5H;
		    T5G = VADD(T5E, T5F);
		    ST(&(x[WS(ios, 9)]), T5G, dist, &(x[WS(ios, 1)]));
		    T5O = VSUB(T5K, T5N);
		    ST(&(x[WS(ios, 57)]), T5O, dist, &(x[WS(ios, 1)]));
		    T5P = VADD(T5K, T5N);
		    ST(&(x[WS(ios, 7)]), T5P, dist, &(x[WS(ios, 1)]));
		    T5H = VSUB(T5F, T5E);
		    ST(&(x[WS(ios, 55)]), T5H, dist, &(x[WS(ios, 1)]));
	       }
	  }
     }
     END_SIMD();
     /* W has been advanced by TWVL * 126 for every loop pass executed. */
     return W;
}

static const tw_instr twinstr[] = {
     VTW(1),
     VTW(2),
     VTW(3),
     VTW(4),
     VTW(5),
     VTW(6),
     VTW(7),
     VTW(8),
     VTW(9),
     VTW(10),
     VTW(11),
     VTW(12),
     VTW(13),
     VTW(14),
     VTW(15),
     VTW(16),
     VTW(17),
     VTW(18),
     VTW(19),
     VTW(20),
     VTW(21),
     VTW(22),
     VTW(23),
     VTW(24),
     VTW(25),
     VTW(26),
     VTW(27),
     VTW(28),
     VTW(29),
     VTW(30),
     VTW(31),
     VTW(32),
     VTW(33),
     VTW(34),
     VTW(35),
     VTW(36),
     VTW(37),
     VTW(38),
     VTW(39),
     VTW(40),
     VTW(41),
     VTW(42),
     VTW(43),
     VTW(44),
     VTW(45),
     VTW(46),
     VTW(47),
     VTW(48),
     VTW(49),
     VTW(50),
     VTW(51),
     VTW(52),
     VTW(53),
     VTW(54),
     VTW(55),
     VTW(56),
     VTW(57),
     VTW(58),
     VTW(59),
     VTW(60),
     VTW(61),
     VTW(62),
     VTW(63),
     {TW_NEXT, VL, 0}
};

/*
 * Descriptor handed to the planner together with t1bv_64.  The braced
 * quadruple repeats the operation counts quoted in the generator comment
 * near the top of this file (467 additions, 198 multiplications,
 * 52 fused multiply/adds).
 */
static const ct_desc desc = {
     64,
     "t1bv_64",
     twinstr,
     {467, 198, 52, 0},
     &GENUS,
     0, 0, 0
};

/* Registers the t1bv_64 codelet and its descriptor with planner p. */
void X(codelet_t1bv_64)(planner *p)
{
     X(kdft_dit_register)(p, t1bv_64, &desc);
}
