//
//! \file libecc/polynomial.h
//! \brief Field of Polynomials with binary coefficients and fixed reduction polynomial.
//!
//! This header file declares <code>template<unsigned int m, unsigned int k> class libecc::polynomial</code>,
//! representing the polynomials with binary coefficients, of a finite field with fixed reduction polynomial
//! \f$t^m + t^k + 1 = 0\f$.
//
// This file is part of the libecc package.
// Copyright (C) 2002, by
//
// Carlo Wood, Run on IRC <carlo@alinoe.com>
// RSA-1024 0x624ACAD5 1997-01-26                    Sign & Encrypt
// Fingerprint16 = 32 EC A7 B6 AC DB 65 A6  F6 F6 55 DD 1C DC FF 61
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
// 
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//

#ifndef LIBECC_POLYNOMIAL_H
#define LIBECC_POLYNOMIAL_H

#include <libecc/bitset.h>
#if ECC_DEBUGOUTPUT
#include <libcw/cwprint.h>
#endif

namespace libecc {

/**\class polynomial
 * \brief Polynomial representation of the Galois Field \f$GF(2^m)/t^m+t^k+1\f$.
 *
 * This class represents a polynomial with binary coefficients (0 or 1)
 * for a finite field with the fixed reduction polynomial: \f$t^m + t^k + 1 = 0\f$.
 *
 * <H5>Theory</H5>
 *
 * The well known complex number \f$i\f$ is defined by the equation \f$i^2+1 = 0\f$.&nbsp;
 * Likewise, it is possible to define a \f$t\f$ by means of a so called reduction polynomial: \f$t^m + t^k + 1 = 0\f$.&nbsp;
 * In order to be sure that all roots of this equation are of degree \f$m\f$, this polynomial must be irreducible.&nbsp;
 * Irreducible means that it is not possible to write the polynomial as a product of two polynomials of lesser degree.&nbsp;
 * For example, consider the equation
 * \f$t^8 + t^6 + t^5 + t^3 + t^2 + 1 = 0.\ \f$
 * Then \f$t\f$ isn't very special as it could still be an ordinary complex number:
 * \f$t^8 + t^6 + t^5 + t^3 + t^2 + 1 = (t^6 + t^3 + 1) (t^2 + 1)\f$ which has, among others, \f$\pm i\f$ as roots.
 *
 * Apart from the reduction polynomial, the field elements are further restricted to polynomials with binary coefficients.&nbsp;
 * This is reflected in the fact that \f$a t^n + b t^n = (a + b \pmod{2}) t^n\f$, in other words that \f$t^n + t^n = 0\f$.&nbsp;
 * As a result, all polynomials will be of a degree less than \f$m\f$ and the total number of different polynomials will be finite.&nbsp;
 *
 * [ Note: if the coefficients of a given polynomial F(t) are done modulo some integer p, then we say: the polynomial is
 * defined <i>over</i> GF(p).&nbsp; GF stands for Galois Field and means that it is a finite field (of p elements thus).&nbsp;
 * The characteristics of a finite field are completely determined by the number of its elements, hence that p is also called
 * 'the characteristic' of the field.&nbsp;
 * For any two field representations with the same characteristic, it is possible to map the elements of the two representations
 * one on one upon each other without loss of generality.&nbsp;
 * The complex number i, as defined by \f$i^2+1 = 0\f$, is <em>not</em> defined over GF(2) of course.&nbsp;
 * The solutions of t of \f$t^2+1 = 0\f$ over GF(2) are +1, -1, i and -i. ]
 *
 * <b>Example</b>
 *
 * Let the reduction polynomial be of degree \f$m = 4\f$.<br>
 * Then the total number of field elements is \f$2^4 = 16\f$:
 *
 * \f$0 \cdot t^3 + 0 \cdot t^2 + 0 \cdot t^1 + 0 \cdot t^0 = 0\f$<br>
 * \f$0 \cdot t^3 + 0 \cdot t^2 + 0 \cdot t^1 + 1 \cdot t^0 = 1\f$<br>
 * \f$0 \cdot t^3 + 0 \cdot t^2 + 1 \cdot t^1 + 0 \cdot t^0 = t\f$<br>
 * \f$0 \cdot t^3 + 0 \cdot t^2 + 1 \cdot t^1 + 1 \cdot t^0 = t + 1\f$<br>
 * \f$0 \cdot t^3 + 1 \cdot t^2 + 0 \cdot t^1 + 0 \cdot t^0 = t^2\f$<br>
 * \f$0 \cdot t^3 + 1 \cdot t^2 + 0 \cdot t^1 + 1 \cdot t^0 = t^2 + 1\f$<br>
 * \f$0 \cdot t^3 + 1 \cdot t^2 + 1 \cdot t^1 + 0 \cdot t^0 = t^2 + t\f$<br>
 * \f$0 \cdot t^3 + 1 \cdot t^2 + 1 \cdot t^1 + 1 \cdot t^0 = t^2 + t + 1\f$<br>
 * \f$1 \cdot t^3 + 0 \cdot t^2 + 0 \cdot t^1 + 0 \cdot t^0 = t^3\f$<br>
 * \f$1 \cdot t^3 + 0 \cdot t^2 + 0 \cdot t^1 + 1 \cdot t^0 = t^3 + 1\f$<br>
 * \f$1 \cdot t^3 + 0 \cdot t^2 + 1 \cdot t^1 + 0 \cdot t^0 = t^3 + t\f$<br>
 * \f$1 \cdot t^3 + 0 \cdot t^2 + 1 \cdot t^1 + 1 \cdot t^0 = t^3 + t + 1\f$<br>
 * \f$1 \cdot t^3 + 1 \cdot t^2 + 0 \cdot t^1 + 0 \cdot t^0 = t^3 + t^2\f$<br>
 * \f$1 \cdot t^3 + 1 \cdot t^2 + 0 \cdot t^1 + 1 \cdot t^0 = t^3 + t^2 + 1\f$<br>
 * \f$1 \cdot t^3 + 1 \cdot t^2 + 1 \cdot t^1 + 0 \cdot t^0 = t^3 + t^2 + t\f$<br>
 * \f$1 \cdot t^3 + 1 \cdot t^2 + 1 \cdot t^1 + 1 \cdot t^0 = t^3 + t^2 + t + 1\f$
 *
 * Polynomials of a higher degree do not exist because we can always
 * write those as a polynomial of degree 3 or less.&nbsp;
 * For example, let the (irreducible) reduction polynomial of degree 4 be \f$t^4+t+1=0\f$.&nbsp;
 * Now suppose \f$t^5\f$ would be a field element, then we can simply prove
 * that this polynomial is equivalent with one of the sixteen listed above by using the
 * reduction polynomial: \f$t^5 = t \cdot t^4 = t (t^4 + 0) = t (t^4 + t^4 + t + 1) = t (t + 1) = t^2 + t\f$.
 *
 * The polynomials can thus be represented by a bitset of \f$m\f$ bits.&nbsp;
 * Each bit in turn then represents a coefficient of the polynomial.&nbsp;
 * This is very convenient for a computer.
 *
 * Furthermore, because the algebra of the coefficients of the polynomials has to
 * be done modulo two (as per our definition), addition (and subtraction) of two polynomials is equivalent
 * to the exclusive-or of those bitsets.&nbsp;
 * For example, \f$(t^2 + t) + (t^2 + 1) =t + 1\f$, which is equivalent with 0110 xor 0101 = 0011.
 *
 * Also multiplication can be implemented rather easily.&nbsp;
 * Because \f$t^p (t^2 + 1) = t^{2+p} + t^p\f$
 * it is obvious that multiplication with a single power of \f$t\f$ is just a left shift.&nbsp;
 * We <em>do</em> have to take into account the reduction polynomial though - when the degree of the resulting
 * polynomial becomes too large.
 *
 * Multiplication of two arbitrary polynomials consists of a few shifts and exclusive-or
 * operations, followed by reduction with the reduction polynomial if needed.
 *
 * For example,
 *
 * \f$t^4+t+1 \equiv\f$ 10011 (reduction polynomial)<br>
 * \f$t^3 + t^2 + 1 \equiv\f$ 1101<br>
 * \f$t^2 + t + 1 \equiv\f$ 0111
 *
 * <pre>
 *     1101
 *     0111 *
 *     ------
 *     1101
 *    11010
 *   110100 +  (addition is exclusive-or!)
 *     ------
 *   100011
 *   10011  -  (t * reduction_polynomial)
 *     ------
 *      101
 * </pre>
 * And thus \f$(t^3 + t^2 + 1) (t^2 + t + 1) = t^2 + 1\f$.
 *
 * The reduction polynomial could be any irreducible polynomial, but by using a trinomial
 * (a polynomial with only three terms), reduction can be implemented much faster.&nbsp;
 * The power of the second term (\f$k\f$) should be chosen as small as possible because
 * also that will speed up the process of reduction.&nbsp;
 * Therefore, for a given \f$m\f$, one should use the smallest \f$k\f$
 * for which \f$t^m + t^k + 1\f$ is irreducible.
 *
 * The reason that the reduction polynomial has to be irreducible is to ensure that the
 * elements of the field indeed form a <a href="http://planetmath.org/encyclopedia/Field.html">Field</a>.
 * First of all, multiplication of field elements should be commutative, that is \f$ab = ba\f$
 * for any \f$a\f$ and \f$b\f$ element of the field.
 * Now suppose that we would use a reduction polynomial that could be written as \f$P \times Q\f$
 * where the (irreducible) polynomials \f$P\f$ and \f$Q\f$ are of respectively degrees \f$p\f$
 * and \f$q\f$ with \f$0 < p \leq q < p+q = m\f$.
 * Then \f$P\f$ and \f$Q\f$ are elements of the "field" (their degree is less than \f$m\f$).
 * Furthermore, there should exist a \f$R\f$ such that \f$R \times P = 1\f$ (\f$R\f$ is the
 * multiplicative inverse of \f$P\f$).&nbsp;
 * Then note that \f$Q = (R \times P) \times Q \neq R \times (P \times Q) = 0\f$!&nbsp;
 * If the reduction polynomial is not irreducible then we don't <em>have</em> a field thus!
 *
 * <b>Finding an irreducible trinomial</b>
 *
 * An irreducible trinomial is a polynomial of three terms, \f$F(t) = t^m + t^k + 1\f$
 * which cannot be written as the product of two polynomials of lower degree.&nbsp;
 * Finding an irreducible polynomial is based on the theorem that \f$t^{(2^m)} - t\f$
 * is the product of all irreducible polynomials whose degree divides m.
 *
 * When \f$m\f$ is prime then all we need to do in order to find the smallest \f$k\f$ is trying all values of
 * \f$k\f$, starting at \f$1\f$, until we find that \f$t^{(2^m)}-t=0 \pmod{F(t)}\f$, or in other words
 * that \f$t^{(2^m-1)} = 1 \pmod{F(t)}\f$.&nbsp;
 * If it equals \f$1\f$ then the trinomial \f$F(t)\f$ is irreducible.&nbsp;
 * [ When \f$m\f$ is not prime then we also need to check that for each divisor \f$d\f$ of \f$m\f$
 * (\f$1 \leq d < m\f$) that \f$\gcd(F(t), t^{(2^d)}-t) = 1\f$ ].
 *
 * More in general, let \f$a\f$ be a polynomial element of the field, and let \f$n\f$ be the
 * smallest positive integer for which \f$a^n=1 \pmod{F(t)}\f$, then
 * \f$n\f$ is called the <em>order</em> of \f$a\f$.&nbsp;
 * All the polynomials \f$a^j\f$ that can be formed by running \f$j\f$ over the values \f$0\f$ till \f$n-1\f$:
 * \f$\{ 1, a, a^2, a^3, ... a^{n-1} \}\f$
 * form a group of \f$n\f$ polynomials that can be <em>generated</em> by \f$a\f$.
 *
 * A polynomial of order \f$2^m-1\f$ is called a <em>generator</em> as it generates <em>all</em>
 * elements of the field except \f$0\f$.&nbsp;
 * If \f$t\f$ is generator of a given polynomial field over \f$Z_2[t]/(F(t))\f$, then the reduction polynomial \f$F(t)\f$
 * is called <em>primitive</em>, which is more restrictive than irreducible.&nbsp; It is very likely that an arbitrary
 * irreducible polynomial of prime degree is also primitive (and, for large m, even when the degree is not prime):
 * by far most irreducible trinomials of prime degree are also primitive.
 *
 * This is a nice moment to note the equivalence with the Random Number Generator (see \ref libecc::rng).&nbsp;
 * What we have there is a shift register with feedback points using exclusive-or; this is exactly equivalent
 * with multiplying with t and using a reduction polynomial!&nbsp;
 * Consider a RNG with 3 bits and a feedback point <em>to</em> bit 0 and 1:
 *
 * <table>
 * <tr>
 * <td>RNG state</td>
 * <td>&nbsp;</td>
 * <td>Polynomial</td>
 * <td>&nbsp;</td>
 * <td>Reduction of \f$t^n\f$ with \f$t^3+t+1=0\f$</td>
 * <td></td>
 * </tr><tr>
 * <td><tt>001</tt></td>
 * <td>\f$\equiv\f$</td>
 * <td>\f$1\f$</td>
 * <td>\f$=\f$</td>
 * <td>\f$t^0\f$</td>
 * </tr><tr>
 * <td><tt>010</tt></td>
 * <td>\f$\equiv\f$</td>
 * <td>\f$t\f$</td>
 * <td>\f$=\f$</td>
 * <td>\f$t^1\f$</td>
 * </tr><tr>
 * <td><tt>100</tt></td>
 * <td>\f$\equiv\f$</td>
 * <td>\f$t^2\f$</td>
 * <td>\f$=\f$</td>
 * <td>\f$t^2\f$</td>
 * </tr><tr>
 * <td><tt>011</tt></td>
 * <td>\f$\equiv\f$</td>
 * <td>\f$t+1\f$</td>
 * <td>\f$=\f$</td>
 * <td>\f$t^3+(t^3+t+1)=t+1\f$</td>
 * </tr><tr>
 * <td><tt>110</tt></td>
 * <td>\f$\equiv\f$</td>
 * <td>\f$t^2+t\f$</td>
 * <td>\f$=\f$</td>
 * <td>\f$t^4+(t^3+t+1)t=t^4+(t^4+t^2+t)=t^2+t\f$</td>
 * </tr><tr>
 * <td><tt>111</tt></td>
 * <td>\f$\equiv\f$</td>
 * <td>\f$t^2+t+1\f$</td>
 * <td>\f$=\f$</td>
 * <td>\f$t^5+(t^3+t+1)(t^2+1)=t^5+(t^5+t^3+t^2)+(t^3+t+1)=t^2+t+1\f$</td>
 * </tr><tr>
 * <td><tt>101</tt></td>
 * <td>\f$\equiv\f$</td>
 * <td>\f$t^2+1\f$</td>
 * <td>\f$=\f$</td>
 * <td>\f$t^6+(t^3+t+1)(t^3+t+1)=t^6+(t^6+t^4+t^3)+(t^4+t^2+t)+(t^3+t+1)=t^2+1\f$</td>
 * </tr><tr>
 * <td><tt>001</tt></td>
 * <td>\f$\equiv\f$</td>
 * <td>\f$1\f$</td>
 * <td>\f$=\f$</td>
 * <td>\f$t^7+(t^3+t+1)(t^4+t^2+t+1)=1\f$</td>
 * </tr>
 * </table>
 *
 * The state of the random number generator can be seen as a polynomial over \f$Z_2\f$ with a reduction polynomial
 * that is determined by the feedback points.&nbsp; Each step then corresponds with multiplying the polynomial with \f$t\f$.&nbsp;
 * The period of the RNG will be equal to the order of \f$t\f$, having a maximum value of \f$2^m-1\f$ when the
 * reduction polynomial is primitive.
 *
 * In order to check if a given irreducible polynomial is primitive, one needs to know the factorization of \f${2^m}-1\f$.
 * After all, even while \f$t^{({2^m}-1)}=1 \pmod{F(t)}\f$, then \f${2^m}-1\f$ is not necessarily the smallest positive
 * integer \f$n\f$ for which \f$t^n = 1 \pmod{F(t)}\f$.&nbsp;
 * For example, when \f${2^m}-1 = p_1 p_2 p_3\f$ and the order of \f$t\f$ is \f$p_1 p_3\f$ so
 * that \f$t^{(p_1 p_3)} = 1 \pmod{F(t)}\f$, then also
 * \f$t^{({2^m}-1)} = t^{(p_1 p_2 p_3)} = (t^{(p_1 p_3)})^{p_2} = 1^{p_2} = 1 \pmod{F(t)}\f$.
 * Therefore, one must make sure that no proper divisor of \f${2^m}-1\f$ is the real order.
 *
 * Using the factorization tables of the <a href="http://www.cerias.purdue.edu/homes/ssw/cun/">Cunningham Project</a>,
 * we have calculated a list of <a href="../external/TRINOMIALS">primitive trinomials</a>.&nbsp;
 * The program that was used for this can be found in <code>testsuite/polynomial</code>.
 *
 * <b>Further reading</b>
 *
 * See also <a href="http://www.certicom.com/resources/ecc/math8.html">http://www.certicom.com/resources/ecc/math8.html</a>
 * and chapter 4.5 of <a href="http://www.cacr.math.uwaterloo.ca/hac/">The Handbook Of Applied Cryptography</a>.
 *
 * More information on primitive trinomials and Random Number Generators can be found
 * on <A HREF="http://web.comlab.ox.ac.uk/oucl/work/richard.brent/trinom.html">Richard Brent's page</A>.&nbsp;
 * There you can also find links to sites about <A HREF="http://web.comlab.ox.ac.uk/oucl/work/richard.brent/factors.html">factorization</A>.
 */
template<unsigned int m, unsigned int k>
  class polynomial {
    public:
      // Offset of the digit vector inside this object; taken directly from bitset<m> because
      // M_coefficients is the first data member.
      // Fix this if you add members in front of M_coefficients.
      static size_t const offsetof_vector = bitset<m>::offsetof_vector;

    private:
      // The coefficients of the polynomial; bit i is the coefficient of t^i.
      bitset<m> M_coefficients;
      // The multiplicative unity, returned by unity().
      static polynomial<m, k> const one;

    public:
      /**
       * Returns the multiplicative unity for the field (1).
       */
      static polynomial const& unity(void) { return one; }

    public:
      /**
       * Construct an uninitialized field element.
       */
      polynomial(void) { }

      /**
       * Construct a polynomial of a low degree.
       *
       * The coefficients are given by the single digit \a coefficients.
       */
      explicit polynomial(bitset_digit_t coefficients) : M_coefficients(coefficients) { }

      /**
       * Copy constructor.
       */
      polynomial(polynomial const& p) : M_coefficients(p.M_coefficients) { }

      /**
       * Construct a polynomial with binary coefficients given by the bitset \a coefficients.
       */
      explicit polynomial(bitset<m> const& coefficients) : M_coefficients(coefficients) { }

      /**
       * Construct a polynomial with binary coefficients given by the hexadecimal value in the string \a coefficients.
       */
      polynomial(std::string const& coefficients) : M_coefficients(coefficients) { }

      /**
       * Construct a polynomial that is the sum of two other polynomials.
       * This constructor is for implicit conversions like in
       *
       * \code
       * polynomial<m, k> p1("1"), p2("2"), p3("3");
       *
       * p1 = p2 * (p3 + p1);
       * \endcode
       *
       * Here this constructor is used to implicitly convert <code>(p3 + p1)</code> to a polynomial, because
       * <code>operator*</code> only takes a \c polynomial and not a \c bitsetExpression.
       */
      polynomial(Operator::bitsetExpression<m, false, false, Operator::bitsetXOR> const& expression) : M_coefficients(expression) { }

      /**
       * Assignment operator, set this polynomial equal to \a p.
       */
      polynomial& operator=(polynomial const& p) { M_coefficients = p.M_coefficients; return *this; }

      /**
       * Assignment operator, set the coefficients of this polynomial equal to \a coefficients.
       */
      polynomial& operator=(bitset<m> const& coefficients) { M_coefficients = coefficients; return *this; }

      /**
       * Add two polynomials and assign the result to this polynomial.
       * This operator is used when doing <code>p1 = p2 + p3;</code>.
       */
      polynomial& operator=(Operator::bitsetExpression<m, false, false, Operator::bitsetXOR> const& expression);

      /**
       * Construct a polynomial \f$x\f$ that is the solution to the quadratic equation \f$x^2 + b \cdot x = c\f$.
       */
      polynomial(polynomial const& b, polynomial const& c);

      /**
       * Number of digits needed for temporary buffer used to calculate a square.
       */
      static unsigned int const square_digits = 2 * bitset_base<m>::digits + 4;

      /**
       * Calculate the square of this polynomial.
       *
       * The result is written into \a tmpbuf which is returned, cast to a polynomial.
       * Usage:
       *
       * \code
       * libecc::polynomial<m, k> p1(init);
       * libecc::bitset_digit_t p2buf[libecc::polynomial<m, k>::square_digits];
       * libecc::polynomial<m, k>& p2 = p1.square(p2buf);	// p2 becomes the square of p1.
       * \endcode
       *
       * <code>p2buf</code> should not be destructed until the reference <code>p2</code>
       * will not be used anymore.
       */
      polynomial& square(bitset_digit_t* tmpbuf) const;	// tmpbuf must be an array of `square_digits' bitset_digit_t.

      /**
       * Calculate the square root of this polynomial.
       *
       * The polynomial is replaced, in place, by its square root.
       *
       * \returns <code>false</code>, leaving the polynomial unchanged, when both \a m and \a k are even
       * and the coefficient of some odd power is set; <code>true</code> otherwise.
       */
      bool sqrt(void);

      // The field arithmetic is implemented in terms of operations on the bits.
      /**
       * Add polynomial \a p to this polynomial.
       */
      polynomial& operator+=(polynomial const& p) { M_coefficients ^= p.M_coefficients; return *this; }

      /**
       * Subtract polynomial \a p from this polynomial.  Has the same effect as adding it.
       */
      polynomial& operator-=(polynomial const& p) { M_coefficients ^= p.M_coefficients; return *this; }

      /**
       * Multiply this polynomial with polynomial \a p.
       */
      polynomial& operator*=(polynomial const& p);

      /**
       * Divide this polynomial by polynomial \a p.
       */
      polynomial& operator/=(polynomial const& p);

      /**
       * Prepare the addition of two polynomials.
       *
       * Returns a dummy object with two pointers to the polynomials to be added.
       * The addition does not take place until the final destination polynomial is known as well.
       */
      friend Operator::bitsetExpression<m, false, false, Operator::bitsetXOR> operator+ <>(polynomial const& p1, polynomial const& p2);

      /**
       * Prepare the subtraction of two polynomials.
       *
       * Returns a dummy object with two pointers to the polynomials to be subtracted.
       * The subtraction does not take place until the final destination polynomial is known as well.
       */
      friend Operator::bitsetExpression<m, false, false, Operator::bitsetXOR> operator- <>(polynomial const& p1, polynomial const& p2);

      /**
       * Multiply polynomial \a p1 with polynomial \a p2.
       */
      friend polynomial operator* <>(polynomial const& p1, polynomial const& p2);

      /**
       * Divide polynomial \a p1 by polynomial \a p2.
       */
      friend polynomial operator/ <>(polynomial const& p1, polynomial const& p2);

      /**
       * Compare two polynomials.  Returns <code>true</code> when \a p1 == \a p2, <code>false</code> otherwise.
       */
      friend bool operator== <>(polynomial const& p1, polynomial const& p2);

      /**
       * Compare two polynomials.  Returns <code>true</code> when \a p1 != \a p2, <code>false</code> otherwise.
       */
      friend bool operator!= <>(polynomial const& p1, polynomial const& p2);

      /**
       * Write the binary coefficients of the polynomial \a p to \a os.
       *
       * For example, an output string <code>1001101</code> means \f$t^6 + t^3 + t^2 + 1\f$.
       */
      friend std::ostream& operator<< <>(std::ostream& os, polynomial const& p);

      /**
       * Return the underlying bitset representing the coefficients.
       */
      bitset<m> const& get_bitset(void) const { return M_coefficients; }

      /**
       * Return the underlying bitset representing the coefficients.
       */
      bitset<m>& get_bitset(void) { return M_coefficients; }

    private:
      // Reduce the unreduced product stored in `buf'; multiply_with() calls this on a buffer of
      // 2 * bitset<m>::digits digits.  Presumably the reduction is modulo t^m + t^k + 1 - the
      // definition is not part of this section of the file.
      static void reduce(bitset_digit_t* buf);
      // NOTE(review): not defined or used in this section of the file; purpose to be confirmed.
      static bitset_digit_t reducea(bitset_digit_t const* a, bitset_digit_t* b, bitset_digit_t* c);

      // Multiply this polynomial with `p1' and store the reduced product in `result'.
      // `result' is allowed to be the M_coefficients of this object (see operator*=).
      void multiply_with(polynomial const& p1, bitset<m>& result) const;
  };

// Definition of the static multiplicative unity that polynomial<m, k>::unity() returns.
// It is built with the explicit polynomial(bitset_digit_t) constructor from the digit value 1
// (presumably setting only the coefficient of t^0; this depends on the bitset constructor).
template<unsigned int m, unsigned int k>
  polynomial<m, k> const polynomial<m, k>::one(1);

/*
 * Replace this polynomial by its square root.
 *
 * Phase one rewrites every odd power of t as a sum of even powers by means of
 * the reduction polynomial (t^m = t^k + 1).  Even powers that are m or larger
 * cannot be stored in M_coefficients; they are collected in `overflow', where
 * bit i stands for the term t^(m + i).
 *
 * Phase two halves all (now even) exponents: first those that remained in
 * M_coefficients, then those parked in `overflow'.
 *
 * Returns false, without changing the polynomial, when m and k are both even
 * and a coefficient of an odd power is set.  Returns true otherwise.
 */
template<unsigned int m, unsigned int k>
  bool polynomial<m, k>::sqrt(void)
  {
    bool const m_is_odd = (m & 1) != 0;
    bool const k_is_odd = (k & 1) != 0;

    bitset<m> overflow;
    overflow.reset();

    // Phase one: get rid of the odd powers.  The loops run upwards on purpose:
    // a flip of a higher odd coefficient must be seen when that bit is visited.
    if (m_is_odd && k_is_odd)
    {
      for (unsigned int odd = 1; odd < m; odd += 2)
      {
        if (!M_coefficients.test(odd))
          continue;
        if (odd >= m - k)
          overflow.flip(odd + k - m);
        else
          M_coefficients.flip(odd + k);
        overflow.flip(odd);
      }
    }
    else if (m_is_odd)            // m is odd, k is even.
    {
      for (unsigned int odd = 1; odd < m; odd += 2)
      {
        if (!M_coefficients.test(odd))
          continue;
        if (odd >= m - k)
        {
          M_coefficients.flip(odd + 2 * k - m);
          M_coefficients.flip(odd + k - m);
        }
        else
          M_coefficients.flip(odd + k);
        overflow.flip(odd);
      }
    }
    else if (k_is_odd)            // m is even, k is odd.
    {
      for (unsigned int odd = 1; odd < m; odd += 2)
      {
        if (!M_coefficients.test(odd))
          continue;
        if (odd < k)
        {
          M_coefficients.flip(odd + k);
          M_coefficients.flip(odd + m - k);
          overflow.flip(odd + m - k);
        }
        else
        {
          M_coefficients.flip(odd - k);
          overflow.flip(odd - k);
        }
      }
    }
    else                          // m and k are both even (should never be used as reduction polynomial).
    {
      for (unsigned int odd = 1; odd < m; odd += 2)
      {
        if (M_coefficients.test(odd))
          return false;           // An odd power is present: this can't be a square.
      }
    }

    // Phase two: halve the exponents.  Bit 0 stays where it is, the coefficient of
    // t^2 moves to bit 1, t^4 to bit 2, and so on.  The destination index is always
    // below the source index, so nothing is overwritten before it has been read.
    unsigned int dest = 1;
    for (unsigned int src = 2; src < m; src += 2, ++dest)
    {
      if (M_coefficients.test(src))
        M_coefficients.set(dest);
      else
        M_coefficients.clear(dest);
    }
    // Continue with the terms t^(m + src) stored in `overflow'; starting at m % 2
    // selects exactly those src for which m + src is even.
    for (unsigned int src = m % 2; src < m; src += 2, ++dest)
    {
      if (overflow.test(src))
        M_coefficients.set(dest);
      else
        M_coefficients.clear(dest);
    }
    return true;
  }

/*
 * Multiply this polynomial with e1, in place.
 *
 * The work is done by multiply_with(), which is asked to write the reduced
 * product straight back into the coefficients of this object.
 */
template<unsigned int m, unsigned int k>
  polynomial<m, k>&
  polynomial<m, k>::operator*=(polynomial const& e1)
  {
    polynomial<m, k>& self = *this;
    self.multiply_with(e1, self.M_coefficients);
    return self;
  }

/*
 * Assign the (not yet evaluated) sum of two polynomials to this polynomial.
 *
 * The XOR expression is handed to the assignment operator of the underlying
 * bitset, which stores the result into the coefficients of this object.
 */
template<unsigned int m, unsigned int k>
  inline polynomial<m, k>&
  polynomial<m, k>::operator=(Operator::bitsetExpression<m, false, false, Operator::bitsetXOR> const& expression)
  {
    this->M_coefficients = expression;
    return (*this);
  }

/*
 * Multiply this polynomial with p1 and store the reduced product in `result'.
 *
 * The unreduced product is accumulated in a local buffer of twice the number
 * of digits of a bitset<m>:
 *
 * 1. Bit 0 of every digit of this polynomial contributes p1 shifted by a whole
 *    number of digits.  This pass also takes care of initializing the buffer,
 *    so that it never has to be cleared as a whole.
 * 2. For each of the remaining bit positions of a digit, p1 shifted left by
 *    that many bits is added (exclusive-or) at the digit offset of every digit
 *    of this polynomial that has the bit set.
 *
 * Finally the buffer is passed to reduce() and its lowest bitset<m>::digits
 * digits are copied to `result'.  `result' is only written after all input has
 * been read, so it may be the M_coefficients of this object (as in operator*=).
 */
template<unsigned int m, unsigned int k>
  void
  polynomial<m, k>::multiply_with(polynomial const& p1, bitset<m>& result) const
  {
    // Buffer for the unreduced product.
    bitset_digit_t output[bitset<m>::digits * 2] __attribute__ ((aligned (8)));

    // Find the first non-zero digit in the input polynomial of this object.
    unsigned int digit = 0;
    while(M_coefficients.digit(digit) == 0)		// Still zero?
    {
      output[digit] = 0;				// That means that this digit of the output will be zero too.
      if (++digit == bitset<m>::digits)
      {
	result.reset();					// The whole polynomial is zero, the result will be zero too.
	return;
      }
    }
    unsigned int uninitialized_digit = digit;		// The next digit of `output' that has not yet been initialized.
    // Find the first digit in the input polynomial of this object whose first bit is set.
    for(; digit < bitset<m>::digits; ++digit)
    {
      if ((M_coefficients.digit(digit) & 1))		// Is the first bit set?
      {
	// Set the output to p1 times this bit.
	// This is a plain copy: the digits it writes have not been initialized yet.
	for (unsigned int d = 0; d < bitset<m>::digits; ++d)
	  output[d + digit] = p1.get_bitset().digit(d);
	uninitialized_digit = bitset<m>::digits + digit;
	++digit;					// Set to the next input digit.
	break;
      }
      output[digit] = 0;				// Initialize this digit of the output to 0.
      ++uninitialized_digit;
    }
    // Set the remaining digits to zero, if any.
    for(unsigned int remaining_digit = uninitialized_digit; remaining_digit < sizeof(output) / sizeof(bitset_digit_t); ++remaining_digit)
      output[remaining_digit] = 0;
    // Find for the remaining input digits the ones that have their first bit set.
    for(; digit < bitset<m>::digits; ++digit)
      if ((M_coefficients.digit(digit) & 1))		// Is the first bit set?
      {
	// Add p1 times this bit to the output.
	for (unsigned int d = 0; d < bitset<m>::digits; ++d)
	  output[d + digit] ^= p1.get_bitset().digit(d);
      }
    // Create a bitset that will contain p1, shifted at most bitset_digit_bits - 1 to the left.
    bitset<m + bitset_digit_bits - 1> shifted_p1;
    // Start with having it shifted 1 bit to the left.
    // `carry' transports the most significant bit of each digit into the next digit.
    bitset_digit_t carry = 0;
    unsigned int d = 0;
    for(bitset_digit_t const* ptr = p1.get_bitset().digits_ptr(); ptr < p1.get_bitset().digits_ptr() + bitset<m>::digits; ++ptr, ++d)
    {
      shifted_p1.rawdigit(d) = (*ptr << 1) | carry;
      carry = *ptr >> (8 * sizeof(bitset_digit_t) - 1);
    }
    // When shifted_p1 has one digit more than p1, that digit receives the last carry.
    if (d < bitset<m + bitset_digit_bits - 1>::digits)
      shifted_p1.rawdigit(d) = carry;
    // Handle the bits 1 and higher of every input digit, one bit position per iteration.
    // Invariant: shifted_p1 equals p1 shifted left by the position of the bit in `bitmask'.
    for(bitset_digit_t bitmask = 2;;)
    {
      // Note: the `digit' and `d' declared in these loops shadow the variables of the same name above.
      for(unsigned int digit = 0; digit < bitset<m>::digits; ++digit)
	if ((M_coefficients.digit(digit) & bitmask))
	{
	  for (unsigned int d = 0; d < shifted_p1.digits; ++d)
	    output[d + digit] ^= shifted_p1.digit(d);
	}
      bitmask <<= 1;		// Next bit.
      if (bitmask == 0)		// Done?  (The single bit of the mask was shifted out of the digit.)
	break;
      // Shift p1 one bit further to the left.
      shifted_p1.template shift_op<1, left, assign>(shifted_p1);
    }
    // Reduce the resulting output of the multiplication.
    reduce(output);
    // Copy the reduced output to `result'.
    std::memcpy(result.digits_ptr(), output, bitset<m>::digits * sizeof(bitset_digit_t));
  }

#if ECC_DEBUG
/*
 * Debug helper that prints the bits of one of the working variables of
 * polynomial<m, k>::operator/=().
 *
 * M_p   : pointer to the digit that holds the coefficient of t^0.
 * M_deg : highest power of interest; the output switches to red at this bit.
 * M_low : lowest power of interest; the colour is reset after this bit.
 *
 * print_on() writes the coefficients from t^(2m - 1) downwards, followed by a
 * '.' after the coefficient of t^0.  When M_low is negative the output
 * continues with the negative powers, down to the start of the digit that
 * contains t^M_low; those bits are read from digits in front of M_p, so the
 * caller has to make sure that memory exists.
 */
template<unsigned int m>
struct div_tct {
  bitset_digit_t const* M_p;
  int M_deg;
  int M_low;
  div_tct(bitset<m> const& b, int deg, int low) : M_p(b.digits_ptr()), M_deg(deg), M_low(low) { }
  void print_on(std::ostream& os) const
  {
    // Round M_low down to the start of its digit, but never print less than down to t^0.
    int lowbit = (M_low >> bitset_digit_bits_log2) * bitset_digit_bits;
    if (lowbit > 0)
      lowbit = 0;
    for (int b = 2 * m - 1; b >= lowbit; --b)
    {
      if (b == M_deg)
	os << "\033[31m";		// ANSI escape sequence: red.
      // For negative b this relies on an arithmetic right shift (rounding towards minus infinity).
      int digitoffset = (b >> bitset_digit_bits_log2);
      // Shift a value of the digit type, not a plain int: `1 << n' overflows int
      // (or is undefined) as soon as n reaches the width of an int.
      bitset_digit_t mask = static_cast<bitset_digit_t>(1) << (b & (bitset_digit_bits - 1));
      if (M_p[digitoffset] & mask)
	os << '1';
      else
	os << '0';
      if (b == M_low)
	os << "\033[0m";		// ANSI escape sequence: reset attributes.
      if (b == 0)
	os << '.';
    }
  }
};
#endif

template<unsigned int m, unsigned int k>
  polynomial<m, k>&
  polynomial<m, k>::operator/=(polynomial const& e1)
  {
#if ECC_DEBUG
    Dout(dc::polynomial|noprefix_cf, "");
    Dout(dc::polynomial, "Entering polynomial<" << m << ", " << k << ">::operator/=()");
    polynomial<m, k> x(e1.get_bitset());
    polynomial<m, k> y(M_coefficients);
    Dout(dc::polynomial, "x(t) = " << x);
    Dout(dc::polynomial|flush_cf, "y(t) = " << y);
#endif

    // The following algorithm is based on the algorithm
    // described in http://research.sun.com/techrep/2001/smli_tr-2001-95.ps
    // with significant optimization changes by Carlo Wood.

    // Make sure that there is enough space for a full bitset object
    // and align the bitsets on a multiple of bitset_digit_t.
    static unsigned int const digit_offset_UV = ((sizeof(bitset<m>) * 8 - 1) / bitset_digit_bits + 1);
    static unsigned int const offset_UV = digit_offset_UV * bitset_digit_bits;
    // Make room for exponents from at least t^-m till t^2m.
    static unsigned int const digit_size_UV = 3 * digit_offset_UV;
    // Variables A and B do not need this much space.
    static unsigned int const digit_size_AB = bitset<m>::digits;
    // One digit of padding, needed for assembly routine.
    static unsigned int const padding_digit_size = 1;

    // Declare stack space for four variables.
    bitset_digit_t bitpool [5 * padding_digit_size + 2 * digit_size_AB + 2 * digit_size_UV]
        __attribute__ ((__aligned__ (32)));
    std::memset((char*)bitpool, 0, sizeof(bitpool));

    bitset<m>& A(*(bitset<m>*)&bitpool[padding_digit_size]);
    bitset<m>& B(*(bitset<m>*)&bitpool[2 * padding_digit_size + digit_size_AB]);
    bitset<m>& U(*(bitset<m>*)&bitpool[3 * padding_digit_size + 2 * digit_size_AB + digit_offset_UV]);
    bitset<m>& V(*(bitset<m>*)&bitpool[4 * padding_digit_size + 2 * digit_size_AB + digit_size_UV + digit_offset_UV]);

    // The representation of U and V will be done with bitsets of size `digit_size_UV * bitset_digit_bits'.
    // This means that they contain powers of t with a negative exponent.
    // That is not a problem as those are well defined: t^(-n) = 1 / t^n.

    // Let M(t) = t^m + t^k + 1.
#if ECC_DEBUG
    polynomial<m + 1, 1> M("1");
    M.get_bitset().set(k);
    M.get_bitset().set(m);
#endif

    // Let U(t) = y(t) (= M_coefficients).
    Dout(dc::polynomial|flush_cf, "U <- y");
    U = M_coefficients;

    // Guess the maximum and minimum powers to be the possible limits.
    int degU = m - 1;
    int lowU = 0;

    // Let A(t) = x(t).
    Dout(dc::polynomial|flush_cf, "A <- x");
    A = e1.get_bitset();

    // Then
    //
    // A(t) * y(t) = U(t) * x(t)  [mod M(t)].

    // Let V(t) = 0
    // Let B = M(t)
    //
    // Then
    //
    // B(t) * y(t) = V(t) * x(t)  [mod M(t)].
    //
    // Let degA be the highest power of t in A.
    typename bitset<m>::const_reverse_iterator degA = A.rbegin();
    degA.find1();
    Dout(dc::polynomial|flush_cf, "deg(A) == " << degA);

    // Let lowA be the lowest power of t in A.
    typename bitset<m>::const_iterator lowA = A.begin();
    lowA.find1();
    Dout(dc::polynomial|flush_cf, "low(A) == " << lowA);

    unsigned int sizeA = static_cast<unsigned int>(degA) - static_cast<unsigned int>(lowA);

    // Let n = m - deg(A).
    unsigned int n = m - static_cast<unsigned int>(degA);
    //
    // Then B'(t) = B(t) - A(t) * t^n will have a degree less than m.
    // And
    //
    // B'(t) * y(t) = B(t) * y(t) - A(t) * y(t) * t^n =
    //              = V(t) * x(t) - U(t) * x(t) * t^n =
    //              = (V(t) - U(t) * t^n) * x(t) =
    //              = V'(t) * x(t)			[mod M(t)].
    //
    // B <- B'
    Dout(dc::polynomial|flush_cf, "B <- A * t^" << n << " + " << M);
    B.xor_with_zero_padded(A, lowA, static_cast<unsigned int>(degA), n);
    B.template flip<m>();
    B.template flip<k>();
    B.template flip<0>();

    // Let degB be the highest power of t in B.
    typename bitset<m>::const_reverse_iterator degB = B.rbegin();
    degB.find1();
    Dout(dc::polynomial|flush_cf, "deg(B) == " << degB);

    // Let lowB be the lowest power of t in B.
    typename bitset<m>::const_iterator lowB = B.begin();
    lowB.find1();
    Dout(dc::polynomial|flush_cf, "low(B) == " << lowB);

    // V <- V'
    Dout(dc::polynomial|flush_cf, "V <- U * t^" << n << "  [mod " << M << "]");
    V.xor_with_zero_padded(U, 0, m - 1, n);

    int degV = degU + n;
    int lowV = lowU + n;
    
    unsigned int sizeB = static_cast<unsigned int>(degB) - static_cast<unsigned int>(lowB);

    if (sizeA > 0 && sizeB > 0)
      for(;;)
      {
	Dout(dc::polynomial|flush_cf, "A = " << cwprint(div_tct<m>(A, degA, lowA)));
	Dout(dc::polynomial|flush_cf, "B = " << cwprint(div_tct<m>(B, degB, lowB)));
	Dout(dc::polynomial|flush_cf, "U = " << cwprint(div_tct<m>(U, degU, lowU)));
	Dout(dc::polynomial|flush_cf, "V = " << cwprint(div_tct<m>(V, degV, lowV)));
	if (sizeA < sizeB)
	{
	  int left_shift = static_cast<unsigned int>(lowB) - static_cast<unsigned int>(lowA);
	  Dout(dc::polynomial|flush_cf, "B <- B + A * t^" << left_shift);
	  B.xor_with_zero_padded(A, lowA, degA, left_shift);
	  degB.find1();
	  lowB.find1();
	  sizeB = static_cast<unsigned int>(degB) - static_cast<unsigned int>(lowB);
	  Dout(dc::polynomial|flush_cf, "V <- V + U * t^" << left_shift);
	  V.xor_with_zero_padded(U, lowU, degU, left_shift);
          degV = std::max(degV, degU + left_shift);
	  lowV = std::min(lowV, lowU + left_shift);
	  if (sizeB == 0)
	    break;
	}
	else
	{
	  int left_shift = static_cast<unsigned int>(lowA) - static_cast<unsigned int>(lowB);
	  Dout(dc::polynomial|flush_cf, "A <- A + B * t^" << left_shift);
	  A.xor_with_zero_padded(B, lowB, degB, left_shift);
	  degA.find1();
	  lowA.find1();
	  sizeA = static_cast<unsigned int>(degA) - static_cast<unsigned int>(lowA);
	  Dout(dc::polynomial|flush_cf, "U <- U + V * t^" << left_shift);
	  U.xor_with_zero_padded(V, lowV, degV, left_shift);
          degU = std::max(degU, degV + left_shift);
	  lowU = std::min(lowU, lowV + left_shift);
	  if (sizeA == 0)
	    break;
	}
      }

    Dout(dc::polynomial|flush_cf, "A = " << cwprint(div_tct<m>(A, degA, lowA)));
    Dout(dc::polynomial|flush_cf, "B = " << cwprint(div_tct<m>(B, degB, lowB)));
    Dout(dc::polynomial|flush_cf, "U = " << cwprint(div_tct<m>(U, degU, lowU)));
    Dout(dc::polynomial|flush_cf, "V = " << cwprint(div_tct<m>(V, degV, lowV)));

    bitset<m>* R;
    // 'F' (Floating-point polynomial) will be shifted to the right and
    // is therefore defined to run from t^-2m till t^2m.  This means it will
    // be shifted OVER the other bitsets, but we don't need those anymore anyway.
    static unsigned int const offset_F = 2 * offset_UV;
    static unsigned int const size_F = 2 * m + offset_F;
    bitset<size_F>* F;
    int low1, lowR;
#if ECC_DEBUG
    int degR;
#endif
    if (sizeA == 0)
    {
      Dout(dc::polynomial|flush_cf, "R = U");
      R = &U;
      F = (bitset<size_F>*)&bitpool[3 * padding_digit_size + 2 * digit_size_AB - digit_offset_UV];
      low1 = lowA;
      lowR = lowU;
#if ECC_DEBUG
      degR = degU;
#endif
    }
    else if (sizeB == 0)
    {
      Dout(dc::polynomial|flush_cf, "R = V");
      R = &V;
      F = (bitset<size_F>*)&bitpool[4 * padding_digit_size + 2 * digit_size_AB + digit_size_UV - digit_offset_UV];
      low1 = lowB;
      lowR = lowV;
#if ECC_DEBUG
      degR = degV;
#endif
    }

    *F >>= low1;
    lowR -= low1;
#if ECC_DEBUG
    degR -= low1;
#endif
    // Get rid of negative exponents.
    Dout(dc::polynomial|flush_cf, "lowR = " << lowR);
    Dout(dc::polynomial|flush_cf, "R = " << cwprint(div_tct<m>(*R, degR, lowR)));
    if (k >= 32)
    {
      static int const digit_shift_k = k >> bitset_digit_bits_log2;
      static int const bit_shift_k = k & (bitset_digit_bits  - 1);
      static int const digit_shift_m = m >> bitset_digit_bits_log2;
      static int const bit_shift_m = m & (bitset_digit_bits  - 1);
      int first_digit = (lowR + offset_F) >> bitset_digit_bits_log2;
      bitset_digit_t* ptr = F->digits_ptr() + first_digit;
      bitset_digit_t* ptr1 = R->digits_ptr();
      while(ptr < ptr1)
      {
	ptr[digit_shift_k] ^= (*ptr) << bit_shift_k;
	if (bit_shift_k != 0)
	  ptr[digit_shift_k + 1] ^= (*ptr) >> (32 - bit_shift_k);
	ptr[digit_shift_m] ^= (*ptr) << bit_shift_m;
	if (bit_shift_m != 0)
	  ptr[digit_shift_m + 1] ^= (*ptr) >> (32 - bit_shift_m);
	++ptr;
      }
    }
    else
    {
      for (int i = lowR + offset_F; i < offset_F; ++i)
      {
	if (F->test(i))
	{
#if ECC_DEBUG
	  F->flip(i);		// This is not really needed, but prints nicer output below.
#endif
	  F->flip(i + k);
	  F->flip(i + m);
	}
      }
    }
#if ECC_DEBUG
    lowR = 0;
    degR = 2 * m - 1;
#endif
    Dout(dc::polynomial|flush_cf, "R = " << cwprint(div_tct<m>(*R, degR, lowR)));
    reduce(R->digits_ptr());
#if ECC_DEBUG
    degR = m - 1;
#endif
    Dout(dc::polynomial|flush_cf, "R = " << cwprint(div_tct<m>(*R, degR, lowR)));
    M_coefficients = *R;

    return *this;
  }

// Solve x^2 + b x = c.
// We assume that b != 0 and c != 0 and that there are 2 solutions.
// The solutions are x1 and x1 + b.  This means that during the
// 'wiping' of the matrix in order to solve x, one bit of x will
// stay undetermined.  We need to take special care to make sure
// that this will be a bit for which a bit of 'b' is set, otherwise
// we'd return a wrong value.
//
// The object is constructed with the coefficients of c, which serve as the
// right-hand side of a linear system over GF(2); the elimination below
// transforms them in place into the coefficients of a solution x.
// When b has no bits set at all the matrix is not built and the work is
// handed to sqrt() instead (presumably solving x^2 = c in place).
//
// b: the coefficient of the linear term.
// c: the right-hand side of the equation.
template<unsigned int m, unsigned int k>
  polynomial<m, k>::polynomial(polynomial<m, k> const& b, polynomial<m, k> const& c) : M_coefficients(c.M_coefficients)
  {
    using namespace libecc;

    // Mirror the bits of b into reverse_b (bit i of b becomes bit m - 1 - i),
    // count the set bits and remember an index of a set bit.  Because the loop
    // runs downwards, b_row ends up being the lowest set bit of b.
    bitset<m> reverse_b;
    reverse_b.reset();
    int bits_in_b = 0;
    short b_row;
    for (short i = m - 1; i >= 0; --i)
      if (b.M_coefficients.test(i))
      {
        reverse_b.set(m - 1 - i);
        ++bits_in_b;
        b_row = i;
      }
    if (bits_in_b == 0)
    {
      // No linear term: M_coefficients already holds c.
      sqrt();
      return;
    }
    else if (bits_in_b != 1)
      b_row = -1;       // -1 means: no special row selected (yet), see the elimination loop below.

    // Build the matrix for the 'b x' term.  Row k is seeded from four shifted
    // copies of reverse_b (shift_op appears to store the shifted receiver in
    // its argument, combining it as requested by the third template parameter).
    bitset<m> matrix[m];
    reverse_b.template shift_op<k + 1, libecc::left, libecc::assign>(matrix[k]);
    reverse_b.template shift_op<m - (k + 1), libecc::right, libecc::exor>(matrix[k]);   // reverse_b rotated right k + 1
    reverse_b.template shift_op<1, libecc::left, libecc::exor>(matrix[k]);
    reverse_b.template shift_op<m - (k - 1), libecc::left, libecc::exor>(matrix[k]);
    // Every other row is derived from its cyclic predecessor: shifted left by
    // one, with bit 0 set when the matching coefficient of b is set.
    // NOTE(review): for k == m - 1 the first value of j would be m, which is
    // outside matrix[]; presumably k < m - 1 is guaranteed elsewhere -- confirm.
    unsigned short i = k;
    for (unsigned short j = i + 1; j != k; j = (j + 1) % m)
    {
      matrix[i].template shift_op<1, libecc::left, libecc::assign>(matrix[j]);
      if (b.M_coefficients.test(j))
        matrix[j].template set<0>();
      i = j;
    }

#if ECC_DEBUGOUTPUT
    LibEccDout(dc::polynomial, "b Matrix =");
    for (short row2 = m - 1; row2 >= 0; --row2)
      LibEccDout(dc::polynomial, cwprint_using(*static_cast<libecc::bitset_invertible<m, false>*>(&matrix[row2]),
          &bitset<m>::base2_print_on) << "  x" << row2 << "   " << (M_coefficients.test(row2) ? '1' : '0'));
#endif

    // Add the 'x^2' term.  For every column the entries that are flipped are
    // found by starting at exponent 2 * col and, while the exponent is >= m,
    // flipping row (r - m) and continuing with r - (m - k); this appears to
    // mirror the reduction of t^(2 * col) with t^m = t^k + 1.
    for (unsigned int col = 0; col < m; ++col)
    {
      unsigned int r = 2 * col;
      while(r >= m)
      {
        matrix[r - m].flip(col);
        r -= m - k;
      }
      matrix[r].flip(col);
    }

#if ECC_DEBUGOUTPUT
    LibEccDout(dc::polynomial, "Matrix =");
    for (short row2 = m - 1; row2 >= 0; --row2)
      LibEccDout(dc::polynomial, cwprint_using(*static_cast<libecc::bitset_invertible<m, false>*>(&matrix[row2]),
          &bitset<m>::base2_print_on) << "  x" << row2 << "   " << (M_coefficients.test(row2) ? '1' : '0'));
#endif

    // rows[] is a permutation: logical row i of the system is stored in
    // matrix[rows[i]].  Rows are swapped by exchanging entries of rows[] and
    // the corresponding bits of M_coefficients; the bitsets themselves stay put.
    unsigned short rows[m];
    for (unsigned short i = 0; i < m; ++i)
      rows[i] = i;

    if (bits_in_b == 1)
    {
      // b has exactly one bit set (at b_row): exchange an all-zero matrix row
      // with row b_row, so that the row without equation sits at b_row.
      for (short row2 = m - 1; row2 >= 0; --row2)
      {
        if (!matrix[row2].any())
        {
          if (row2 != b_row)
          {
            rows[b_row] = row2;
            rows[row2] = b_row;
            if (M_coefficients.test(b_row) != M_coefficients.test(row2))
            {
              M_coefficients.flip(b_row);
              M_coefficients.flip(row2);
            }
          }
        }
      }
    }

#if ECC_DEBUGOUTPUT
    LibEccDout(dc::polynomial, "After swap: Matrix =");
    for (short row2 = m - 1; row2 >= 0; --row2)
      LibEccDout(dc::polynomial, cwprint_using(*static_cast<libecc::bitset_invertible<m, false>*>(&matrix[rows[row2]]),
          &bitset<m>::base2_print_on)
          << "  x" << row2 << "   " << (M_coefficients.test(row2) ? '1' : '0')
          << "   " << (b.get_bitset().test(row2) ? '1' : '0'));
#endif

    // Forward elimination over GF(2).  For every column, look for the first
    // logical row at or below the diagonal that has this column set, use it to
    // wipe the column from all later rows (applying the same operation to the
    // right-hand side in M_coefficients) and move it onto the diagonal.
    for (unsigned short col = 0; col < m; ++col)
    {
      unsigned short row = col;
      for (; row < m; ++row)
      {
        unsigned short mr = rows[row];
        if (matrix[mr].test(col))
        {
          for (unsigned short row2 = row + 1; row2 < m; ++row2)
          {
            unsigned short mr2 = rows[row2];
            if (matrix[mr2].test(col))
            {
              matrix[mr2] ^= matrix[mr];
              if (!b.get_bitset().test(row2) && !matrix[mr2].any())
              {
                // This could lead to possible problems.  Therefore wipe with row2 instead of row.
                matrix[mr2] = matrix[mr];
                matrix[mr].reset();
                // Undo previous wiping when necessary.
                if (M_coefficients.test(row) != M_coefficients.test(row2))
                  for (unsigned short row3 = row + 1; row3 < row2; ++row3)
                    M_coefficients.flip(row3);
                if (M_coefficients.test(row2))
                  M_coefficients.flip(row);
                // Continue with row2 as wipe row.
                row = row2;
                mr = mr2;
              }
              else if (M_coefficients.test(row))
                M_coefficients.flip(row2);
            }
          }

#if ECC_DEBUGOUTPUT
          LibEccDout(dc::polynomial, "After wipe: Matrix =");
          for (short row2 = m - 1; row2 >= 0; --row2)
            LibEccDout(dc::polynomial, cwprint_using(*static_cast<libecc::bitset_invertible<m, false>*>(&matrix[rows[row2]]),
                &bitset<m>::base2_print_on)
                << "  x" << row2 << "   " << (M_coefficients.test(row2) ? '1' : '0')
                << "   " << (b.get_bitset().test(row2) ? '1' : '0'));
#endif

          // Move the wipe row onto the diagonal.
          if (row != col)
          {
            unsigned short mc = rows[col];
            rows[col] = mr;
            rows[row] = mc;
            if (M_coefficients.test(row) != M_coefficients.test(col))
            {
              M_coefficients.flip(row);
              M_coefficients.flip(col);
            }
          }

#if ECC_DEBUGOUTPUT
          LibEccDout(dc::polynomial, "After swap: Matrix =");
          for (short row2 = m - 1; row2 >= 0; --row2)
            LibEccDout(dc::polynomial, cwprint_using(*static_cast<libecc::bitset_invertible<m, false>*>(&matrix[rows[row2]]),
                &bitset<m>::base2_print_on)
                << "  x" << row2 << "   " << (M_coefficients.test(row2) ? '1' : '0')
                << "   " << (b.get_bitset().test(row2) ? '1' : '0'));
#endif

          break;
        }
      }
      if (row == m)
      {
        // No row at or below the diagonal has this column set.
        if (b_row == -1)
        {
          // Again a possible problem, this row `col' needs to be swapped
          // later with another row that we will run into later (if there
          // is a solution at all).  See next 'else' block.
          LibEccDout(dc::polynomial, "Setting b_row to " << col);
          b_row = col;
        }
        else
        {
          LibEccDout(dc::polynomial, "Swapping row " << col << " with row " << b_row);
          // Now we did run into this row.
          // Swap row `col' with `b_row'.
          unsigned short mc = rows[b_row];
          rows[b_row] = rows[col];
          rows[col] = mc;
          if (M_coefficients.test(b_row) != M_coefficients.test(col))
          {
            M_coefficients.flip(b_row);
            M_coefficients.flip(col);
          }

#if ECC_DEBUGOUTPUT
          LibEccDout(dc::polynomial, "After swap: Matrix =");
          for (short row2 = m - 1; row2 >= 0; --row2)
            LibEccDout(dc::polynomial, cwprint_using(*static_cast<libecc::bitset_invertible<m, false>*>(&matrix[rows[row2]]),
                &bitset<m>::base2_print_on)
                << "  x" << row2 << "   " << (M_coefficients.test(row2) ? '1' : '0')
                << "   " << (b.get_bitset().test(row2) ? '1' : '0'));
#endif

          // Now we need to wipe row `col' again.
          for (unsigned short col2 = b_row + 1; col2 < col; ++col2)
            if (matrix[mc].test(col2))
            {
              matrix[mc] ^= matrix[rows[col2]];
              if (M_coefficients.test(col2))
                M_coefficients.flip(col);

#if ECC_DEBUGOUTPUT
              LibEccDout(dc::polynomial, "After wipe of row " << col << " with row " << col2 << ": Matrix =");
              for (short row2 = m - 1; row2 >= 0; --row2)
                LibEccDout(dc::polynomial, cwprint_using(*static_cast<libecc::bitset_invertible<m, false>*>(&matrix[rows[row2]]),
                    &bitset<m>::base2_print_on)
                    << "  x" << row2 << "   " << (M_coefficients.test(row2) ? '1' : '0')
                    << "   " << (b.get_bitset().test(row2) ? '1' : '0'));
#endif

            }
        }
      }
    }

    // Back substitution, from the highest bit down: bit i is flipped once for
    // every j > i for which both bit j of M_coefficients and bit j of logical
    // row i are set.
    for (short i = m - 1; i >= 0; --i)
    {
      bitset<m> temp = M_coefficients & matrix[rows[i]];
      for (short j = m - 1; j > i; --j)
        if (temp.test(j))
          M_coefficients.flip(i);
    }
  }

// Equality: two polynomials over the same field compare equal when their
// coefficient bitsets compare equal.
template<unsigned int m, unsigned int k>
  inline bool
  operator==(polynomial<m, k> const& p1, polynomial<m, k> const& p2)
  {
    bool const same_coefficients = (p1.M_coefficients == p2.M_coefficients);
    return same_coefficients;
  }

// Inequality: the result of comparing the two coefficient bitsets with `!='.
template<unsigned int m, unsigned int k>
  inline bool
  operator!=(polynomial<m, k> const& p1, polynomial<m, k> const& p2)
  {
    bool const different_coefficients = (p1.M_coefficients != p2.M_coefficients);
    return different_coefficients;
  }

// Addition.  No polynomial is computed here: the function returns a
// bitsetXOR expression object that refers to the coefficients of both operands.
template<unsigned int m, unsigned int k>
  inline Operator::bitsetExpression<m, false, false, Operator::bitsetXOR>
  operator+(polynomial<m, k> const& p1, polynomial<m, k> const& p2)
  {
    typedef Operator::bitsetExpression<m, false, false, Operator::bitsetXOR> xor_expression_type;
    return xor_expression_type(p1.M_coefficients, p2.M_coefficients);
  }

// Subtraction.  Builds exactly the same bitsetXOR expression object as
// operator+ does: with binary coefficients subtracting equals adding.
template<unsigned int m, unsigned int k>
  inline Operator::bitsetExpression<m, false, false, Operator::bitsetXOR>
  operator-(polynomial<m, k> const& p1, polynomial<m, k> const& p2)
  {
    typedef Operator::bitsetExpression<m, false, false, Operator::bitsetXOR> xor_expression_type;
    return xor_expression_type(p1.M_coefficients, p2.M_coefficients);
  }

// Multiplication.  Lets p1.multiply_with() write the product of p1 and p2
// into the coefficients of a fresh polynomial, which is returned by value.
template<unsigned int m, unsigned int k>
  inline polynomial<m, k>
  operator*(polynomial<m, k> const& p1, polynomial<m, k> const& p2)
  {
    polynomial<m, k> product;
    p1.multiply_with(p2, product.M_coefficients);
    return product;
  }

// Division.  Copies the numerator e1, applies operator/= with the
// denominator e2 to that copy and returns it by value.
template<unsigned int m, unsigned int k>
  inline polynomial<m, k>
  operator/(polynomial<m, k> const& e1, polynomial<m, k> const& e2)
  {
    polynomial<m, k> quotient(e1);
    quotient /= e2;
    return quotient;
  }

template<unsigned int m, unsigned int k>
  std::ostream& operator<<(std::ostream& os, polynomial<m, k> const& p)
  {
    p.M_coefficients.base2_print_on(os);
    return os;
  }

} // namespace libecc

#include <libecc/square.hcc>	// File with different copyright.

#endif // LIBECC_POLYNOMIAL_H
