diff options
41 files changed, 21260 insertions, 12302 deletions
diff --git a/configure.ac b/configure.ac index 0a7c6ddf69c..f66ca1577f9 100644 --- a/configure.ac +++ b/configure.ac @@ -1346,6 +1346,93 @@ fi  AM_CONDITIONAL([ENABLE_EXPERIMENTAL], [test x$BUILD_EXPERIMENTAL = xyes])  #end experimental section +# EC dynamic code generation section + +EC_DYNAMIC_SUPPORT="none" +EC_DYNAMIC_ARCH="none" + +AC_ARG_ENABLE([ec-dynamic], +              AC_HELP_STRING([--disable-ec-dynamic], +                             [Disable all dynamic code generation extensions for EC module])) + +AC_ARG_ENABLE([ec-dynamic-intel], +              AC_HELP_STRING([--disable-ec-dynamic-intel], +                             [Disable all INTEL dynamic code generation extensions for EC module])) + +AC_ARG_ENABLE([ec-dynamic-arm], +              AC_HELP_STRING([--disable-ec-dynamic-arm], +                             [Disable all ARM dynamic code generation extensions for EC module])) + +AC_ARG_ENABLE([ec-dynamic-x64], +              AC_HELP_STRING([--disable-ec-dynamic-x64], +                             [Disable dynamic INTEL x64 code generation for EC module])) + +AC_ARG_ENABLE([ec-dynamic-sse], +              AC_HELP_STRING([--disable-ec-dynamic-sse], +                             [Disable dynamic INTEL SSE code generation for EC module])) + +AC_ARG_ENABLE([ec-dynamic-avx], +              AC_HELP_STRING([--disable-ec-dynamic-avx], +                             [Disable dynamic INTEL AVX code generation for EC module])) + +AC_ARG_ENABLE([ec-dynamic-neon], +              AC_HELP_STRING([--disable-ec-dynamic-neon], +                             [Disable dynamic ARM NEON code generation for EC module])) + +if test "x$enable_ec_dynamic" != "xno"; then +  case $host in +    x86_64*) +      if test "x$enable_ec_dynamic_intel" != "xno"; then +        if test "x$enable_ec_dynamic_x64" != "xno"; then +          EC_DYNAMIC_SUPPORT="$EC_DYNAMIC_SUPPORT x64" +          AC_DEFINE(USE_EC_DYNAMIC_X64, 1, [Defined if using dynamic INTEL x64 code]) +        fi +        
if test "x$enable_ec_dynamic_sse" != "xno"; then +          EC_DYNAMIC_SUPPORT="$EC_DYNAMIC_SUPPORT sse" +          AC_DEFINE(USE_EC_DYNAMIC_SSE, 1, [Defined if using dynamic INTEL SSE code]) +        fi +        if test "x$enable_ec_dynamic_avx" != "xno"; then +          EC_DYNAMIC_SUPPORT="$EC_DYNAMIC_SUPPORT avx" +          AC_DEFINE(USE_EC_DYNAMIC_AVX, 1, [Defined if using dynamic INTEL AVX code]) +        fi + +        if test "x$EC_DYNAMIC_SUPPORT" != "xnone"; then +          EC_DYNAMIC_ARCH="intel" +        fi +      fi +      ;; +    arm*) +      if test "x$enable_ec_dynamic_arm" != "xno"; then +        if test "x$enable_ec_dynamic_neon" != "xno"; then +          EC_DYNAMIC_SUPPORT="$EC_DYNAMIC_SUPPORT neon" +          AC_DEFINE(USE_EC_DYNAMIC_NEON, 1, [Defined if using dynamic ARM NEON code]) +        fi + +        if test "x$EC_DYNAMIC_SUPPORT" != "xnone"; then +          EC_DYNAMIC_ARCH="arm" +        fi +      fi +      ;; +  esac + +  EC_DYNAMIC_SUPPORT="${EC_DYNAMIC_SUPPORT#none }" +fi + +AM_CONDITIONAL([ENABLE_EC_DYNAMIC_INTEL], [test "x$EC_DYNAMIC_ARCH" = "xintel"]) +AM_CONDITIONAL([ENABLE_EC_DYNAMIC_ARM], [test "x$EC_DYNAMIC_ARCH" = "xarm"]) + +AM_CONDITIONAL([ENABLE_EC_DYNAMIC_X64], [test "x${EC_DYNAMIC_SUPPORT##*x64*}" = "x"]) +AM_CONDITIONAL([ENABLE_EC_DYNAMIC_SSE], [test "x${EC_DYNAMIC_SUPPORT##*sse*}" = "x"]) +AM_CONDITIONAL([ENABLE_EC_DYNAMIC_AVX], [test "x${EC_DYNAMIC_SUPPORT##*avx*}" = "x"]) +AM_CONDITIONAL([ENABLE_EC_DYNAMIC_NEON], [test "x${EC_DYNAMIC_SUPPORT##*neon*}" = "x"]) + +AC_SUBST(USE_EC_DYNAMIC_X64) +AC_SUBST(USE_EC_DYNAMIC_SSE) +AC_SUBST(USE_EC_DYNAMIC_AVX) +AC_SUBST(USE_EC_DYNAMIC_NEON) + +# end EC dynamic code generation section +  dnl libglusterfs.so uses math functions  GF_LDADD="${GF_LDADD} ${MATH_LIB}" @@ -1442,4 +1529,5 @@ echo "Data Classification  : $BUILD_GFDB"  echo "firewalld-config     : $BUILD_FIREWALLD"  echo "Experimental xlators : $BUILD_EXPERIMENTAL"  echo "Events               : $BUILD_EVENTS" +echo "EC 
dynamic support   : $EC_DYNAMIC_SUPPORT"  echo diff --git a/tests/basic/ec/ec-cpu-extensions.t b/tests/basic/ec/ec-cpu-extensions.t new file mode 100644 index 00000000000..a599a316925 --- /dev/null +++ b/tests/basic/ec/ec-cpu-extensions.t @@ -0,0 +1,61 @@ +#!/bin/bash + +DISPERSE=6 +REDUNDANCY=2 + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +TESTS_EXPECTED_IN_LOOP=96 + +function check_contents +{ +    local src=$1 +    local cs=$2 + +    TEST cp $src $M0/file +    TEST [ -f $M0/file ] + +    for ext in none x64 sse avx; do +        EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 +        TEST $CLI volume set $V0 disperse.cpu-extensions $ext +        TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 +        EXPECT_WITHIN $CHILD_UP_TIMEOUT "$DISPERSE" ec_child_up_count $V0 0 + +        EXPECT "$cs" echo $(sha1sum $M0/file | awk '{ print $1 }') +    done + +    TEST rm -f $M0/file +} + +cleanup + +tmp=`mktemp -p ${LOGDIR} -d -t ${0##*/}.XXXXXX` +if [ ! -d $tmp ]; then +    exit 1 +fi + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 redundancy $REDUNDANCY $H0:$B0/${V0}{1..$DISPERSE} +TEST $CLI volume set $V0 performance.flush-behind off +EXPECT 'Created' volinfo_field $V0 'Status' +TEST $CLI volume start $V0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT 'Started' volinfo_field $V0 'Status' + +TEST dd if=/dev/urandom of=$tmp/file bs=1048576 count=1 +cs_file=$(sha1sum $tmp/file | awk '{ print $1 }') + +for ext in none x64 sse avx; do +    TEST $CLI volume set $V0 disperse.cpu-extensions $ext +    TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 +    EXPECT_WITHIN $CHILD_UP_TIMEOUT "$DISPERSE" ec_child_up_count $V0 0 + +    check_contents $tmp/file $cs_file + +    EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 +done + +TEST rm -rf $tmp + +cleanup diff --git a/xlators/cluster/ec/src/Makefile.am b/xlators/cluster/ec/src/Makefile.am index cbdceefdbe0..c5d9ab1812b 100644 --- a/xlators/cluster/ec/src/Makefile.am +++ 
b/xlators/cluster/ec/src/Makefile.am @@ -12,8 +12,11 @@ ec_sources += ec-dir-write.c  ec_sources += ec-inode-read.c  ec_sources += ec-inode-write.c  ec_sources += ec-combine.c -ec_sources += ec-gf.c  ec_sources += ec-method.c +ec_sources += ec-galois.c +ec_sources += ec-code.c +ec_sources += ec-code-c.c +ec_sources += ec-gf8.c  ec_sources += ec-heal.c  ec_sources += ec-heald.c @@ -24,10 +27,34 @@ ec_headers += ec-data.h  ec_headers += ec-fops.h  ec_headers += ec-common.h  ec_headers += ec-combine.h -ec_headers += ec-gf.h  ec_headers += ec-method.h +ec_headers += ec-galois.h +ec_headers += ec-code.h +ec_headers += ec-code-c.h +ec_headers += ec-gf8.h  ec_headers += ec-heald.h  ec_headers += ec-messages.h +ec_headers += ec-types.h + +if ENABLE_EC_DYNAMIC_INTEL +  ec_sources += ec-code-intel.c +  ec_headers += ec-code-intel.h +endif + +if ENABLE_EC_DYNAMIC_X64 +  ec_sources += ec-code-x64.c +  ec_headers += ec-code-x64.h +endif + +if ENABLE_EC_DYNAMIC_SSE +  ec_sources += ec-code-sse.c +  ec_headers += ec-code-sse.h +endif + +if ENABLE_EC_DYNAMIC_AVX +  ec_sources += ec-code-avx.c +  ec_headers += ec-code-avx.h +endif  ec_ext_sources = $(top_builddir)/xlators/lib/src/libxlator.c diff --git a/xlators/cluster/ec/src/ec-code-avx.c b/xlators/cluster/ec/src/ec-code-avx.c new file mode 100644 index 00000000000..92bd3e83c5e --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-avx.c @@ -0,0 +1,116 @@ +/* +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#include <errno.h> + +#include "ec-code-intel.h" + +static void +ec_code_avx_prolog(ec_code_builder_t *builder) +{ +    builder->loop = builder->address; +} + +static void +ec_code_avx_epilog(ec_code_builder_t *builder) +{ +    ec_code_intel_op_add_i2r(builder, 32, REG_DX); +    ec_code_intel_op_add_i2r(builder, 32, REG_DI); +    ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX); +    ec_code_intel_op_jne(builder, builder->loop); + +    ec_code_intel_op_ret(builder, 0); +} + +static void +ec_code_avx_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, +                 uint32_t bit) +{ +    if (builder->linear) { +        ec_code_intel_op_mov_m2avx(builder, REG_SI, REG_DX, 1, +                                   idx * builder->width * builder->bits + +                                   bit * builder->width, +                                   dst); +    } else { +        if (builder->base != idx) { +            ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, +                                     REG_AX); +            builder->base = idx; +        } +        ec_code_intel_op_mov_m2avx(builder, REG_AX, REG_DX, 1, +                                   bit * builder->width, dst); +    } +} + +static void +ec_code_avx_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit) +{ +    ec_code_intel_op_mov_avx2m(builder, src, REG_DI, REG_NULL, 0, +                               bit * builder->width); +} + +static void +ec_code_avx_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ +    ec_code_intel_op_mov_avx2avx(builder, src, dst); +} + +static void +ec_code_avx_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ +    ec_code_intel_op_xor_avx2avx(builder, src, dst); +} + +static void +ec_code_avx_xor3(ec_code_builder_t *builder, uint32_t dst, uint32_t src1, +                 uint32_t src2) +{ +    ec_code_intel_op_mov_avx2avx(builder, src1, dst); +    ec_code_intel_op_xor_avx2avx(builder, src2, dst); 
+} + +static void +ec_code_avx_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, +                 uint32_t bit) +{ +    if (builder->linear) { +        ec_code_intel_op_xor_m2avx(builder, REG_SI, REG_DX, 1, +                                   idx * builder->width * builder->bits + +                                   bit * builder->width, +                                   dst); +    } else { +        if (builder->base != idx) { +            ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, +                                     REG_AX); +            builder->base = idx; +        } +        ec_code_intel_op_xor_m2avx(builder, REG_AX, REG_DX, 1, +                                   bit * builder->width, dst); +    } +} + +static char *ec_code_avx_needed_flags[] = { +    "avx2", +    NULL +}; + +ec_code_gen_t ec_code_gen_avx = { +    .name   = "avx", +    .flags  = ec_code_avx_needed_flags, +    .width  = 32, +    .prolog = ec_code_avx_prolog, +    .epilog = ec_code_avx_epilog, +    .load   = ec_code_avx_load, +    .store  = ec_code_avx_store, +    .copy   = ec_code_avx_copy, +    .xor2   = ec_code_avx_xor2, +    .xor3   = ec_code_avx_xor3, +    .xorm   = ec_code_avx_xorm +}; diff --git a/xlators/cluster/ec/src/ec-code-avx.h b/xlators/cluster/ec/src/ec-code-avx.h new file mode 100644 index 00000000000..fdca4ad2c8f --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-avx.h @@ -0,0 +1,18 @@ +/* +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#ifndef __EC_CODE_AVX_H__ +#define __EC_CODE_AVX_H__ + +#include "ec-code.h" + +extern ec_code_gen_t ec_code_gen_avx; + +#endif /* __EC_CODE_AVX_H__ */ diff --git a/xlators/cluster/ec/src/ec-code-c.c b/xlators/cluster/ec/src/ec-code-c.c new file mode 100644 index 00000000000..7387f3ea435 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-c.c @@ -0,0 +1,11431 @@ +/* +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#include <inttypes.h> +#include <string.h> + +#include "ec-method.h" +#include "ec-code-c.h" + +#define WIDTH (EC_METHOD_WORD_SIZE / sizeof(uint64_t)) + +static void gf8_muladd_00(void *out, void *in) +{ +    memcpy(out, in, EC_METHOD_WORD_SIZE * 8); +} + +static void gf8_muladd_01(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        out_ptr[0] ^= in_ptr[0]; +        out_ptr[WIDTH] ^= in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] ^= in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] ^= in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] ^= in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] ^= in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] ^= in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] ^= in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_02(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; + 
       uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in7; +        out1 = in0; +        out7 = in6; +        out5 = in4; +        out6 = in5; +        out3 = in2 ^ in7; +        out4 = in3 ^ in7; +        out2 = in1 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_03(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in0 ^ in7; +        tmp0 = in2 ^ in7; +        out1 = in0 ^ in1; +        out7 = in6 ^ in7; +        out5 = in4 ^ in5; +        out6 = in5 ^ in6; +        out4 = in3 ^ in4 ^ in7; +        out2 = tmp0 ^ in1; +        out3 = tmp0 ^ in3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ 
in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_04(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in6; +        out1 = in7; +        out7 = in5; +        out6 = in4; +        tmp0 = in6 ^ in7; +        out2 = in0 ^ in6; +        out5 = in3 ^ in7; +        out3 = tmp0 ^ in1; +        out4 = tmp0 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_05(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        
uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in0 ^ in6; +        out1 = in1 ^ in7; +        out7 = in5 ^ in7; +        out6 = in4 ^ in6; +        out2 = out0 ^ in2; +        out3 = out1 ^ in3 ^ in6; +        out5 = out7 ^ in3; +        out4 = out6 ^ in2 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_06(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in6 ^ in7; +        tmp0 = in1 ^ in6; +        out1 = in0 ^ in7; +        out7 = in5 ^ in6; +        out6 = in4 ^ in5; +        out4 = in2 ^ in3 ^ in6; +        out5 = in3 ^ in4 ^ in7; +        out3 = tmp0 ^ in2; +        out2 = tmp0 ^ out1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 
^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_07(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in6; +        tmp1 = in5 ^ in6; +        tmp2 = in0 ^ in7; +        tmp3 = tmp0 ^ in3; +        out6 = tmp1 ^ in4; +        out7 = tmp1 ^ in7; +        out0 = tmp2 ^ in6; +        out1 = tmp2 ^ in1; +        out3 = tmp3 ^ in1; +        out4 = tmp3 ^ in4; +        out5 = out4 ^ out7 ^ in2; +        out2 = tmp0 ^ out1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_08(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; 
+        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in5; +        out1 = in6; +        out7 = in4; +        out6 = in3 ^ in7; +        out3 = in0 ^ in5 ^ in6; +        out5 = in2 ^ in6 ^ in7; +        out2 = in5 ^ in7; +        out4 = out2 ^ in1 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_09(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in0 ^ in5; +        tmp0 = in3 ^ in6; +        out1 = in1 ^ in6; +        out7 = in4 ^ in7; +        out2 = in2 ^ in5 ^ in7; +        out3 = tmp0 ^ out0; +        out6 = tmp0 ^ in7; +        out4 = out1 ^ out7 ^ in5; +        out5 = out2 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        
out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_0A(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in5 ^ in7; +        out1 = in0 ^ in6; +        out7 = in4 ^ in6; +        out2 = in1 ^ in5; +        out6 = out0 ^ in3; +        out3 = out0 ^ out1 ^ in2; +        out5 = out7 ^ in2 ^ in7; +        out4 = out2 ^ in3 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_0B(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +     
   uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in5; +        tmp1 = in0 ^ in6; +        tmp2 = in4 ^ in7; +        out0 = in0 ^ in5 ^ in7; +        out2 = tmp0 ^ in1; +        out1 = tmp1 ^ in1; +        out6 = tmp1 ^ out0 ^ in3; +        out7 = tmp2 ^ in6; +        out4 = tmp2 ^ out6 ^ in1; +        out3 = out6 ^ in0 ^ in2; +        out5 = tmp0 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_0C(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in5 ^ in6; +        out1 = in6 ^ in7; +        out7 = in4 ^ in5; +        tmp0 = in1 ^ in5; +        tmp1 = in0 ^ in7; +        out5 = in2 ^ in3 ^ in6; +        out6 = in3 ^ in4 ^ in7; +        out2 = tmp1 
^ out0; +        out4 = tmp0 ^ in2; +        out3 = tmp0 ^ tmp1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_0D(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in4 ^ in5; +        tmp1 = in5 ^ in6; +        out1 = in1 ^ in6 ^ in7; +        out7 = tmp0 ^ in7; +        out4 = tmp0 ^ in1 ^ in2; +        out0 = tmp1 ^ in0; +        tmp2 = tmp1 ^ in3; +        out6 = tmp2 ^ out7; +        out2 = out0 ^ in2 ^ in7; +        out3 = out0 ^ out1 ^ in3; +        out5 = tmp2 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static 
void gf8_muladd_0E(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in1; +        tmp1 = in2 ^ in5; +        tmp2 = in5 ^ in6; +        out1 = in0 ^ in6 ^ in7; +        out3 = tmp0 ^ tmp1; +        out2 = tmp0 ^ tmp2; +        tmp3 = tmp1 ^ in3; +        out7 = tmp2 ^ in4; +        out0 = tmp2 ^ in7; +        out4 = tmp3 ^ in1 ^ in7; +        out5 = tmp3 ^ out7; +        out6 = out0 ^ out5 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_0F(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 
= out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in6 ^ in7; +        tmp1 = tmp0 ^ in1; +        tmp2 = tmp0 ^ in5; +        out1 = tmp1 ^ in0; +        out7 = tmp2 ^ in4; +        out0 = tmp2 ^ in0; +        out6 = out7 ^ in3; +        out5 = out6 ^ in2 ^ in7; +        tmp3 = tmp1 ^ out0 ^ in2; +        out4 = tmp1 ^ out5; +        out2 = tmp3 ^ in6; +        out3 = tmp3 ^ in3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_10(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in4; +        out1 = in5; +        out7 = in3 ^ in7; +        tmp0 = in6 ^ in7; +        out2 = in4 ^ in6; +        tmp1 = out2 ^ in5; +        out6 = tmp0 ^ in2; +        out3 = tmp0 ^ tmp1; +        out5 = out2 ^ out3 ^ in1; +        out4 = tmp1 ^ in0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        
out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_11(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out7 = in3; +        out0 = in0 ^ in4; +        out1 = in1 ^ in5; +        out6 = in2 ^ in7; +        out4 = in0 ^ in5 ^ in6; +        out5 = in1 ^ in6 ^ in7; +        out2 = in2 ^ in4 ^ in6; +        out3 = in3 ^ in4 ^ in5 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_12(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = 
out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in4 ^ in7; +        out1 = in0 ^ in5; +        out3 = in2 ^ in4 ^ in5; +        tmp0 = out0 ^ in6; +        out2 = tmp0 ^ in1; +        tmp1 = tmp0 ^ in3; +        out6 = tmp0 ^ out3; +        out5 = out2 ^ in5; +        out7 = tmp1 ^ in4; +        out4 = tmp1 ^ out1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_13(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out7 = in3 ^ in6; +        tmp0 = in0 ^ in5; +        tmp1 = in4 ^ in7; +        out6 = in2 ^ in5 ^ in7; +        out4 = tmp0 ^ out7 ^ in7; +        out1 = tmp0 ^ in1; +        out0 = tmp1 ^ in0; +        out5 = tmp1 ^ in1 ^ in6; +        out3 = tmp1 ^ out6 ^ in3; +        out2 = out5 ^ in2; + +        
out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_14(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in4 ^ in6; +        out1 = in5 ^ in7; +        out2 = in0 ^ in4; +        tmp0 = out0 ^ in5; +        out7 = out1 ^ in3; +        tmp1 = out1 ^ in2; +        out3 = tmp0 ^ in1; +        out6 = tmp0 ^ tmp1; +        out4 = tmp1 ^ out2; +        out5 = out3 ^ in3 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_15(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = 
(uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out7 = in3 ^ in5; +        tmp0 = in0 ^ in4; +        out1 = in1 ^ in5 ^ in7; +        out5 = in1 ^ in3 ^ in6; +        out0 = tmp0 ^ in6; +        out2 = tmp0 ^ in2; +        out3 = out5 ^ in4 ^ in5; +        out6 = out2 ^ in0 ^ in7; +        out4 = tmp0 ^ out6 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_16(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in5; +        tmp1 = in4 ^ in7; +        tmp2 = in2 ^ in3 ^ in4; +        out1 = tmp0 ^ 
in7; +        out4 = tmp0 ^ tmp2; +        out0 = tmp1 ^ in6; +        tmp3 = tmp1 ^ in1; +        out6 = out0 ^ in2 ^ in5; +        out2 = tmp3 ^ in0; +        out3 = out6 ^ in1; +        out7 = tmp2 ^ out6; +        out5 = tmp3 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_17(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in5; +        tmp1 = in3 ^ in6; +        tmp2 = tmp0 ^ in4; +        out4 = tmp0 ^ in0 ^ in3; +        out7 = tmp1 ^ in5; +        tmp3 = tmp1 ^ in1; +        out6 = tmp2 ^ in7; +        out5 = tmp3 ^ in4; +        out3 = tmp3 ^ out6; +        out0 = out3 ^ out4 ^ in1; +        out2 = out3 ^ out7 ^ in0; +        out1 = tmp2 ^ out2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 
5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_18(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in4 ^ in5; +        out1 = in5 ^ in6; +        tmp0 = in4 ^ in7; +        out5 = in1 ^ in2 ^ in5; +        out6 = in2 ^ in3 ^ in6; +        out2 = tmp0 ^ out1; +        out7 = tmp0 ^ in3; +        tmp1 = tmp0 ^ in0; +        out3 = tmp1 ^ in6; +        out4 = tmp1 ^ in1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_19(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 
2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out5 = in1 ^ in2; +        out7 = in3 ^ in4; +        tmp0 = in0 ^ in7; +        out6 = in2 ^ in3; +        out1 = in1 ^ in5 ^ in6; +        out0 = in0 ^ in4 ^ in5; +        out4 = tmp0 ^ in1; +        tmp1 = tmp0 ^ in6; +        out2 = tmp1 ^ out0 ^ in2; +        out3 = tmp1 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_1A(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in4 ^ in5; +        tmp1 = in5 ^ in6; +        tmp2 = tmp0 ^ in1; +        out0 = tmp0 ^ in7; +        out1 = tmp1 ^ in0; +        tmp3 = tmp1 ^ in3; +        out5 = tmp2 ^ in2; +        out2 = tmp2 ^ in6; +        out7 = tmp3 ^ out0; +        out6 = tmp3 ^ in2; +        out4 = tmp3 ^ out2 ^ in0; +        out3 = tmp0 ^ out1 ^ in2; + +      
  out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_1B(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in4; +        tmp1 = in2 ^ in5; +        tmp2 = in3 ^ in6; +        out5 = tmp0 ^ in1; +        tmp3 = tmp0 ^ in0; +        out6 = tmp1 ^ in3; +        out0 = tmp1 ^ tmp3 ^ in7; +        out7 = tmp2 ^ in4; +        tmp4 = out5 ^ in6; +        out3 = tmp2 ^ tmp3; +        out2 = tmp4 ^ in5; +        out4 = tmp4 ^ out3; +        out1 = tmp3 ^ out2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void 
gf8_muladd_1C(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in3; +        tmp1 = in4 ^ in6; +        tmp2 = in5 ^ in7; +        out6 = tmp0 ^ tmp1; +        out0 = tmp1 ^ in5; +        out1 = tmp2 ^ in6; +        tmp3 = tmp2 ^ in1; +        tmp4 = tmp2 ^ in4; +        out2 = tmp4 ^ in0; +        out7 = tmp4 ^ in3; +        out5 = tmp0 ^ tmp3; +        out3 = tmp3 ^ out2; +        out4 = out3 ^ in2 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_1D(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; 
+        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in3; +        tmp1 = in0 ^ in4; +        tmp2 = in3 ^ in4; +        tmp3 = in2 ^ in7; +        out3 = tmp0 ^ tmp1; +        out5 = tmp0 ^ tmp3; +        tmp4 = tmp1 ^ in5; +        out6 = tmp2 ^ in2; +        out7 = tmp2 ^ in5; +        out2 = tmp3 ^ tmp4; +        out4 = out3 ^ out6 ^ in6; +        out0 = tmp4 ^ in6; +        out1 = out2 ^ out4 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_1E(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in4; +        tmp1 = in2 ^ in7; +        tmp2 = tmp0 ^ in1; +        out3 = tmp1 ^ tmp2; +        out2 = tmp2 ^ in5; +        out4 = out3 ^ in3 ^ in6; +        tmp3 = out4 ^ in7; +        out6 = tmp3 ^ out2 ^ in4; +        out7 = tmp1 ^ out6; +        out0 = out7 ^ in3; +        out1 = tmp0 ^ out0; +        out5 = tmp3 ^ out1; + +        
out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_1F(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in4 ^ in6; +        tmp1 = tmp0 ^ in5; +        out7 = tmp1 ^ in3; +        out0 = tmp1 ^ in0 ^ in7; +        out6 = out7 ^ in2 ^ in6; +        out1 = out0 ^ in1 ^ in4; +        out4 = out0 ^ out6 ^ in1; +        out3 = tmp0 ^ out4; +        out2 = out4 ^ out7 ^ in7; +        out5 = out3 ^ in0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_20(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t 
*)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out1 = in4; +        out0 = in3 ^ in7; +        tmp0 = in3 ^ in4; +        tmp1 = in6 ^ in7; +        out2 = out0 ^ in5; +        out4 = tmp0 ^ in5; +        out3 = tmp0 ^ tmp1; +        out7 = tmp1 ^ in2; +        out6 = tmp1 ^ in1 ^ in5; +        out5 = out2 ^ out3 ^ in0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_21(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out1 = in1 ^ in4; +        tmp0 = in4 ^ in6; +        out4 = in3 ^ in5; +     
   out7 = in2 ^ in6; +        out0 = in0 ^ in3 ^ in7; +        out6 = in1 ^ in5 ^ in7; +        out3 = tmp0 ^ in7; +        out5 = tmp0 ^ in0; +        out2 = out4 ^ in2 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_22(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in3; +        out1 = in0 ^ in4; +        out7 = in2 ^ in7; +        out4 = in4 ^ in5 ^ in7; +        out5 = in0 ^ in5 ^ in6; +        out6 = in1 ^ in6 ^ in7; +        out3 = in2 ^ in3 ^ in4 ^ in6; +        out2 = in1 ^ in3 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void 
gf8_muladd_23(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out7 = in2; +        out0 = in0 ^ in3; +        out4 = in5 ^ in7; +        out5 = in0 ^ in6; +        out6 = in1 ^ in7; +        out3 = in2 ^ in4 ^ in6; +        out1 = in0 ^ in1 ^ in4; +        out2 = out4 ^ out6 ^ in2 ^ in3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_24(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out1 = in4 ^ in7; +        tmp0 = in3 ^ in4; +        
out0 = in3 ^ in6 ^ in7; +        out3 = tmp0 ^ in1; +        tmp1 = out0 ^ in5; +        out6 = tmp1 ^ out3; +        out2 = tmp1 ^ in0; +        out7 = tmp1 ^ in2 ^ in3; +        out5 = out2 ^ in4; +        out4 = tmp0 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_25(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in1 ^ in4; +        tmp0 = in2 ^ in5; +        out1 = out3 ^ in7; +        out7 = tmp0 ^ in6; +        out6 = out1 ^ in5; +        out4 = out7 ^ in3 ^ in7; +        out2 = out4 ^ in0; +        out0 = tmp0 ^ out2; +        out5 = out0 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ 
in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_26(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in3 ^ in6; +        tmp0 = in4 ^ in7; +        out7 = in2 ^ in5 ^ in7; +        tmp1 = out0 ^ in0 ^ in5; +        out1 = tmp0 ^ in0; +        tmp2 = tmp0 ^ in6; +        out2 = tmp1 ^ in1; +        out5 = tmp1 ^ in7; +        out6 = tmp2 ^ in1; +        out4 = tmp2 ^ out7; +        out3 = out0 ^ out6 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_27(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t 
in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out7 = in2 ^ in5; +        out0 = in0 ^ in3 ^ in6; +        out6 = in1 ^ in4 ^ in7; +        out4 = out7 ^ in6; +        out2 = out0 ^ out7 ^ in1; +        out5 = out0 ^ in7; +        out1 = out6 ^ in0; +        out3 = out6 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_28(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in3; +        out1 = in4 ^ in6; +        out0 = in3 ^ in5 ^ in7; +        tmp0 = out1 ^ in7; +        tmp1 = out0 ^ in4; +        out7 = tmp0 ^ in2; +        tmp2 = tmp0 ^ in1; +        out3 = tmp1 ^ in0; +        out6 = tmp1 ^ tmp2; +        out4 = tmp2 ^ in3; +        out5 = out3 ^ in2 ^ in3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        
out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_29(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in2 ^ in3; +        tmp0 = in1 ^ in3; +        tmp1 = in4 ^ in6; +        tmp2 = in0 ^ in4 ^ in7; +        out6 = tmp0 ^ in5; +        out4 = tmp0 ^ in6 ^ in7; +        out1 = tmp1 ^ in1; +        out7 = tmp1 ^ in2; +        out3 = tmp2 ^ in5; +        out5 = tmp2 ^ in2; +        out0 = out3 ^ in3 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_2A(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +    
    uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in3 ^ in5; +        tmp0 = in1 ^ in3; +        tmp1 = in0 ^ in4; +        out7 = in2 ^ in4 ^ in7; +        out3 = tmp1 ^ out0 ^ in2; +        out2 = tmp0 ^ in7; +        out6 = tmp0 ^ in6; +        out1 = tmp1 ^ in6; +        out5 = tmp1 ^ out7 ^ in5; +        out4 = out1 ^ in0 ^ in1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_2B(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in1 ^ in6; +        out7 = in2 ^ in4; +        tmp0 = in0 ^ in5; +        tmp1 = in2 ^ in7; +        out6 = in1 ^ in3; +        out1 = out4 ^ in0 ^ in4; +        out3 = tmp0 ^ out7; +        out0 = tmp0 ^ in3; +        out5 = tmp1 ^ 
in0; +        out2 = tmp1 ^ out6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_2C(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in5; +        tmp1 = in2 ^ in3 ^ in4; +        tmp2 = tmp0 ^ in6; +        out4 = tmp1 ^ in1; +        out5 = tmp1 ^ in0 ^ in5; +        tmp3 = tmp2 ^ in4; +        out6 = tmp2 ^ out4; +        out7 = tmp3 ^ in7; +        out2 = tmp3 ^ out5; +        out3 = out6 ^ in0; +        out0 = tmp1 ^ out7; +        out1 = tmp0 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void 
gf8_muladd_2D(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in3; +        out4 = tmp0 ^ in1; +        tmp1 = tmp0 ^ in0; +        out2 = tmp1 ^ in6; +        out5 = tmp1 ^ in4; +        tmp2 = out2 ^ in2; +        tmp3 = tmp2 ^ in5; +        out0 = tmp3 ^ in7; +        out7 = tmp3 ^ out5; +        out6 = out4 ^ out7 ^ in6; +        out3 = tmp2 ^ out6; +        out1 = out0 ^ out6 ^ in0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_2E(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = 
out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in4 ^ in7; +        out0 = in3 ^ in5 ^ in6; +        tmp1 = tmp0 ^ in0; +        tmp2 = tmp0 ^ in2; +        out1 = tmp1 ^ in6; +        out4 = tmp2 ^ in1; +        out7 = tmp2 ^ in5; +        out3 = out0 ^ out4 ^ in0; +        out2 = out3 ^ out7 ^ in7; +        out6 = tmp1 ^ out2; +        out5 = tmp1 ^ out7 ^ in3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_2F(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in3; +        tmp1 = in2 ^ in5; +        out4 = in1 ^ in2 ^ in7; +        out6 = in1 ^ in3 ^ in4; +        out5 = tmp0 ^ in2; +        tmp2 = tmp0 ^ in6; +        out7 = tmp1 ^ in4; +        out0 = tmp2 ^ in5; +        out2 = tmp2 ^ out4; +        out1 = tmp2 ^ out6 ^ in7; +        out3 = tmp1 ^ out1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 
2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_30(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out1 = in4 ^ in5; +        tmp0 = in3 ^ in6; +        tmp1 = in4 ^ in7; +        out6 = in1 ^ in2 ^ in5; +        out3 = tmp0 ^ in5; +        out4 = tmp0 ^ in0; +        out7 = tmp0 ^ in2; +        out0 = tmp1 ^ in3; +        out2 = tmp1 ^ out3; +        out5 = tmp1 ^ in0 ^ in1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_31(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, 
out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in5 ^ in6; +        tmp0 = in4 ^ in5; +        tmp1 = in0 ^ in3 ^ in4; +        tmp2 = out3 ^ in2; +        out1 = tmp0 ^ in1; +        out0 = tmp1 ^ in7; +        out4 = tmp1 ^ in6; +        out6 = tmp2 ^ in1; +        out2 = tmp2 ^ out0 ^ in0; +        out5 = out1 ^ in0 ^ in7; +        out7 = tmp0 ^ out2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_32(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in3 ^ in4; +        out7 = in2 ^ in3; +        tmp0 = in5 ^ in6; +        tmp1 = in0 ^ in7; +        out6 = in1 ^ in2; +        out1 = in0 ^ in4 
^ in5; +        out2 = tmp0 ^ out0 ^ in1; +        out3 = tmp0 ^ out7 ^ in7; +        out4 = tmp1 ^ in6; +        out5 = tmp1 ^ in1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_33(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in3; +        tmp1 = in0 ^ in4; +        tmp2 = in1 ^ in5; +        out6 = in1 ^ in2 ^ in6; +        out7 = tmp0 ^ in7; +        out0 = tmp1 ^ in3; +        out1 = tmp1 ^ tmp2; +        tmp3 = tmp2 ^ in7; +        tmp4 = tmp2 ^ in4 ^ in6; +        out5 = tmp3 ^ in0; +        out3 = tmp3 ^ out6; +        out4 = tmp4 ^ out5; +        out2 = tmp0 ^ tmp4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ 
in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_34(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in3 ^ in4; +        tmp1 = in4 ^ in5; +        tmp2 = tmp0 ^ in1; +        tmp3 = tmp0 ^ in6; +        out1 = tmp1 ^ in7; +        tmp4 = tmp1 ^ in2; +        out5 = tmp2 ^ in0; +        out3 = tmp2 ^ out1; +        out0 = tmp3 ^ in7; +        out7 = tmp3 ^ tmp4; +        out6 = tmp4 ^ in1; +        out2 = out3 ^ out5 ^ in3; +        out4 = tmp4 ^ out2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_35(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; 
+        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in6; +        tmp1 = in5 ^ in7; +        out7 = tmp0 ^ tmp1 ^ in3; +        out3 = tmp1 ^ in1; +        out1 = out3 ^ in4; +        tmp2 = out1 ^ in7; +        out5 = tmp2 ^ in0 ^ in3; +        out6 = tmp0 ^ tmp2; +        out0 = out3 ^ out5 ^ in6; +        out4 = tmp0 ^ out0; +        out2 = out4 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_36(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in0 ^ in2; +        tmp0 = in1 ^ in3; +        out0 = in3 ^ in4 ^ in6; +        out6 = in1 ^ in2 ^ in4; +        out5 = tmp0 ^ in0; +        tmp1 = out5 ^ in5; +        out2 = tmp1 ^ in4; +        out3 = tmp1 ^ out4; +        out1 = tmp0 ^ out2 ^ in7; +        out7 = out3 ^ in1; 
+ +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_37(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in2; +        tmp1 = in2 ^ in4; +        tmp2 = tmp0 ^ in6; +        out3 = tmp0 ^ in5; +        out4 = tmp1 ^ in0; +        out6 = tmp2 ^ in4; +        out1 = out3 ^ out4 ^ in7; +        tmp3 = out4 ^ in1 ^ in3; +        out7 = tmp3 ^ out1; +        out2 = tmp3 ^ in5; +        out5 = tmp1 ^ out2; +        out0 = tmp2 ^ tmp3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_38(void *out, void *in) 
+{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in0 ^ in3; +        tmp0 = in3 ^ in4; +        tmp1 = in5 ^ in7; +        tmp2 = out3 ^ in1; +        out2 = tmp0 ^ in6; +        out0 = tmp0 ^ tmp1; +        out4 = tmp1 ^ tmp2; +        out7 = out2 ^ in2; +        out1 = out2 ^ in3 ^ in5; +        out6 = out4 ^ in0 ^ in2; +        out5 = tmp2 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_39(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t 
in7 = out_ptr[WIDTH * 7]; + +        out3 = in0; +        tmp0 = in1 ^ in5; +        tmp1 = tmp0 ^ in4; +        out1 = tmp1 ^ in6; +        out5 = out1 ^ in0 ^ in2; +        tmp2 = tmp0 ^ out5; +        out2 = tmp2 ^ in0 ^ in3; +        out7 = out2 ^ in7; +        out6 = tmp1 ^ out7; +        out4 = tmp2 ^ out6; +        out0 = out4 ^ in1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_3A(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in1; +        tmp1 = in0 ^ in2; +        tmp2 = in3 ^ in4; +        tmp3 = in1 ^ in6; +        tmp4 = in3 ^ in7; +        out4 = tmp0 ^ in5; +        out5 = tmp1 ^ tmp3; +        out3 = tmp1 ^ tmp4; +        out0 = tmp2 ^ in5; +        out7 = tmp2 ^ in2; +        tmp5 = tmp3 ^ in4; +        out2 = tmp4 ^ tmp5; +        out1 = tmp5 ^ out4; +        out6 = tmp0 ^ out3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ 
in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_3B(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in6; +        tmp1 = in2 ^ in7; +        tmp2 = tmp0 ^ in3; +        out3 = tmp1 ^ in0; +        out6 = tmp1 ^ tmp2; +        out2 = out6 ^ in4; +        out7 = tmp0 ^ out2; +        out0 = out3 ^ out7 ^ in5; +        out5 = out0 ^ out2 ^ in7; +        out1 = tmp2 ^ out0; +        out4 = out1 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_3C(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t 
out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in3; +        tmp1 = in2 ^ in7; +        tmp2 = in1 ^ in6 ^ in7; +        out2 = tmp0 ^ in4; +        out3 = tmp0 ^ tmp2; +        out4 = tmp1 ^ out3 ^ in5; +        out5 = tmp2 ^ out2 ^ in2; +        out1 = out4 ^ out5 ^ in6; +        out0 = out1 ^ in3; +        out7 = tmp1 ^ out0; +        out6 = tmp2 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_3D(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in2; +        tmp1 = tmp0 ^ in3; +        out2 = tmp1 ^ in4; +        tmp2 = out2 ^ in5; +        
out4 = tmp2 ^ in1 ^ in6; +        out5 = out4 ^ in7; +        out6 = out5 ^ in0; +        out7 = out6 ^ in1; +        out0 = tmp0 ^ out7; +        out1 = tmp1 ^ out5; +        out3 = tmp2 ^ out6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_3E(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in3 ^ in5; +        tmp1 = tmp0 ^ in4; +        out0 = tmp1 ^ in6; +        out7 = tmp1 ^ in2; +        out6 = out7 ^ in1 ^ in5 ^ in7; +        out2 = out6 ^ in0 ^ in2; +        out4 = out0 ^ out6 ^ in0; +        out5 = tmp0 ^ out4; +        out3 = out5 ^ in7; +        out1 = out3 ^ out6 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        
out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_3F(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in1; +        out3 = tmp0 ^ in2 ^ in6; +        tmp1 = out3 ^ in5 ^ in7; +        out4 = tmp1 ^ in4; +        out5 = tmp1 ^ in3; +        out1 = out4 ^ in2; +        out7 = out1 ^ out3 ^ in3; +        out2 = tmp0 ^ out7 ^ in5; +        tmp2 = out2 ^ in0; +        out6 = tmp2 ^ in6; +        out0 = tmp1 ^ tmp2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_40(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 
3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out1 = in3 ^ in7; +        tmp0 = in3 ^ in4; +        tmp1 = in6 ^ in7; +        out4 = tmp0 ^ in2; +        out5 = tmp0 ^ in5; +        out0 = tmp1 ^ in2; +        out7 = tmp1 ^ in1 ^ in5; +        out2 = out0 ^ in4; +        out3 = out2 ^ out5 ^ in7; +        out6 = out3 ^ out4 ^ in0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_41(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in2 ^ in3; +        tmp0 = in5 ^ in6; +        tmp1 = in6 ^ in7; +        out5 = in3 ^ in4; +        out1 = in1 ^ in3 ^ in7; +        out6 = in0 ^ in4 ^ in5; +        out3 = tmp0 ^ in2; +        out7 = tmp0 ^ in1; +        out2 = tmp1 ^ in4; +        out0 = tmp1 ^ in0 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] 
= out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_42(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in2 ^ in6; +        out5 = in3 ^ in5; +        out1 = in0 ^ in3 ^ in7; +        out7 = in1 ^ in5 ^ in7; +        out4 = in2 ^ in4 ^ in7; +        out6 = in0 ^ in4 ^ in6; +        out2 = out0 ^ in1 ^ in4; +        out3 = out5 ^ in6 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_43(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        
uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out5 = in3; +        out7 = in1 ^ in5; +        out4 = in2 ^ in7; +        out6 = in0 ^ in4; +        out0 = in0 ^ in2 ^ in6; +        out3 = in5 ^ in6 ^ in7; +        out2 = in1 ^ in4 ^ in6; +        out1 = in0 ^ in1 ^ in3 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_44(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out1 = in3; +        out0 = in2 ^ in7; +        tmp0 = in4 ^ in7; +        out7 = in1 ^ in6 ^ in7; +        out6 = in0 ^ in5 ^ in6; +        out4 = tmp0 ^ in3 ^ in6; +        out3 = out0 ^ in1 ^ in3 ^ in5; +        out2 = out0 ^ in0 ^ in4; +        out5 = tmp0 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ 
in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_45(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out1 = in1 ^ in3; +        out7 = in1 ^ in6; +        out5 = in4 ^ in7; +        out6 = in0 ^ in5; +        out0 = in0 ^ in2 ^ in7; +        out4 = in3 ^ in6 ^ in7; +        out2 = out5 ^ in0; +        out3 = out0 ^ out6 ^ in1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_46(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t 
in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in2; +        out1 = in0 ^ in3; +        out7 = in1 ^ in7; +        out4 = in4 ^ in6; +        out5 = in5 ^ in7; +        out6 = in0 ^ in6; +        out3 = in1 ^ in3 ^ in5; +        out2 = out4 ^ out6 ^ in1 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_47(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in6; +        out7 = in1; +        out5 = in7; +        out6 = in0; +        tmp0 = in0 ^ in1; +        out3 = in1 ^ in5; +        out0 = in0 ^ in2; +        out1 = tmp0 ^ in3; +        out2 = tmp0 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = 
out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_48(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in3; +        out1 = in3 ^ in6 ^ in7; +        out3 = tmp0 ^ in0; +        out0 = tmp0 ^ out1 ^ in5; +        tmp1 = out0 ^ in4; +        out2 = tmp1 ^ in7; +        out5 = tmp1 ^ in3; +        out4 = out5 ^ in1; +        out7 = tmp0 ^ out4; +        out6 = tmp1 ^ out3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_49(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, 
out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in0 ^ in2; +        tmp0 = in2 ^ in5; +        out2 = in4 ^ in5 ^ in6; +        tmp1 = tmp0 ^ out2 ^ in3; +        out7 = out2 ^ in1; +        out5 = tmp1 ^ in7; +        out4 = out5 ^ out7 ^ in6; +        out1 = tmp0 ^ out4; +        out6 = out1 ^ out7 ^ in0; +        out0 = tmp1 ^ out6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_4A(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in6; +        tmp1 = in3 ^ in7; +        out0 = tmp0 ^ in5; +        out3 = tmp1 ^ in0; +        out5 = tmp1 ^ out0; +        out4 = out0 ^ in1 ^ in4; +        
out1 = out3 ^ in6; +        out2 = out4 ^ in7; +        out6 = out1 ^ in4; +        out7 = tmp0 ^ out2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_4B(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in0 ^ in7; +        tmp0 = in1 ^ in5; +        tmp1 = in2 ^ in6; +        tmp2 = out3 ^ in3; +        out7 = tmp0 ^ in4; +        out4 = tmp0 ^ tmp1; +        tmp3 = tmp1 ^ in0; +        out6 = tmp2 ^ in4; +        out5 = tmp2 ^ tmp3; +        out1 = tmp2 ^ in1 ^ in6; +        out2 = out7 ^ in6 ^ in7; +        out0 = tmp3 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; 
+ +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_4C(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out1 = in3 ^ in6; +        tmp0 = in2 ^ in5; +        tmp1 = out1 ^ in5 ^ in7; +        out0 = tmp0 ^ in7; +        tmp2 = tmp0 ^ in4; +        out6 = tmp1 ^ in0; +        out2 = tmp2 ^ in0; +        out5 = tmp2 ^ in6; +        out3 = tmp0 ^ out6 ^ in1; +        out7 = out0 ^ out5 ^ in1; +        out4 = tmp1 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_4D(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; 
+        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in5; +        tmp1 = in1 ^ in6; +        out4 = in1 ^ in3 ^ in5; +        tmp2 = tmp0 ^ in7; +        out2 = tmp0 ^ in4; +        out1 = tmp1 ^ in3; +        out7 = tmp1 ^ in4; +        out0 = tmp2 ^ in2; +        out6 = tmp2 ^ in3; +        out5 = out7 ^ in1 ^ in2; +        out3 = tmp1 ^ out0 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_4E(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in2 ^ in5; +        out7 = in1 ^ in4 ^ in7; +        out1 = in0 ^ in3 ^ in6; +        out5 = out0 ^ in6; +        out4 = out7 ^ in5; +        out3 = out1 ^ in1; +        out6 = out1 ^ in7; +        out2 = out4 ^ in0 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        
out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_4F(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out5 = in2 ^ in6; +        out7 = in1 ^ in4; +        out3 = in0 ^ in1 ^ in6; +        out4 = in1 ^ in5 ^ in7; +        out0 = in0 ^ in2 ^ in5; +        out6 = in0 ^ in3 ^ in7; +        out1 = out3 ^ in3; +        out2 = out4 ^ in0 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_50(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = 
out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in2 ^ in7; +        tmp0 = in3 ^ in5; +        out0 = out2 ^ in4 ^ in6; +        out1 = tmp0 ^ in7; +        tmp1 = tmp0 ^ in6; +        out3 = out0 ^ in3; +        out7 = tmp1 ^ in1; +        tmp2 = tmp1 ^ in0; +        out5 = out3 ^ in1 ^ in2; +        out4 = tmp2 ^ in2; +        out6 = tmp2 ^ out3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_51(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in7; +        out3 = in2 ^ in4 ^ in6 ^ in7; +        out0 = out3 ^ in0; +        out6 = out0 ^ in5; +        out4 = out6 ^ in3 ^ in7; +        out1 = out0 ^ out4 ^ in1; +        out7 = out1 ^ in6; +        out5 = out7 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] 
= out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_52(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in1 ^ in2; +        tmp0 = in2 ^ in4; +        tmp1 = in3 ^ in5; +        tmp2 = in3 ^ in6; +        tmp3 = in0 ^ in7; +        out0 = tmp0 ^ in6; +        out6 = tmp0 ^ tmp3; +        out7 = tmp1 ^ in1; +        out1 = tmp1 ^ tmp3; +        out3 = tmp2 ^ in4; +        out5 = tmp2 ^ in1 ^ in7; +        out4 = tmp2 ^ out1 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_53(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 
0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in1; +        out3 = in4 ^ in6; +        out0 = out3 ^ in0 ^ in2; +        out6 = out0 ^ in7; +        out4 = out6 ^ in5; +        out7 = out0 ^ out4 ^ in1 ^ in3; +        out1 = out7 ^ in0; +        out5 = out7 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_54(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out1 = in3 ^ in5; +        tmp0 = in1 ^ in3; +        tmp1 = in2 ^ in4; +        tmp2 = in0 ^ in7; +        out5 = in1 ^ in4 ^ in6; +        out4 = tmp2 ^ out1; +        out7 = tmp0 ^ in6; +        out3 
= tmp0 ^ tmp1; +        out0 = tmp1 ^ in7; +        tmp3 = tmp2 ^ in2; +        out2 = tmp3 ^ in6; +        out6 = tmp3 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_55(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in3; +        tmp1 = in1 ^ in4; +        tmp2 = in6 ^ in7; +        out7 = tmp0 ^ tmp2; +        out1 = tmp0 ^ in5; +        out3 = tmp1 ^ in2; +        out5 = tmp1 ^ in5 ^ in6; +        out2 = tmp2 ^ in0; +        out4 = out5 ^ out7 ^ in0; +        out6 = out2 ^ in2 ^ in5; +        out0 = out5 ^ out6 ^ in1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 
7]; + +        in_ptr++; +        out_ptr++; +    } +} +/* gf8_muladd_56(out, in): runs WIDTH passes, advancing both pointers by one uint64_t per pass. Each pass reads eight words in0..in7 from out at offsets 0, WIDTH, ..., WIDTH * 7, builds out0..out7 as fixed XOR combinations of them, XORs each outK with the word at the same offset of in and stores the result back at that offset of out. 8 * WIDTH words of each buffer are touched and only out is written. in0 reaches exactly out1, out2, out4 and out6, the bits set in 0x56, which is consistent with out = (0x56 * out) XOR in over GF(2^8) on bit-sliced data. NOTE(review): the field polynomial is not defined in this function; confirm against the generator. Assumes out and in are valid, uint64_t-aligned buffers; TODO confirm with callers. */ +static void gf8_muladd_56(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in2 ^ in4; +        tmp0 = in0 ^ in2; +        out4 = in0 ^ in5; +        out7 = in1 ^ in3; +        out5 = in1 ^ in6; +        out6 = tmp0 ^ in7; +        out2 = tmp0 ^ out5; +        out1 = out4 ^ in3; +        out3 = out7 ^ in4 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_57(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; 
+        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in5; +        tmp1 = in1 ^ in7; +        out0 = in0 ^ in2 ^ in4; +        out5 = in1 ^ in5 ^ in6; +        out4 = tmp0 ^ in4; +        out1 = tmp0 ^ in1 ^ in3; +        out2 = tmp0 ^ out5; +        out3 = tmp1 ^ in4; +        out7 = tmp1 ^ in3; +        out6 = tmp1 ^ out2 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_58(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in2 ^ in5; +        tmp0 = in2 ^ in3 ^ in4; +        out5 = tmp0 ^ in1; +        out6 = tmp0 ^ in0 ^ in5; +        out3 = out6 ^ in7; +        tmp1 = out2 ^ out5; +        out7 = tmp1 ^ in6; +        out4 = tmp1 ^ out3 ^ in3; +        out0 = out4 ^ out7 ^ in0; +        out1 = tmp0 ^ out0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = 
out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} +/* gf8_muladd_59(out, in): runs WIDTH passes, advancing both pointers by one uint64_t per pass. Each pass reads eight words in0..in7 from out at offsets 0, WIDTH, ..., WIDTH * 7, builds out0..out7 as fixed XOR combinations of them (out2 is a plain copy of in5), XORs each outK with the word at the same offset of in and stores the result back at that offset of out. 8 * WIDTH words of each buffer are touched and only out is written. in0 reaches exactly out0, out3, out4 and out6, the bits set in 0x59, which is consistent with out = (0x59 * out) XOR in over GF(2^8) on bit-sliced data. NOTE(review): the field polynomial is not defined in this function; confirm against the generator. Assumes out and in are valid, uint64_t-aligned buffers; TODO confirm with callers. */ +static void gf8_muladd_59(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in5; +        tmp0 = in0 ^ in5 ^ in7; +        out3 = tmp0 ^ in2 ^ in4; +        out0 = out3 ^ in6; +        tmp1 = out0 ^ in7; +        out6 = tmp1 ^ in3; +        out5 = out6 ^ in0 ^ in1 ^ in6; +        out4 = tmp0 ^ out5; +        out1 = tmp1 ^ out4; +        out7 = out1 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_5A(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        
uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in2; +        tmp1 = in2 ^ in5; +        out5 = tmp0 ^ in3; +        out4 = tmp0 ^ in0; +        tmp2 = tmp1 ^ in4; +        out2 = tmp1 ^ in1 ^ in7; +        out7 = tmp2 ^ out5; +        out6 = out4 ^ out7 ^ in5; +        out0 = tmp2 ^ in6; +        out1 = out0 ^ out6 ^ in7; +        out3 = tmp1 ^ out6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_5B(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in3; +        tmp1 = in0 ^ in4; +        tmp2 = in1 ^ in5; +        out5 = tmp0 ^ tmp2; +        tmp3 = tmp1 ^ in6; +        out3 = tmp1 ^ in5; +        out2 = tmp2 ^ in7; +        tmp4 = out3 ^ in2; +        out7 = out2 ^ 
in3 ^ in4; +        out0 = tmp4 ^ in6; +        out6 = tmp0 ^ tmp3; +        out4 = tmp2 ^ tmp4; +        out1 = tmp3 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_5C(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in3 ^ in6; +        tmp1 = in0 ^ in2 ^ in5; +        out1 = tmp0 ^ in5; +        tmp2 = tmp0 ^ in1; +        out2 = tmp1 ^ in6; +        out6 = tmp1 ^ in3; +        out4 = tmp2 ^ in0; +        out7 = tmp2 ^ in4; +        out3 = tmp1 ^ out7; +        out0 = out3 ^ out4 ^ in7; +        out5 = out0 ^ in1 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +   
     in_ptr++; +        out_ptr++; +    } +} +/* gf8_muladd_5D(out, in): runs WIDTH passes, advancing both pointers by one uint64_t per pass. Each pass reads eight words in0..in7 from out at offsets 0, WIDTH, ..., WIDTH * 7, builds out0..out7 as fixed XOR combinations of them (tmp0..tmp4 hold shared sub-expressions), XORs each outK with the word at the same offset of in and stores the result back at that offset of out. 8 * WIDTH words of each buffer are touched and only out is written. in0 reaches exactly out0, out2, out3, out4 and out6, the bits set in 0x5D, which is consistent with out = (0x5D * out) XOR in over GF(2^8) on bit-sliced data. NOTE(review): the field polynomial is not defined in this function; confirm against the generator. Assumes out and in are valid, uint64_t-aligned buffers; TODO confirm with callers. */ +static void gf8_muladd_5D(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in1; +        tmp1 = in0 ^ in6; +        out2 = tmp1 ^ in5; +        tmp2 = out2 ^ in3; +        out6 = tmp2 ^ in2; +        out1 = tmp0 ^ tmp2; +        tmp3 = out1 ^ in4 ^ in5; +        out4 = tmp3 ^ in0; +        out7 = tmp3 ^ in7; +        tmp4 = out4 ^ out6; +        out5 = tmp4 ^ in7; +        out0 = tmp0 ^ out5; +        out3 = tmp1 ^ tmp4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_5E(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = 
out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in5; +        tmp1 = in3 ^ in5; +        tmp2 = in1 ^ in7; +        out7 = in1 ^ in3 ^ in4; +        out0 = tmp0 ^ in4; +        tmp3 = tmp1 ^ in0; +        out5 = tmp2 ^ in2; +        out1 = tmp3 ^ in6; +        out6 = tmp0 ^ tmp3; +        tmp4 = tmp2 ^ out1; +        out3 = tmp4 ^ in4; +        out4 = tmp1 ^ tmp4; +        out2 = tmp0 ^ out4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_5F(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in5; +        tmp1 = in0 ^ in6; +        tmp2 = tmp0 ^ in7; +        tmp3 = tmp1 ^ in3; +        out2 = tmp1 ^ tmp2; +        out5 = tmp2 ^ in2; +        out6 = tmp3 ^ in2; +        out3 = out2 ^ in4; +        out4 = out3 ^ in5; +        out1 = tmp0 ^ tmp3; +        out7 = tmp3 ^ out4; 
+        out0 = out4 ^ out5 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} +/* gf8_muladd_60(out, in): runs WIDTH passes, advancing both pointers by one uint64_t per pass. Each pass reads eight words in0..in7 from out at offsets 0, WIDTH, ..., WIDTH * 7, builds out0..out7 as fixed XOR combinations of them, XORs each outK with the word at the same offset of in and stores the result back at that offset of out. 8 * WIDTH words of each buffer are touched and only out is written. in0 reaches exactly out5 and out6, the bits set in 0x60, which is consistent with out = (0x60 * out) XOR in over GF(2^8) on bit-sliced data. NOTE(review): the field polynomial is not defined in this function; confirm against the generator. Assumes out and in are valid, uint64_t-aligned buffers; TODO confirm with callers. */ +static void gf8_muladd_60(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in2 ^ in5; +        tmp0 = in3 ^ in6; +        out1 = in3 ^ in4 ^ in7; +        out7 = out4 ^ in1; +        tmp1 = out4 ^ in4; +        out0 = tmp0 ^ in2; +        out5 = tmp0 ^ in0; +        out2 = tmp0 ^ tmp1; +        out3 = tmp1 ^ in7; +        out6 = out3 ^ out7 ^ in0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_61(void *out, void *in) +{ +    unsigned int i; +    
uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in5; +        out4 = tmp0 ^ in4; +        tmp1 = out4 ^ in3; +        out3 = tmp1 ^ in7; +        out2 = tmp1 ^ in2 ^ in6; +        out1 = tmp0 ^ out3 ^ in1; +        out0 = out2 ^ out4 ^ in0; +        out7 = tmp1 ^ out1; +        out6 = out0 ^ out1 ^ in2; +        out5 = tmp0 ^ out0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_62(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = 
in4 ^ in5; +        tmp0 = in0 ^ in3 ^ in4; +        out1 = tmp0 ^ in7; +        out5 = tmp0 ^ in6; +        tmp1 = out1 ^ in0; +        tmp2 = tmp1 ^ out3; +        out4 = tmp2 ^ in2; +        tmp3 = tmp2 ^ in1; +        out0 = out4 ^ in5 ^ in6; +        out7 = tmp3 ^ out0; +        out6 = tmp0 ^ tmp3; +        out2 = tmp1 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_63(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in3 ^ in4; +        tmp1 = in1 ^ in7; +        out3 = tmp0 ^ in5; +        tmp2 = out3 ^ in6; +        out4 = out3 ^ in2 ^ in7; +        out5 = tmp2 ^ in0; +        tmp3 = out5 ^ in3; +        out0 = tmp3 ^ out4; +        out2 = tmp1 ^ tmp2; +        out6 = tmp1 ^ tmp3; +        tmp4 = tmp0 ^ out2; +        out1 = tmp4 ^ out5; +        out7 = tmp4 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        
out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} +/* gf8_muladd_64(out, in): runs WIDTH passes, advancing both pointers by one uint64_t per pass. Each pass reads eight words in0..in7 from out at offsets 0, WIDTH, ..., WIDTH * 7, builds out0..out7 as fixed XOR combinations of them, XORs each outK with the word at the same offset of in and stores the result back at that offset of out. 8 * WIDTH words of each buffer are touched and only out is written. in0 reaches exactly out2, out5 and out6, the bits set in 0x64, which is consistent with out = (0x64 * out) XOR in over GF(2^8) on bit-sliced data. NOTE(review): the field polynomial is not defined in this function; confirm against the generator. Assumes out and in are valid, uint64_t-aligned buffers; TODO confirm with callers. */ +static void gf8_muladd_64(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in2 ^ in3; +        out1 = in3 ^ in4; +        out7 = in1 ^ in2; +        tmp0 = in4 ^ in5; +        tmp1 = in0 ^ in7; +        out4 = in5 ^ in6 ^ in7; +        out2 = tmp0 ^ out0 ^ in0; +        out3 = tmp0 ^ out7 ^ in6; +        out5 = tmp1 ^ in6; +        out6 = tmp1 ^ in1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_65(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        
uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in3; +        tmp1 = in4 ^ in5; +        tmp2 = in6 ^ in7; +        out7 = in1 ^ in2 ^ in7; +        out1 = in1 ^ in3 ^ in4; +        out0 = tmp0 ^ in2; +        out2 = tmp0 ^ tmp1; +        out4 = tmp1 ^ tmp2; +        tmp3 = tmp2 ^ in0; +        out3 = out4 ^ out7 ^ in3; +        out5 = tmp3 ^ in5; +        out6 = tmp3 ^ in1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_66(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in2; +        tmp1 = in2 ^ in3; +        tmp2 = in0 ^ in4; +        out7 = tmp0 ^ in6; +        out0 = tmp1 ^ in7; +     
   out1 = tmp2 ^ in3; +        tmp3 = tmp2 ^ in6; +        tmp4 = out1 ^ in5; +        out5 = tmp3 ^ in7; +        out4 = tmp3 ^ tmp4; +        out2 = tmp0 ^ tmp4 ^ in7; +        out6 = tmp1 ^ out2 ^ in4; +        out3 = tmp3 ^ out6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_67(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in3; +        tmp1 = tmp0 ^ in1; +        tmp2 = tmp0 ^ in7; +        out1 = tmp1 ^ in4; +        out0 = tmp2 ^ in2; +        tmp3 = out1 ^ in7; +        out2 = tmp3 ^ in5; +        out3 = out2 ^ in0 ^ in6; +        out7 = tmp1 ^ out0 ^ in6; +        out5 = tmp1 ^ out3; +        out4 = tmp2 ^ out5; +        out6 = tmp3 ^ out4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 
^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} +/* gf8_muladd_68(out, in): runs WIDTH passes, advancing both pointers by one uint64_t per pass. Each pass reads eight words in0..in7 from out at offsets 0, WIDTH, ..., WIDTH * 7, builds out0..out7 as fixed XOR combinations of them (tmp0..tmp3 hold shared sub-expressions), XORs each outK with the word at the same offset of in and stores the result back at that offset of out. 8 * WIDTH words of each buffer are touched and only out is written. in0 reaches exactly out3, out5 and out6, the bits set in 0x68, which is consistent with out = (0x68 * out) XOR in over GF(2^8) on bit-sliced data. NOTE(review): the field polynomial is not defined in this function; confirm against the generator. Assumes out and in are valid, uint64_t-aligned buffers; TODO confirm with callers. */ +static void gf8_muladd_68(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in3 ^ in4; +        tmp1 = in2 ^ in3 ^ in5; +        tmp2 = tmp0 ^ in1; +        tmp3 = tmp0 ^ in6; +        out0 = tmp1 ^ in6; +        out6 = tmp2 ^ in0; +        out7 = tmp1 ^ tmp2; +        out1 = tmp3 ^ in7; +        out2 = out1 ^ in2; +        out4 = tmp2 ^ out2; +        out3 = out4 ^ out6 ^ in3; +        out5 = tmp3 ^ out3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_69(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 
= out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in6 ^ in7; +        out2 = tmp0 ^ in3 ^ in4; +        out1 = out2 ^ in1; +        out3 = out2 ^ in0 ^ in2; +        out4 = out1 ^ in2 ^ in3; +        out6 = out1 ^ in0 ^ in7; +        out7 = out4 ^ in5 ^ in6; +        out5 = out4 ^ out6 ^ in5; +        out0 = tmp0 ^ out5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_6A(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in6; +        out3 = in0 ^ in4 ^ in6; +        tmp1 = tmp0 ^ in3; +        out4 = tmp1 ^ in1; +        tmp2 = tmp1 ^ in7; +        out2 = out4 ^ in4; +        out0 = tmp2 ^ in5; +        out5 = tmp2 ^ out3; +        out7 = out2 ^ in3 ^ in5; +        out1 = tmp0 ^ out5; +        out6 = 
tmp1 ^ out7 ^ in0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} +/* gf8_muladd_6B(out, in): runs WIDTH passes, advancing both pointers by one uint64_t per pass. Each pass reads eight words in0..in7 from out at offsets 0, WIDTH, ..., WIDTH * 7, builds out0..out7 as fixed XOR combinations of them, XORs each outK with the word at the same offset of in and stores the result back at that offset of out. 8 * WIDTH words of each buffer are touched and only out is written. in0 reaches exactly out0, out1, out3, out5 and out6, the bits set in 0x6B, which is consistent with out = (0x6B * out) XOR in over GF(2^8) on bit-sliced data. NOTE(review): the field polynomial is not defined in this function; confirm against the generator. Assumes out and in are valid, uint64_t-aligned buffers; TODO confirm with callers. */ +static void gf8_muladd_6B(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in4 ^ in6; +        out2 = tmp0 ^ in1 ^ in3; +        out4 = out2 ^ in2; +        tmp1 = out2 ^ in0; +        out7 = out4 ^ in3 ^ in5 ^ in7; +        out1 = tmp1 ^ in7; +        out3 = tmp1 ^ in1; +        out6 = tmp1 ^ in5; +        out0 = tmp1 ^ out7 ^ in6; +        out5 = tmp0 ^ out0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_6C(void *out, void *in) +{ +    unsigned int i; +    uint64_t 
*in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in1; +        tmp0 = in2 ^ in3; +        out5 = in0 ^ in2; +        out1 = in3 ^ in4 ^ in6; +        tmp1 = out5 ^ in1; +        out0 = tmp0 ^ in5; +        out6 = tmp0 ^ tmp1; +        out3 = tmp1 ^ in4; +        out7 = out3 ^ in0; +        out2 = out6 ^ out7 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_6D(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in1 ^ in4; +        tmp0 = in0 ^ in2; +      
  tmp1 = out4 ^ in3; +        out7 = out4 ^ in2 ^ in7; +        out5 = tmp0 ^ in5; +        out3 = tmp0 ^ tmp1; +        out1 = tmp1 ^ in6; +        out0 = out5 ^ in3; +        out2 = out3 ^ out7 ^ in4; +        out6 = out1 ^ in0 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_6E(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in3; +        tmp1 = in0 ^ in4; +        out4 = tmp0 ^ in7; +        out6 = tmp0 ^ in0 ^ in5; +        out5 = tmp1 ^ in2; +        tmp2 = tmp1 ^ in3; +        out3 = tmp2 ^ out4; +        out1 = tmp2 ^ in6; +        out2 = tmp0 ^ out5; +        out0 = out2 ^ out3 ^ in5; +        out7 = out1 ^ out2 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +    
    out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_6F(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in3 ^ in7; +        tmp1 = tmp0 ^ in4; +        tmp2 = tmp0 ^ in0 ^ in2; +        out4 = tmp1 ^ in1; +        out0 = tmp2 ^ in5; +        out3 = out4 ^ in0; +        out2 = out3 ^ in7; +        out1 = out2 ^ in6; +        out6 = out1 ^ in4 ^ in5; +        out7 = tmp2 ^ out1; +        out5 = tmp1 ^ out0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_70(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = 
out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in2; +        tmp0 = in2 ^ in4; +        out2 = in2 ^ in3 ^ in5; +        tmp1 = tmp0 ^ in6; +        tmp2 = out2 ^ in7; +        out0 = tmp1 ^ in3; +        out4 = tmp1 ^ in0; +        out7 = tmp2 ^ in1; +        out6 = out4 ^ in1; +        out5 = out7 ^ in0 ^ in2; +        out1 = tmp0 ^ tmp2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_71(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in3 ^ in5; +        out3 = in2 ^ in3; +        tmp0 = in0 ^ in2; +        tmp1 = out2 ^ in1; +        out4 = tmp0 ^ in6; +        tmp2 = tmp0 ^ in1; +        out7 = tmp1 ^ in2; +        out1 = tmp1 ^ in4 ^ in7; +        out0 = out4 ^ in3 ^ in4; +        out6 = tmp2 ^ in4; +        out5 = tmp2 ^ out3 ^ in7; + +   
     out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_72(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in7; +        tmp0 = in0 ^ in4; +        tmp1 = tmp0 ^ in3 ^ in7; +        out1 = tmp1 ^ in5; +        out5 = out1 ^ in1; +        tmp2 = tmp0 ^ out5; +        out2 = tmp2 ^ in2; +        out7 = out2 ^ in6; +        out6 = tmp1 ^ out7; +        out4 = tmp2 ^ out6; +        out0 = out4 ^ in0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_73(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = 
(uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in3 ^ in7; +        out2 = out3 ^ in1 ^ in5; +        out1 = out2 ^ in0 ^ in4; +        out5 = out1 ^ in5; +        out6 = out1 ^ out3 ^ in2; +        out0 = out2 ^ out6 ^ in6; +        out7 = out0 ^ out1 ^ in3; +        out4 = out0 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_74(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in3 ^ in4; +        tmp1 = in1 ^ in2 ^ in6; +        out4 = in0 ^ in4 ^ in7; +        out5 = in0 ^ in1 ^ in5; +  
      out0 = tmp0 ^ in2; +        out1 = tmp0 ^ in5; +        out3 = tmp1 ^ in7; +        out6 = tmp1 ^ in0; +        out2 = tmp1 ^ out5 ^ in3; +        out7 = out3 ^ in3 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_75(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in0 ^ in7; +        tmp0 = in1 ^ in3; +        out5 = in0 ^ in1; +        out7 = tmp0 ^ in2; +        tmp1 = tmp0 ^ in4; +        out6 = out5 ^ in2; +        tmp2 = out7 ^ in6; +        out1 = tmp1 ^ in5; +        out0 = tmp1 ^ out6; +        out3 = tmp2 ^ in7; +        out2 = tmp2 ^ out6 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH 
* 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_76(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in1 ^ in6; +        tmp0 = in0 ^ in5; +        tmp1 = in3 ^ in7; +        tmp2 = tmp0 ^ in4; +        tmp3 = tmp1 ^ in2; +        out5 = tmp2 ^ in1; +        out1 = tmp2 ^ in3; +        out0 = tmp3 ^ in4; +        out4 = out1 ^ in5; +        out7 = tmp3 ^ out3; +        out2 = tmp0 ^ out7; +        out6 = tmp1 ^ out2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_77(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = 
out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in0 ^ in3; +        tmp0 = in1 ^ in4; +        tmp1 = in1 ^ in6; +        tmp2 = out4 ^ in5; +        out5 = tmp0 ^ in0; +        out1 = tmp0 ^ tmp2; +        out3 = tmp1 ^ in3; +        out2 = tmp1 ^ tmp2 ^ in7; +        out7 = out3 ^ in2; +        tmp3 = out7 ^ in6; +        out6 = tmp2 ^ tmp3; +        out0 = tmp3 ^ out5 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_78(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in3; +        tmp1 = in2 ^ in7; +        tmp2 = in0 ^ in5 ^ in6; +        out2 = tmp1 ^ in3; +        out3 = tmp2 ^ in2; +        out5 = out3 ^ in1 ^ in3; +        out0 = tmp0 ^ out3 ^ in4; +        out1 = tmp1 ^ out0; +        out4 = out1 ^ out5 ^ in5; +        out7 = tmp0 ^ out4; +        out6 = tmp2 ^ out7; + 
+        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_79(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in3 ^ in7; +        tmp0 = in3 ^ in4; +        tmp1 = in1 ^ in5; +        tmp2 = tmp1 ^ in2; +        out4 = tmp2 ^ in0 ^ in7; +        tmp3 = out4 ^ in5; +        out5 = tmp3 ^ out2 ^ in6; +        out7 = tmp0 ^ tmp2; +        out6 = tmp0 ^ tmp3; +        out3 = tmp1 ^ out5; +        out0 = out3 ^ in4; +        out1 = tmp3 ^ out0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_7A(void *out, void *in) +{ 
+    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in2; +        out2 = tmp0 ^ in3; +        tmp1 = out2 ^ in4; +        out4 = tmp1 ^ in0 ^ in5; +        out5 = out4 ^ in6; +        out6 = out5 ^ in7; +        out7 = out6 ^ in0; +        out0 = out7 ^ in1; +        out1 = tmp0 ^ out6; +        out3 = tmp1 ^ out6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_7B(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = 
in1 ^ in3; +        tmp0 = in0 ^ in5; +        out4 = tmp0 ^ out2 ^ in2; +        tmp1 = out4 ^ in4; +        out6 = tmp1 ^ in7; +        out5 = tmp1 ^ in5 ^ in6; +        out0 = out6 ^ in1 ^ in6; +        tmp2 = out0 ^ in2; +        out1 = tmp2 ^ in1; +        out3 = tmp2 ^ in4; +        out7 = tmp0 ^ out5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_7C(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in3 ^ in5; +        tmp1 = tmp0 ^ in4; +        out0 = tmp1 ^ in2; +        out1 = tmp1 ^ in6; +        out7 = out0 ^ in1 ^ in5 ^ in7; +        out5 = out1 ^ out7 ^ in0; +        out3 = out5 ^ in6; +        out6 = tmp0 ^ out5; +        out2 = out6 ^ in1; +        out4 = out2 ^ out7 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        
out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_7D(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in2; +        tmp1 = tmp0 ^ in3; +        tmp2 = tmp0 ^ in6; +        out7 = tmp1 ^ in4; +        tmp3 = tmp2 ^ in0; +        out5 = tmp3 ^ in7; +        out4 = tmp3 ^ in2 ^ in5; +        out2 = tmp1 ^ out5; +        out6 = tmp2 ^ out2; +        out0 = out4 ^ out7 ^ in6; +        out1 = tmp3 ^ out0; +        out3 = out6 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_7E(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t 
in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in3 ^ in4; +        tmp1 = in0 ^ in5; +        out1 = tmp0 ^ tmp1 ^ in6; +        out3 = tmp1 ^ in1; +        out4 = out1 ^ in1 ^ in7; +        tmp2 = out4 ^ in3; +        out5 = tmp2 ^ in2; +        out6 = tmp0 ^ out5; +        out7 = tmp1 ^ out4 ^ in2; +        out2 = out6 ^ in5 ^ in7; +        out0 = tmp2 ^ out2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_7F(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in7; +        tmp1 = tmp0 ^ in3 ^ in5; +        tmp2 = tmp1 ^ in0; +        out0 = tmp2 ^ in4; +        out6 = tmp2 ^ in1; +        out3 = tmp0 ^ out6; +        tmp3 = out3 ^ in6; +        out1 = 
tmp3 ^ in4; +        out2 = tmp3 ^ in5; +        out4 = tmp3 ^ in7; +        out5 = tmp1 ^ out1; +        out7 = out0 ^ out4 ^ in3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_80(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in3; +        tmp1 = in4 ^ in5; +        out1 = in2 ^ in6 ^ in7; +        out5 = tmp0 ^ in4; +        tmp2 = tmp0 ^ in1; +        out6 = tmp1 ^ in3; +        out7 = tmp1 ^ in0 ^ in6; +        out4 = tmp2 ^ in7; +        out3 = tmp2 ^ out6; +        out2 = out3 ^ out5 ^ in6; +        out0 = out2 ^ in3 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH 
* 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_81(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in4 ^ in6; +        tmp1 = tmp0 ^ in3; +        out6 = tmp1 ^ in5; +        out5 = out6 ^ in2 ^ in6; +        out3 = out5 ^ in1; +        out2 = tmp0 ^ out3; +        out1 = out3 ^ out6 ^ in7; +        out4 = tmp1 ^ out1; +        out7 = out2 ^ out4 ^ in0; +        out0 = out7 ^ in1 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_82(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = 
out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in1 ^ in2; +        tmp0 = in6 ^ in7; +        out5 = in2 ^ in3; +        out6 = in3 ^ in4; +        out7 = in0 ^ in4 ^ in5; +        out0 = in1 ^ in5 ^ in6; +        out1 = tmp0 ^ in0 ^ in2; +        out2 = tmp0 ^ in3 ^ in5; +        out3 = tmp0 ^ out0 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_83(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in1; +        tmp1 = in2 ^ in5; +        tmp2 = in3 ^ in6; +        out4 = in1 ^ in2 ^ in4; +        out0 = tmp0 ^ in5 ^ in6; +        out5 = tmp1 ^ in3; +        tmp3 = tmp1 ^ in7; +        out6 = tmp2 ^ in4; +        out2 = tmp2 ^ tmp3; +        tmp4 = tmp3 ^ out4; +        out1 = tmp3 ^ out0; +        out3 = tmp4 ^ in3; +        out7 = tmp0 ^ tmp4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        
out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_84(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out1 = in2 ^ in6; +        out6 = in3 ^ in5; +        out0 = in1 ^ in5 ^ in7; +        out7 = in0 ^ in4 ^ in6; +        out4 = in1 ^ in3 ^ in6; +        out5 = in2 ^ in4 ^ in7; +        out2 = out6 ^ in0 ^ in1; +        out3 = out5 ^ in5 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_85(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, 
tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in6; +        tmp1 = in3 ^ in6; +        tmp2 = tmp0 ^ in4; +        out1 = tmp0 ^ in2; +        out6 = tmp1 ^ in5; +        out4 = tmp2 ^ in3; +        tmp3 = out1 ^ out6; +        out2 = tmp3 ^ in0; +        out3 = tmp2 ^ tmp3 ^ in7; +        out7 = out2 ^ out3 ^ in1; +        out5 = tmp1 ^ out3; +        out0 = tmp2 ^ out7 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_86(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x86 + in -- confirm against the code generator. */ static void gf8_muladd_86(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out6 = in3; +        out7 = in0 ^ in4; +        out0 = in1 ^ in5; +        out5 = in2 ^ in7; +        out3 = in4 ^ in5 ^ in6; +        out1 = in0 ^ in2 ^ in6; +        out4 = in1 ^ in6 ^ in7; 
+        out2 = in0 ^ in3 ^ in5 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_87(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x87 + in -- confirm against the code generator. */ static void gf8_muladd_87(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out6 = in3 ^ in6; +        tmp0 = in0 ^ in1; +        out7 = in0 ^ in4 ^ in7; +        out5 = in2 ^ in5 ^ in7; +        out3 = out6 ^ in4 ^ in5; +        out0 = tmp0 ^ in5; +        tmp1 = tmp0 ^ in6; +        out2 = out5 ^ in0 ^ in3; +        out1 = tmp1 ^ in2; +        out4 = tmp1 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_88(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x88 + in -- confirm against the code generator. */ static void gf8_muladd_88(void *out, void *in) +{ +    unsigned int 
i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out1 = in2 ^ in7; +        tmp0 = in5 ^ in6; +        out0 = in1 ^ in6 ^ in7; +        out6 = in4 ^ in5 ^ in7; +        out3 = out0 ^ out1 ^ in0 ^ in4; +        out7 = tmp0 ^ in0; +        tmp1 = tmp0 ^ in3; +        out2 = out0 ^ in3; +        out4 = tmp1 ^ in2; +        out5 = tmp1 ^ out6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_89(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x89 + in -- confirm against the code generator. */ static void gf8_muladd_89(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 
^ in7; +        tmp1 = in2 ^ in7; +        tmp2 = tmp0 ^ in6; +        out1 = tmp1 ^ in1; +        out7 = tmp2 ^ in5; +        out0 = tmp2 ^ in1; +        out2 = out1 ^ in3 ^ in6; +        out6 = out7 ^ in0 ^ in4; +        out5 = out6 ^ in3; +        out3 = tmp0 ^ out2 ^ in4; +        out4 = tmp1 ^ out5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_8A(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x8A + in -- confirm against the code generator. */ static void gf8_muladd_8A(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in1 ^ in6; +        out7 = in0 ^ in5; +        out2 = in3 ^ in6; +        out6 = in4 ^ in7; +        out1 = in0 ^ in2 ^ in7; +        out3 = out0 ^ out6 ^ in0; +        out4 = out1 ^ out7 ^ in6; +        out5 = out2 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +    
    out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_8B(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x8B + in -- confirm against the code generator. */ static void gf8_muladd_8B(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in1; +        tmp1 = in3 ^ in6; +        tmp2 = in5 ^ in7; +        tmp3 = tmp0 ^ in7; +        out0 = tmp0 ^ in6; +        out2 = tmp1 ^ in2; +        out5 = tmp1 ^ tmp2; +        out7 = tmp2 ^ in0; +        tmp4 = tmp3 ^ in4; +        out1 = tmp3 ^ in2; +        out6 = tmp4 ^ out0; +        out4 = out6 ^ in2 ^ in5; +        out3 = tmp1 ^ tmp4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_8C(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x8C + in -- confirm against the code generator. */ static void gf8_muladd_8C(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t 
in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out1 = in2; +        out0 = in1 ^ in7; +        out7 = in0 ^ in6; +        out5 = in4 ^ in6; +        out6 = in5 ^ in7; +        out2 = out0 ^ in0 ^ in3; +        out3 = out5 ^ out7 ^ in2 ^ in7; +        out4 = out6 ^ in3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_8D(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x8D + in -- confirm against the code generator. */ static void gf8_muladd_8D(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out1 = in1 ^ in2; +        tmp0 = in6 ^ in7; +        out0 = in0 ^ in1 ^ in7; +        out5 = in4 ^ in5 ^ in6; +        out6 = tmp0 ^ in5; +        out7 = tmp0 ^ in0; +        out4 = tmp0 ^ out5 ^ in3; +        out2 = out0 ^ in2 ^ in3; +        out3 = out2 ^ in1 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 
^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_8E(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x8E + in -- confirm against the code generator. */ static void gf8_muladd_8E(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in1; +        out4 = in5; +        out7 = in0; +        out5 = in6; +        out6 = in7; +        out3 = in0 ^ in4; +        out1 = in0 ^ in2; +        out2 = in0 ^ in3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_8F(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x8F + in -- confirm against the code generator. */ static void gf8_muladd_8F(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = 
out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in0 ^ in1; +        tmp0 = in0 ^ in3; +        out4 = in4 ^ in5; +        out7 = in0 ^ in7; +        out5 = in5 ^ in6; +        out6 = in6 ^ in7; +        out1 = out0 ^ in2; +        out2 = tmp0 ^ in2; +        out3 = tmp0 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_90(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x90 + in -- confirm against the code generator. */ static void gf8_muladd_90(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in2; +        tmp1 = in2 ^ in6 ^ in7; +        out3 = tmp0 ^ in7; +        out1 = tmp1 ^ in5; +        tmp2 = out1 ^ in4; +        out6 = tmp2 ^ in3; +        out5 = out6 ^ in1; +        out4 = out5 ^ in0; +        out0 = tmp0 ^ tmp2; +        out7 = tmp0 ^ out4; +        out2 = tmp1 ^ out5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 
^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_91(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x91 + in -- confirm against the code generator. */ static void gf8_muladd_91(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in4; +        tmp1 = tmp0 ^ in3 ^ in5; +        out2 = tmp1 ^ in1; +        out6 = tmp1 ^ in7; +        tmp2 = out2 ^ in5 ^ in7; +        out3 = tmp2 ^ in4; +        out5 = tmp2 ^ in6; +        out1 = tmp1 ^ out5 ^ in2; +        tmp3 = out1 ^ in0; +        out4 = tmp3 ^ in3; +        out0 = tmp0 ^ tmp3; +        out7 = tmp2 ^ tmp3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_92(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x92 + in -- confirm against the code generator. */ static void gf8_muladd_92(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i 
= 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in1; +        tmp0 = in4 ^ in5; +        tmp1 = tmp0 ^ in1; +        out2 = tmp0 ^ in3 ^ in7; +        out0 = tmp1 ^ in6; +        out7 = out2 ^ in0; +        out4 = out0 ^ in0 ^ in2; +        out5 = out4 ^ out7 ^ in5; +        out6 = tmp1 ^ out5; +        out1 = out6 ^ out7 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_93(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x93 + in -- confirm against the code generator. */ static void gf8_muladd_93(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in1 ^ in3; +        tmp0 = in2 ^ in7; +        tmp1 = out3 ^ in6; +        tmp2 = tmp0 ^ in4; +        
out5 = tmp0 ^ tmp1; +        out6 = tmp2 ^ in3; +        out2 = out6 ^ in5; +        out0 = out2 ^ out5 ^ in0; +        out7 = tmp1 ^ out0; +        out1 = tmp2 ^ out0; +        out4 = out1 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_94(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x94 + in -- confirm against the code generator. */ static void gf8_muladd_94(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in2 ^ in6; +        tmp0 = in1 ^ in4 ^ in5; +        out1 = out3 ^ in5; +        out5 = tmp0 ^ out3; +        out0 = tmp0 ^ in7; +        out4 = tmp0 ^ in0 ^ in3; +        out6 = out1 ^ in3 ^ in7; +        out2 = out4 ^ in6; +        out7 = out0 ^ out2 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 
7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_95(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x95 + in -- confirm against the code generator. */ static void gf8_muladd_95(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in3; +        out3 = tmp0 ^ in6; +        tmp1 = tmp0 ^ in7; +        tmp2 = out3 ^ in0; +        out6 = tmp1 ^ in5; +        tmp3 = tmp2 ^ in4; +        out7 = tmp3 ^ in2; +        tmp4 = tmp3 ^ in5; +        out2 = tmp4 ^ in1; +        tmp5 = out2 ^ in6; +        out0 = tmp1 ^ tmp5; +        out1 = tmp5 ^ out7; +        out4 = tmp2 ^ out1; +        out5 = tmp4 ^ out4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_96(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x96 + in -- confirm against the code generator. */ static void gf8_muladd_96(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 
2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in6 ^ in7; +        tmp0 = in1 ^ in5; +        tmp1 = in5 ^ in6; +        out6 = out3 ^ in2 ^ in3; +        out0 = tmp0 ^ in4; +        tmp2 = tmp1 ^ in2; +        out4 = out0 ^ in0 ^ in7; +        out1 = tmp2 ^ in0; +        out5 = tmp2 ^ in1; +        out7 = tmp0 ^ out4 ^ in3; +        out2 = tmp1 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_97(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x97 + in -- confirm against the code generator. */ static void gf8_muladd_97(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in4; +        tmp1 = in2 ^ in6; +        out3 = in3 ^ in6 ^ in7; +        out7 = tmp0 ^ in3; +        tmp2 = tmp0 ^ in5; +        out5 = tmp1 ^ in1; +        out6 = tmp1 ^ out3; +        out0 = tmp2 ^ in1; +        out2 = tmp2 ^ out3 ^ in2; +        tmp3 = out0 ^ in4; +        out4 = tmp3 ^ in7; +       
 out1 = tmp1 ^ tmp3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_98(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x98 + in -- confirm against the code generator. */ static void gf8_muladd_98(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in5 ^ in7; +        tmp1 = in1 ^ in4 ^ in7; +        out1 = tmp0 ^ in2; +        out0 = tmp1 ^ in6; +        out2 = tmp1 ^ in3; +        out6 = out0 ^ out1 ^ in1; +        out5 = tmp0 ^ out2; +        out3 = tmp1 ^ out6 ^ in0; +        out7 = out0 ^ out5 ^ in0; +        out4 = out6 ^ out7 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_99(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x99 + in -- confirm against the code generator. */ static void gf8_muladd_99(void *out, void *in) +{ +    unsigned int i; +    
uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in3; +        out5 = in1 ^ in3 ^ in4; +        out6 = in2 ^ in4 ^ in5; +        out4 = tmp0 ^ in2; +        tmp1 = tmp0 ^ in6; +        tmp2 = out5 ^ in7; +        out7 = tmp1 ^ in5; +        out0 = tmp1 ^ tmp2; +        out2 = tmp2 ^ in2; +        out3 = out0 ^ out6 ^ in3; +        out1 = tmp1 ^ out3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_9A(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x9A + in -- confirm against the code generator. */ static void gf8_muladd_9A(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; 
+ +        out2 = in3 ^ in4; +        tmp0 = in0 ^ in5; +        tmp1 = in1 ^ in6; +        out5 = in1 ^ in3 ^ in5; +        tmp2 = tmp0 ^ in7; +        out3 = tmp0 ^ tmp1; +        out0 = tmp1 ^ in4; +        out7 = tmp2 ^ in3; +        out1 = tmp2 ^ in2; +        out6 = out0 ^ in1 ^ in2; +        out4 = out1 ^ in4 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_9B(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x9B + in -- confirm against the code generator. */ static void gf8_muladd_9B(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out5 = in1 ^ in3; +        tmp0 = in3 ^ in5; +        out6 = in2 ^ in4; +        out4 = in0 ^ in2 ^ in7; +        out7 = tmp0 ^ in0; +        out2 = out6 ^ in3; +        out1 = out4 ^ in1 ^ in5; +        out3 = out7 ^ in1 ^ in6; +        out0 = tmp0 ^ out3 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ 
in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_9C(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x9C + in -- confirm against the code generator. */ static void gf8_muladd_9C(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out1 = in2 ^ in5; +        tmp0 = in0 ^ in3 ^ in6; +        out3 = out1 ^ in0; +        out6 = out1 ^ in6; +        out7 = tmp0 ^ in7; +        out4 = out7 ^ in4; +        out2 = out4 ^ in1; +        out0 = tmp0 ^ out2; +        out5 = out0 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* gf8_muladd_9D(out, in): for each of the WIDTH 64-bit columns, load the eight words out[k * WIDTH] (k = 0..7), mix them through the fixed XOR network below, XOR each result with in[k * WIDTH] and store it back to out; in is only read. NOTE(review): the name suggests the bit-sliced GF(2^8) update out = out * 0x9D + in -- confirm against the code generator. */ static void gf8_muladd_9D(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +       
 uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out6 = in2 ^ in5; +        tmp0 = in0 ^ in3; +        out5 = in1 ^ in4 ^ in7; +        out1 = out6 ^ in1; +        out3 = tmp0 ^ out6; +        out7 = tmp0 ^ in6; +        out0 = out5 ^ in0; +        out4 = out7 ^ in7; +        out2 = out5 ^ out7 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_9E(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in1 ^ in4; +        tmp0 = in0 ^ in5; +        out6 = in2 ^ in6; +        out7 = in0 ^ in3 ^ in7; +        out4 = in0 ^ in4 ^ in6; +        out5 = in1 ^ in5 ^ in7; +        out1 = tmp0 ^ in2; +        out3 = tmp0 ^ in7; +        out2 = out4 ^ in3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 
3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_9F(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out6 = in2; +        out7 = in0 ^ in3; +        tmp0 = in0 ^ in1; +        out4 = in0 ^ in6; +        out5 = in1 ^ in7; +        out1 = tmp0 ^ in2 ^ in5; +        out2 = out7 ^ in2 ^ in4 ^ in6; +        out3 = out7 ^ in5 ^ in7; +        out0 = tmp0 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_A0(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        
uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in6; +        out2 = tmp0 ^ in7; +        tmp1 = tmp0 ^ in5; +        out6 = out2 ^ in3 ^ in4; +        out0 = tmp1 ^ in3; +        tmp2 = out0 ^ in2; +        out3 = tmp2 ^ in7; +        tmp3 = tmp2 ^ in1; +        out5 = tmp3 ^ in0; +        out4 = tmp3 ^ out6; +        out7 = out5 ^ out6 ^ in1; +        out1 = tmp1 ^ out4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_A1(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in5; +        tmp1 = tmp0 ^ in1; +        tmp2 = tmp0 ^ in4; +        out4 = tmp1 ^ in7; +        out7 = tmp2 ^ in0; +        out6 = tmp2 ^ out4 ^ in3; +        out3 = out4 ^ in6; +        out2 = out3 ^ in5; +      
  out1 = out2 ^ in4; +        out5 = out1 ^ out6 ^ in0; +        out0 = tmp1 ^ out5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_A2(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in6; +        tmp0 = in1 ^ in3 ^ in5; +        out3 = tmp0 ^ in6; +        out4 = tmp0 ^ in2 ^ in4; +        out0 = out3 ^ in7; +        out6 = out0 ^ in4; +        out1 = out0 ^ out4 ^ in0; +        out7 = out1 ^ in5; +        out5 = out7 ^ in3 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_A3(void *out, void *in) +{ +    
unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in2 ^ in6; +        out3 = in1 ^ in5 ^ in6; +        tmp0 = out2 ^ in0; +        out4 = out2 ^ out3 ^ in3; +        tmp1 = tmp0 ^ in4; +        out0 = tmp0 ^ out4 ^ in7; +        out5 = tmp1 ^ in3; +        out7 = tmp1 ^ in5; +        out1 = tmp1 ^ in1 ^ in7; +        out6 = tmp1 ^ out0 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_A4(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = 
out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in3; +        tmp1 = in2 ^ in4; +        tmp2 = in2 ^ in5; +        tmp3 = in0 ^ in7; +        out0 = tmp0 ^ in5; +        out6 = tmp0 ^ in6 ^ in7; +        out1 = tmp1 ^ in6; +        out7 = tmp1 ^ tmp3; +        out3 = tmp2 ^ in3; +        tmp4 = tmp2 ^ out1; +        out2 = tmp3 ^ in1; +        out5 = tmp4 ^ out7; +        out4 = tmp4 ^ in1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_A5(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in2 ^ in5; +        tmp0 = in1 ^ in6; +        tmp1 = in0 ^ in1; +        tmp2 = in2 ^ in4; +        out6 = in1 ^ in3 ^ in7; +        out4 = tmp0 ^ in5; +        out1 = tmp0 ^ tmp2; +        out0 = tmp1 ^ in3 ^ in5; +        out2 = tmp1 ^ in2 ^ in7; +        out7 = tmp2 ^ in0; +        out5 = tmp0 ^ out2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH 
* 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_A6(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in0; +        out3 = in3 ^ in5 ^ in7; +        out1 = in0 ^ in2 ^ in4 ^ in6; +        out0 = out3 ^ in1; +        out7 = out1 ^ in7; +        out6 = out0 ^ in6; +        out5 = out7 ^ in5; +        out4 = out6 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_A7(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 
2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in0 ^ in2; +        out3 = in5 ^ in7; +        out7 = out2 ^ in4 ^ in6; +        out6 = out3 ^ in1 ^ in3; +        out1 = out7 ^ in1; +        out5 = out7 ^ in7; +        out0 = out6 ^ in0; +        out4 = out6 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_A8(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in4; +        tmp1 = in1 ^ in6; +        tmp2 = in0 ^ in2 ^ in7; +        out1 = tmp0 ^ in7; +        out4 = tmp0 ^ in6; +        out0 = tmp1 ^ in3; +        out2 = tmp1 ^ in5; +        out6 = tmp1 ^ in4; +        out7 = tmp2 ^ in5; +        out3 = tmp2 ^ out0 ^ in6; +        out5 = out7 ^ in2 ^ in3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        
out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_A9(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in2 ^ in6; +        out6 = in1 ^ in4; +        out7 = in0 ^ in2 ^ in5; +        out5 = in0 ^ in3 ^ in7; +        out2 = out4 ^ in1 ^ in5; +        out1 = out6 ^ in2 ^ in7; +        out0 = out2 ^ out7 ^ in3; +        out3 = out1 ^ in0 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_AA(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, 
tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in2; +        tmp1 = in1 ^ in3; +        tmp2 = in6 ^ in7; +        out1 = tmp0 ^ in4 ^ in7; +        out3 = tmp1 ^ in0; +        out0 = tmp1 ^ tmp2; +        out2 = tmp2 ^ in5; +        out7 = tmp0 ^ out2; +        out6 = out1 ^ out7 ^ in1; +        out5 = out0 ^ out6 ^ in0; +        out4 = out5 ^ out7 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_AB(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in0 ^ in1; +        tmp0 = in1 ^ in4; +        tmp1 = in0 ^ in7; +        out6 = tmp0 ^ in5; +        out1 = tmp0 ^ tmp1 ^ in2; +        out5 = tmp1 ^ in3 ^ in4; +        out0 = tmp0 
^ out5 ^ in6; +        out4 = out0 ^ out3 ^ in2; +        out2 = out4 ^ in3 ^ in5; +        out7 = tmp1 ^ out2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_AC(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out0 = in1 ^ in3; +        out1 = in2 ^ in4; +        tmp0 = in0 ^ in2; +        out4 = in4 ^ in7; +        out5 = in0 ^ in5; +        out6 = in1 ^ in6; +        out7 = tmp0 ^ in7; +        out3 = tmp0 ^ in3 ^ in6; +        out2 = out5 ^ in1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_AD(void *out, void *in) +{ + 
   unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in7; +        out5 = in0; +        out6 = in1; +        out7 = in0 ^ in2; +        out0 = in0 ^ in1 ^ in3; +        out2 = out7 ^ in1 ^ in5; +        out1 = in1 ^ in2 ^ in4; +        out3 = out7 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_AE(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in3 ^ in4; +        tmp0 = in0 ^ in4; +        tmp1 = in0 ^ in7; +        out0 = in1 ^ in3 ^ in7; +   
     out1 = tmp0 ^ in2; +        out5 = tmp0 ^ in5; +        tmp2 = tmp1 ^ in6; +        out2 = tmp1 ^ in5; +        out3 = tmp2 ^ in3; +        out7 = tmp2 ^ in2; +        out6 = tmp2 ^ out2 ^ in1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_AF(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in3; +        tmp0 = in0 ^ in7; +        out5 = in0 ^ in4; +        out6 = in1 ^ in5; +        out7 = in0 ^ in2 ^ in6; +        out0 = tmp0 ^ in1 ^ in3; +        out3 = tmp0 ^ in6; +        out2 = tmp0 ^ in2 ^ in5; +        out1 = out5 ^ in1 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +   
     in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_B0(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in4; +        tmp1 = in3 ^ in6; +        out2 = tmp0 ^ in7; +        tmp2 = tmp0 ^ tmp1; +        out0 = tmp2 ^ in5; +        out3 = tmp2 ^ in2; +        out6 = out3 ^ in6; +        tmp3 = out6 ^ in0 ^ in1; +        out7 = tmp3 ^ in5; +        out5 = tmp3 ^ out2; +        out1 = out0 ^ out5 ^ in0; +        out4 = tmp1 ^ out5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_B1(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = 
out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in4; +        out2 = tmp0 ^ in2 ^ in7; +        tmp1 = out2 ^ in6; +        out1 = tmp1 ^ in5; +        out3 = tmp1 ^ in7; +        out4 = tmp1 ^ in0; +        out6 = out3 ^ in3; +        out0 = out6 ^ in0 ^ in2 ^ in5; +        out5 = tmp1 ^ out0 ^ in1; +        out7 = tmp0 ^ out5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_B2(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in4; +        tmp0 = in4 ^ in7; +        tmp1 = in1 ^ in3 ^ in6; +        out3 = tmp0 ^ tmp1; +        tmp2 = tmp1 ^ in0; +        out0 = out3 ^ in5; +        out4 = tmp2 ^ in2; +        tmp3 = out4 ^ in6; +        out5 = tmp0 ^ tmp3; +        out1 = tmp3 ^ out0; +        tmp4 = out1 ^ in7; +        out7 = tmp4 ^ in3; +        out6 = tmp2 ^ tmp4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +     
   out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_B3(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in2 ^ in4; +        tmp0 = in0 ^ in5; +        tmp1 = in1 ^ in6; +        out3 = tmp1 ^ in4 ^ in7; +        tmp2 = tmp0 ^ out3; +        out0 = tmp2 ^ in3; +        out1 = tmp2 ^ in2; +        out5 = out0 ^ in2 ^ in6; +        out7 = tmp1 ^ out5; +        out4 = out7 ^ in1 ^ in5 ^ in7; +        out6 = tmp0 ^ out4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_B4(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    
uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in0 ^ in1; +        out5 = out4 ^ in2; +        tmp0 = out4 ^ in4; +        out6 = out5 ^ in0 ^ in3; +        out7 = tmp0 ^ out6; +        out2 = tmp0 ^ in6 ^ in7; +        out3 = out7 ^ in0 ^ in7; +        out0 = out5 ^ out7 ^ in5; +        out1 = out0 ^ out6 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} +

/*
 * gf8_muladd_B5 .. gf8_muladd_CD
 *
 * Every routine in this group has the same shape.  'out' and 'in' are
 * accessed as arrays of uint64_t.  For each of WIDTH consecutive word
 * positions the routine
 *
 *   1. loads the eight words of 'out' found at offsets 0, WIDTH,
 *      WIDTH * 2, ... WIDTH * 7 from the current position (in0..in7),
 *   2. combines them through a fixed sequence of XORs (out0..out7),
 *   3. stores out0..out7 back to the same eight offsets of 'out', each
 *      XORed with the word of 'in' at the same offset.
 *
 * Both buffers are therefore accessed over WIDTH * 8 words; 'out' is
 * updated in place and 'in' is only read.
 *
 * The XOR sequences are kept statement-for-statement (same names, same
 * order) so they can be compared directly with their generated origin;
 * later statements reuse earlier results, so the order matters.
 *
 * NOTE(review): the names suggest that each routine computes
 * "out = out * 0xNN + in" over GF(2^8), NN being the hex suffix of the
 * function name -- confirm against the generator; only the XOR
 * sequences themselves are visible here.
 * NOTE(review): the buffers are presumably aligned for uint64_t access
 * by the callers -- confirm.
 */

/* XOR sequence "B5"; see the layout notes at the top of this group. */
static void gf8_muladd_B5(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1, tmp2;

        tmp0 = in0 ^ in1;
        tmp1 = in2 ^ in4;
        out4 = tmp0 ^ in4;
        out3 = tmp1 ^ in7;
        tmp2 = out4 ^ in5;
        out7 = out3 ^ in0 ^ in3;
        out0 = tmp2 ^ in3;
        out2 = tmp0 ^ out3 ^ in6;
        out5 = tmp1 ^ tmp2;
        out6 = out2 ^ out7 ^ in2;
        out1 = tmp0 ^ out0 ^ out6;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "B6"; see the layout notes at the top of this group. */
static void gf8_muladd_B6(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1, tmp2, tmp3;

        out3 = in3 ^ in4;
        tmp0 = in1 ^ in2;
        tmp1 = in0 ^ in4;
        tmp2 = in3 ^ in5;
        tmp3 = out3 ^ in1 ^ in7;
        out5 = tmp0 ^ tmp1;
        out6 = tmp0 ^ tmp2;
        out2 = tmp1 ^ in6;
        out4 = tmp1 ^ tmp3;
        out0 = tmp3 ^ in5;
        out1 = out2 ^ in2 ^ in5;
        out7 = tmp2 ^ out1;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "B7"; see the layout notes at the top of this group. */
static void gf8_muladd_B7(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1;

        out3 = in4;
        tmp0 = in0 ^ in4;
        out2 = tmp0 ^ in2 ^ in6;
        tmp1 = out2 ^ in7;
        out1 = out2 ^ in1 ^ in5;
        out7 = tmp1 ^ in3;
        out5 = out1 ^ in6;
        out6 = tmp0 ^ out1 ^ in3;
        out0 = tmp1 ^ out6;
        out4 = out0 ^ in5;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "B8"; see the layout notes at the top of this group. */
static void gf8_muladd_B8(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1, tmp2;

        tmp0 = in1 ^ in4;
        tmp1 = in2 ^ in5;
        out2 = tmp0 ^ in5;
        out4 = tmp1 ^ in0;
        tmp2 = tmp1 ^ in7;
        out6 = tmp2 ^ out2;
        out7 = out4 ^ in3;
        out1 = tmp2 ^ in4;
        out3 = tmp0 ^ out7;
        out0 = out3 ^ out4 ^ in6;
        out5 = out0 ^ in0 ^ in4;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "B9"; see the layout notes at the top of this group. */
static void gf8_muladd_B9(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1, tmp2;

        tmp0 = in0 ^ in2;
        tmp1 = in4 ^ in5;
        out4 = tmp0 ^ tmp1;
        tmp2 = tmp0 ^ in3 ^ in7;
        out3 = out4 ^ in1;
        out7 = tmp2 ^ in5;
        out2 = out3 ^ in0;
        out1 = out2 ^ in7;
        out6 = out1 ^ in5 ^ in6;
        out0 = tmp2 ^ out6;
        out5 = tmp1 ^ out0;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "BA"; see the layout notes at the top of this group. */
static void gf8_muladd_BA(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1, tmp2;

        tmp0 = in5 ^ in7;
        out2 = tmp0 ^ in4;
        tmp1 = out2 ^ in2;
        out1 = tmp1 ^ in0;
        out6 = tmp1 ^ in1;
        out4 = out1 ^ in3 ^ in4;
        tmp2 = out4 ^ out6;
        out7 = out4 ^ in6 ^ in7;
        out5 = tmp2 ^ in6;
        out3 = tmp0 ^ tmp2;
        out0 = out6 ^ out7 ^ in0;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "BB"; see the layout notes at the top of this group. */
static void gf8_muladd_BB(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1;

        out2 = in2 ^ in4 ^ in5 ^ in7;
        tmp0 = out2 ^ in1;
        out4 = out2 ^ in0 ^ in3;
        out1 = tmp0 ^ in0;
        out6 = tmp0 ^ in6;
        out3 = out1 ^ in2;
        tmp1 = out4 ^ out6 ^ in4;
        out0 = tmp1 ^ in7;
        out5 = tmp1 ^ in5;
        out7 = tmp0 ^ tmp1;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "BC"; see the layout notes at the top of this group. */
static void gf8_muladd_BC(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1, tmp2;

        tmp0 = in0 ^ in2;
        tmp1 = in2 ^ in4;
        out0 = in1 ^ in3 ^ in4;
        out6 = in1 ^ in2 ^ in7;
        out7 = tmp0 ^ in3;
        out5 = tmp0 ^ out6 ^ in6;
        out1 = tmp1 ^ in5;
        tmp2 = out1 ^ out5 ^ in1;
        out3 = tmp2 ^ in3;
        out4 = tmp1 ^ tmp2;
        out2 = tmp2 ^ out6;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "BD"; see the layout notes at the top of this group. */
static void gf8_muladd_BD(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1, tmp2;

        tmp0 = in0 ^ in3;
        tmp1 = in1 ^ in4;
        out0 = tmp0 ^ tmp1;
        out7 = tmp0 ^ in2 ^ in7;
        out1 = tmp1 ^ in2 ^ in5;
        tmp2 = out1 ^ in0;
        out2 = tmp2 ^ in6;
        out3 = out2 ^ in1 ^ in7;
        out4 = out3 ^ in2;
        out5 = tmp1 ^ out4;
        out6 = tmp2 ^ out4;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "BE"; see the layout notes at the top of this group. */
static void gf8_muladd_BE(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0;

        tmp0 = in0 ^ in3 ^ in6;
        out4 = tmp0 ^ in5;
        out7 = tmp0 ^ in2;
        out3 = out4 ^ in4;
        out1 = out3 ^ out7 ^ in0;
        out2 = out3 ^ in3 ^ in7;
        out0 = out2 ^ out4 ^ in1;
        out5 = tmp0 ^ out0;
        out6 = out1 ^ out5 ^ in6;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "BF"; see the layout notes at the top of this group. */
static void gf8_muladd_BF(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1, tmp2, tmp3;

        tmp0 = in0 ^ in4;
        out3 = tmp0 ^ in5 ^ in6;
        out4 = out3 ^ in3;
        tmp1 = out3 ^ in7;
        out2 = tmp1 ^ in2;
        out5 = tmp1 ^ in1;
        tmp2 = out2 ^ in5;
        out7 = tmp2 ^ in3 ^ in4;
        tmp3 = tmp0 ^ out5;
        out0 = tmp3 ^ out4;
        out1 = tmp2 ^ tmp3;
        out6 = tmp3 ^ in2;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "C0"; see the layout notes at the top of this group. */
static void gf8_muladd_C0(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1;

        out5 = in2 ^ in5;
        tmp0 = in1 ^ in4;
        tmp1 = in3 ^ in6;
        out0 = out5 ^ in1;
        out4 = tmp0 ^ in7;
        out3 = tmp0 ^ tmp1;
        out1 = tmp1 ^ in2;
        out6 = tmp1 ^ in0;
        out7 = out4 ^ in0;
        out2 = out4 ^ out5 ^ in3;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "C1"; see the layout notes at the top of this group. */
static void gf8_muladd_C1(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1;

        out5 = in2;
        tmp0 = in0 ^ in1;
        out4 = in1 ^ in7;
        out6 = in0 ^ in3;
        out3 = in1 ^ in4 ^ in6;
        tmp1 = tmp0 ^ in2;
        out7 = tmp0 ^ in4;
        out0 = tmp1 ^ in5;
        out1 = tmp1 ^ out6 ^ in6;
        out2 = out6 ^ out7 ^ in5 ^ in7;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "C2"; see the layout notes at the top of this group. */
static void gf8_muladd_C2(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1;

        out4 = in1 ^ in3 ^ in4;
        tmp0 = in0 ^ in3 ^ in6;
        out5 = in2 ^ in4 ^ in5;
        tmp1 = out4 ^ in7;
        out1 = tmp0 ^ in2;
        out6 = tmp0 ^ in5;
        out2 = out5 ^ in3;
        out7 = tmp0 ^ tmp1;
        out3 = tmp1 ^ in2 ^ in6;
        out0 = tmp1 ^ out2;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "C3"; see the layout notes at the top of this group. */
static void gf8_muladd_C3(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1, tmp2;

        out4 = in1 ^ in3;
        tmp0 = in0 ^ in2;
        tmp1 = in3 ^ in5;
        out5 = in2 ^ in4;
        tmp2 = tmp0 ^ out4;
        out2 = tmp1 ^ in4;
        out6 = tmp1 ^ in0;
        out0 = tmp1 ^ tmp2 ^ in7;
        out1 = tmp2 ^ in6;
        out7 = out1 ^ out5 ^ in3;
        out3 = tmp0 ^ out7 ^ in7;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "C4"; see the layout notes at the top of this group. */
static void gf8_muladd_C4(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1;

        tmp0 = in3 ^ in7;
        out3 = tmp0 ^ in4;
        tmp1 = tmp0 ^ in2;
        out1 = tmp1 ^ in6;
        out5 = tmp1 ^ in5;
        out4 = out1 ^ out3 ^ in1;
        out0 = out4 ^ in4 ^ in5;
        out2 = out0 ^ out3 ^ in0;
        out7 = out1 ^ out2 ^ in7;
        out6 = tmp1 ^ out0 ^ out7;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "C5"; see the layout notes at the top of this group. */
static void gf8_muladd_C5(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0;

        out3 = in4 ^ in7;
        tmp0 = in3 ^ in7;
        out4 = in1 ^ in2 ^ in6;
        out6 = in0 ^ in3 ^ in4;
        out5 = tmp0 ^ in2;
        out1 = tmp0 ^ out4;
        out0 = out4 ^ in0 ^ in5;
        out2 = out0 ^ out5 ^ in4;
        out7 = tmp0 ^ out2 ^ in6;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "C6"; see the layout notes at the top of this group. */
static void gf8_muladd_C6(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

        tmp0 = in5 ^ in6;
        tmp1 = in1 ^ in7;
        tmp2 = tmp0 ^ in0;
        tmp3 = tmp0 ^ tmp1;
        tmp4 = tmp2 ^ in4;
        out0 = tmp3 ^ in2;
        out6 = tmp4 ^ in3;
        out2 = out6 ^ in2;
        out7 = tmp1 ^ tmp4;
        out3 = tmp2 ^ out2;
        tmp5 = out3 ^ in5;
        out5 = tmp5 ^ in7;
        out4 = tmp3 ^ tmp5;
        out1 = tmp4 ^ out5;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "C7"; see the layout notes at the top of this group. */
static void gf8_muladd_C7(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1;

        out3 = in2 ^ in4;
        tmp0 = in3 ^ in5;
        tmp1 = out3 ^ in7;
        out6 = tmp0 ^ in0 ^ in4;
        out5 = tmp1 ^ in3;
        out2 = out6 ^ in6;
        out7 = out2 ^ in1 ^ in3;
        out0 = tmp1 ^ out7;
        out1 = tmp0 ^ out0;
        out4 = out1 ^ in0;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "C8"; see the layout notes at the top of this group. */
static void gf8_muladd_C8(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1;

        out0 = in1 ^ in2;
        out1 = in2 ^ in3;
        tmp0 = in5 ^ in6;
        tmp1 = in0 ^ in7;
        out2 = out1 ^ in1 ^ in4;
        out4 = tmp0 ^ in4;
        out5 = tmp0 ^ in7;
        out6 = tmp1 ^ in6;
        out7 = tmp1 ^ in1;
        out3 = out2 ^ in0 ^ in2 ^ in5;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "C9"; see the layout notes at the top of this group. */
static void gf8_muladd_C9(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0;

        out4 = in5 ^ in6;
        out7 = in0 ^ in1;
        tmp0 = in1 ^ in3;
        out5 = in6 ^ in7;
        out6 = in0 ^ in7;
        out0 = out7 ^ in2;
        out3 = out7 ^ in4 ^ in5;
        out1 = tmp0 ^ in2;
        out2 = tmp0 ^ in4;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "CA"; see the layout notes at the top of this group. */
static void gf8_muladd_CA(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1, tmp2, tmp3;

        tmp0 = in0 ^ in7;
        tmp1 = in2 ^ in7;
        tmp2 = tmp0 ^ in6;
        out0 = tmp1 ^ in1;
        tmp3 = tmp1 ^ in3;
        out6 = tmp2 ^ in5;
        out7 = tmp2 ^ in1;
        out2 = tmp3 ^ in4;
        out5 = out6 ^ in0 ^ in4;
        out4 = out5 ^ in3;
        out1 = tmp0 ^ tmp3;
        out3 = tmp3 ^ out5 ^ out7;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "CB"; see the layout notes at the top of this group. */
static void gf8_muladd_CB(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1, tmp2;

        tmp0 = in4 ^ in7;
        tmp1 = in5 ^ in7;
        out7 = in0 ^ in1 ^ in6;
        out5 = tmp0 ^ in6;
        out2 = tmp0 ^ in3;
        out6 = tmp1 ^ in0;
        out4 = tmp1 ^ in3 ^ in6;
        tmp2 = out5 ^ out7 ^ in2;
        out1 = tmp2 ^ out2;
        out0 = tmp2 ^ in4;
        out3 = tmp2 ^ in5;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "CC"; see the layout notes at the top of this group. */
static void gf8_muladd_CC(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1, tmp2, tmp3;

        tmp0 = in3 ^ in5;
        tmp1 = in1 ^ in6;
        out1 = in2 ^ in3 ^ in7;
        out5 = tmp0 ^ in6;
        out0 = tmp1 ^ in2;
        tmp2 = out5 ^ in0 ^ in7;
        out3 = tmp2 ^ in4;
        out6 = tmp0 ^ out3;
        out7 = tmp1 ^ tmp2 ^ in3;
        tmp3 = out1 ^ out6;
        out4 = tmp2 ^ tmp3;
        out2 = tmp3 ^ in1;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

/* XOR sequence "CD"; see the layout notes at the top of this group. */
static void gf8_muladd_CD(void *out, void *in)
{
    const uint64_t *src = in;
    uint64_t *dst = out;
    unsigned int pos;

    for (pos = 0; pos < WIDTH; pos++, src++, dst++) {
        const uint64_t in0 = dst[0];
        const uint64_t in1 = dst[WIDTH];
        const uint64_t in2 = dst[WIDTH * 2];
        const uint64_t in3 = dst[WIDTH * 3];
        const uint64_t in4 = dst[WIDTH * 4];
        const uint64_t in5 = dst[WIDTH * 5];
        const uint64_t in6 = dst[WIDTH * 6];
        const uint64_t in7 = dst[WIDTH * 7];
        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
        uint64_t tmp0, tmp1;

        out5 = in3 ^ in6;
        tmp0 = in0 ^ in1;
        tmp1 = in2 ^ in7;
        out6 = in0 ^ in4 ^ in7;
        out2 = tmp0 ^ out5 ^ in4;
        out7 = tmp0 ^ in5;
        out0 = tmp0 ^ in2 ^ in6;
        out4 = tmp1 ^ in5;
        out1 = tmp1 ^ in1 ^ in3;
        out3 = out6 ^ in5 ^ in6;

        dst[0]         = out0 ^ src[0];
        dst[WIDTH]     = out1 ^ src[WIDTH];
        dst[WIDTH * 2] = out2 ^ src[WIDTH * 2];
        dst[WIDTH * 3] = out3 ^ src[WIDTH * 3];
        dst[WIDTH * 4] = out4 ^ src[WIDTH * 4];
        dst[WIDTH * 5] = out5 ^ src[WIDTH * 5];
        dst[WIDTH * 6] = out6 ^ src[WIDTH * 6];
        dst[WIDTH * 7] = out7 ^ src[WIDTH * 7];
    }
}

+static void gf8_muladd_CE(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in5; +        tmp1 = tmp0 ^ in3; +        out4 = tmp1 ^ in4; +        tmp2 = out4 ^ in6; +        out3 = tmp2 ^ in0; +        out5 = tmp2 ^ in2; +        out2 = out3 ^ in5 ^ in7; +        out6 = tmp1 ^ out2; +        out7 = out2 ^ out4 ^ in1; +       
 out1 = tmp2 ^ out6; +        out0 = tmp0 ^ out7 ^ in0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_CF(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in3 ^ in6; +        tmp1 = in0 ^ in1 ^ in5; +        out4 = in2 ^ in3 ^ in5; +        out5 = tmp0 ^ in4; +        out7 = tmp1 ^ in6; +        out1 = tmp1 ^ out4 ^ in7; +        tmp2 = out5 ^ in0; +        out2 = tmp2 ^ in7; +        out3 = tmp2 ^ out4; +        out6 = tmp0 ^ out2 ^ in5; +        out0 = tmp0 ^ out1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void 
gf8_muladd_D0(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in3; +        tmp1 = in1 ^ in4; +        tmp2 = in2 ^ in5; +        out7 = tmp0 ^ tmp1; +        out0 = tmp1 ^ tmp2; +        tmp3 = tmp2 ^ in3; +        out1 = tmp3 ^ in6; +        tmp4 = out1 ^ in1; +        out2 = tmp4 ^ in7; +        out3 = out2 ^ in2; +        out4 = tmp0 ^ out3; +        out5 = tmp3 ^ out3; +        out6 = tmp4 ^ out4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_D1(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        
uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in3 ^ in5 ^ in6; +        tmp1 = tmp0 ^ in1; +        out1 = tmp1 ^ in2; +        out2 = tmp1 ^ in7; +        out3 = out2 ^ in3; +        out5 = out3 ^ in2; +        tmp2 = out3 ^ in0; +        out4 = tmp2 ^ in4; +        out7 = tmp0 ^ out4; +        out6 = tmp2 ^ out1 ^ in6; +        out0 = out2 ^ out6 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_D2(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in5 ^ in6; +        out2 = tmp0 ^ in2 ^ in3; +        out1 = out2 ^ in0; +        out3 = out2 ^ in1; +        out4 = out1 ^ in1 ^ in2; +        out6 = out1 ^ in6 ^ in7; +        out7 = out4 ^ in4 ^ in5; +        out5 = out4 ^ out6 ^ in4; +        out0 = tmp0 ^ out5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        
out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_D3(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in3 ^ in5 ^ in6; +        tmp0 = out2 ^ in2; +        tmp1 = tmp0 ^ in1; +        out1 = tmp1 ^ in0; +        out3 = tmp1 ^ in3; +        out4 = out1 ^ in2 ^ in4; +        tmp2 = out4 ^ in5; +        out7 = tmp2 ^ in7; +        out0 = tmp0 ^ out7; +        tmp3 = out0 ^ in0; +        out5 = tmp3 ^ in6; +        out6 = tmp2 ^ tmp3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_D4(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        
uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in3 ^ in5; +        tmp0 = in1 ^ in5; +        tmp1 = tmp0 ^ in2; +        out4 = tmp1 ^ in0; +        tmp2 = tmp1 ^ in6; +        out2 = out4 ^ in3 ^ in7; +        out0 = tmp2 ^ in4; +        out5 = tmp2 ^ out3; +        out1 = tmp0 ^ out5 ^ in7; +        out6 = tmp0 ^ out2 ^ in4; +        out7 = tmp1 ^ out6 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_D5(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in5; +        tmp0 = in0 ^ in4; +        tmp1 = tmp0 ^ in1 ^ in5; +        out4 = tmp1 ^ in2; +  
      out0 = out4 ^ in6; +        tmp2 = tmp0 ^ out0; +        out5 = tmp2 ^ in3; +        out1 = out5 ^ in7; +        out6 = tmp1 ^ out1; +        out7 = tmp2 ^ out6; +        out2 = out7 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_D6(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in2 ^ in4 ^ in6; +        out5 = tmp0 ^ in3; +        out0 = tmp0 ^ in5 ^ in7; +        out3 = out0 ^ out5 ^ in2; +        tmp1 = out3 ^ in0; +        out1 = tmp1 ^ in6; +        out2 = tmp1 ^ in7; +        out4 = tmp1 ^ in1; +        out6 = tmp1 ^ in4; +        out7 = tmp0 ^ out2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        
out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_D7(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in3; +        out3 = in2 ^ in5 ^ in7; +        out2 = tmp0 ^ in5; +        tmp1 = tmp0 ^ out3 ^ in1; +        out1 = tmp1 ^ in6; +        out4 = tmp1 ^ in4; +        tmp2 = out1 ^ in4; +        out6 = tmp2 ^ in1; +        out7 = tmp2 ^ in2; +        out0 = tmp2 ^ in3; +        out5 = tmp2 ^ in0 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_D8(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +     
   uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in0; +        out5 = in1; +        tmp0 = in1 ^ in2; +        out6 = in0 ^ in2; +        out0 = tmp0 ^ in4; +        tmp1 = tmp0 ^ in3; +        out7 = tmp1 ^ out6; +        out2 = tmp1 ^ in6; +        out3 = out7 ^ in7; +        out1 = tmp1 ^ in1 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_D9(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in0 ^ in4; +        out5 = in1 ^ in5; +        out2 = in1 ^ in3 ^ in6; +        out3 = in0 ^ in1 ^ in7; +        out6 = in0 ^ in2 ^ in6; +        out0 = out4 ^ in1 ^ in2; +        out1 = out5 ^ in2 ^ in3; +        out7 = out3 ^ in3; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 
4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_DA(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out5 = in1 ^ in4; +        tmp0 = in2 ^ in7; +        tmp1 = in0 ^ in2 ^ in3; +        out0 = tmp0 ^ out5; +        out4 = tmp0 ^ tmp1; +        out2 = tmp0 ^ in3 ^ in6; +        out1 = tmp1 ^ in5; +        out3 = tmp1 ^ in1; +        out6 = out1 ^ in3; +        out7 = out3 ^ in2 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_DB(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = 
out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in1; +        tmp1 = in1 ^ in5; +        tmp2 = in3 ^ in7; +        out3 = tmp0 ^ in2; +        out5 = tmp1 ^ in4; +        out6 = tmp1 ^ out3 ^ in6; +        out2 = tmp2 ^ in6; +        tmp3 = tmp2 ^ in4; +        tmp4 = out3 ^ in3; +        out4 = tmp3 ^ in0; +        out1 = tmp4 ^ in5; +        out0 = tmp3 ^ tmp4; +        out7 = tmp0 ^ out2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_DC(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in2; +        tmp1 = in0 ^ in3; +        out6 = tmp0 ^ in4; +        tmp2 = tmp0 ^ in7; +        out3 = tmp1 ^ in6; +        tmp3 = tmp1 ^ in1; +        out1 = tmp1 ^ 
tmp2 ^ in5; +        out4 = tmp2 ^ in6; +        out2 = tmp3 ^ in2; +        out7 = tmp3 ^ in5; +        out5 = tmp2 ^ out2; +        out0 = out2 ^ out3 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_DD(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in0 ^ in6; +        out2 = in0 ^ in1 ^ in3; +        out6 = out3 ^ in2 ^ in4; +        out7 = out2 ^ in5 ^ in7; +        out0 = out6 ^ in1; +        out4 = out6 ^ in7; +        out5 = out7 ^ in0; +        out1 = out5 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_DE(void *out, 
void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in3 ^ in6; +        tmp1 = in3 ^ in4 ^ in7; +        out4 = tmp0 ^ in0; +        out5 = tmp1 ^ in1; +        out3 = out4 ^ in7; +        out2 = out3 ^ in6; +        out1 = out2 ^ in5; +        out6 = tmp1 ^ out1; +        out0 = tmp0 ^ out5; +        out7 = out0 ^ out1 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_DF(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH 
* 7]; + +        out2 = in0 ^ in3 ^ in7; +        tmp0 = out2 ^ in1 ^ in5; +        out1 = tmp0 ^ in2; +        out7 = tmp0 ^ in6; +        out5 = tmp0 ^ in0 ^ in4; +        tmp1 = out1 ^ out5 ^ in6; +        out4 = tmp1 ^ in3; +        out6 = tmp1 ^ in5; +        tmp2 = tmp1 ^ in7; +        out0 = tmp2 ^ in1; +        out3 = tmp2 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_E0(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in1 ^ in7; +        tmp0 = in2 ^ in4; +        out4 = out3 ^ in3 ^ in5; +        out2 = tmp0 ^ in1; +        tmp1 = tmp0 ^ in6; +        out0 = out4 ^ in2; +        out6 = out4 ^ in0; +        out1 = tmp1 ^ in3; +        out5 = tmp1 ^ in0; +        out7 = out5 ^ in1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +       
 out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_E1(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in1 ^ in4; +        tmp0 = in1 ^ in7; +        out3 = tmp0 ^ in3; +        tmp1 = out3 ^ in5; +        out4 = tmp1 ^ in4; +        tmp2 = tmp1 ^ in0; +        out0 = tmp2 ^ in2; +        out6 = tmp2 ^ in6; +        tmp3 = out0 ^ out4 ^ in6; +        out5 = tmp3 ^ in5; +        out7 = tmp0 ^ tmp3; +        out1 = tmp2 ^ out5 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_E2(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t 
in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in1 ^ in2; +        out4 = in1 ^ in5; +        out2 = in2 ^ in4 ^ in7; +        out5 = in0 ^ in2 ^ in6; +        out0 = out3 ^ in3 ^ in5; +        out7 = out3 ^ in0 ^ in4; +        out6 = out2 ^ out7 ^ in3; +        out1 = out5 ^ in3 ^ in4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_E3(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in4 ^ in7; +        tmp0 = in1 ^ in3; +        out3 = tmp0 ^ in2; +        tmp1 = out3 ^ in0; +        out0 = tmp1 ^ in5; +        tmp2 = tmp1 ^ in4; +        out1 = tmp2 ^ in6; +        tmp3 = tmp2 ^ in3; +        out7 = tmp3 ^ in7; +        out6 = out1 ^ out2 ^ in2; +        tmp4 = tmp0 ^ out0; +        
out5 = tmp4 ^ in6; +        out4 = tmp3 ^ tmp4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_E4(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in6; +        tmp0 = in0 ^ in4; +        tmp1 = tmp0 ^ in2 ^ in6; +        out2 = tmp1 ^ in1; +        out7 = out2 ^ in5; +        tmp2 = tmp0 ^ out7; +        out4 = tmp2 ^ in3; +        out0 = out4 ^ in7; +        out6 = tmp1 ^ out0; +        out5 = tmp2 ^ out6; +        out1 = out5 ^ in0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_E5(void *out, void *in) 
+{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in3 ^ in6; +        tmp0 = in0 ^ in1; +        tmp1 = in5 ^ in7; +        out2 = tmp0 ^ in4 ^ in6; +        tmp2 = tmp1 ^ out2; +        out6 = tmp2 ^ in3; +        out7 = tmp2 ^ in2; +        out0 = out6 ^ in2 ^ in4; +        out5 = out6 ^ in1 ^ in2; +        out1 = tmp0 ^ out5 ^ in5; +        out4 = tmp1 ^ out1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xE6 - TODO confirm. */ static void gf8_muladd_E6(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; 
+ +        out3 = in2 ^ in6 ^ in7; +        out2 = out3 ^ in0 ^ in4; +        out4 = out3 ^ in1 ^ in5; +        out1 = out2 ^ in3; +        out7 = out2 ^ out4 ^ in2; +        out0 = out4 ^ in3 ^ in7; +        out5 = out1 ^ in4; +        out6 = out0 ^ out2 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xE7 - TODO confirm. */ static void gf8_muladd_E7(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in3; +        out3 = tmp0 ^ in6 ^ in7; +        tmp1 = out3 ^ in0; +        out5 = tmp1 ^ in5; +        tmp2 = tmp1 ^ in4; +        tmp3 = out5 ^ in7; +        out1 = tmp2 ^ in1; +        out0 = tmp3 ^ in1; +        out6 = out1 ^ in2; +        out2 = tmp0 ^ tmp2; +        tmp4 = tmp3 ^ out6; +        out4 = tmp4 ^ in6; +        out7 = tmp4 ^ in0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 
^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xE8 - TODO confirm. */ static void gf8_muladd_E8(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in3 ^ in6; +        tmp0 = in4 ^ in7; +        out1 = in2 ^ in3 ^ in4; +        out5 = tmp0 ^ in0; +        tmp1 = tmp0 ^ in1; +        tmp2 = tmp1 ^ in5; +        out0 = tmp1 ^ out1; +        out2 = tmp2 ^ in2; +        out6 = tmp2 ^ out5; +        tmp3 = out6 ^ in6; +        out3 = tmp3 ^ in7; +        out7 = tmp3 ^ in2 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xE9 - TODO confirm. */ static void gf8_muladd_E9(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, 
tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in1; +        tmp1 = in3 ^ in6; +        tmp2 = tmp0 ^ in6; +        out4 = tmp1 ^ in4; +        out6 = tmp2 ^ in5; +        out7 = tmp2 ^ in2 ^ in7; +        out3 = out6 ^ in3 ^ in7; +        out0 = tmp1 ^ out7; +        out2 = out3 ^ out4 ^ in0; +        out5 = tmp0 ^ out2; +        out1 = out0 ^ out5 ^ in5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xEA - TODO confirm. */ static void gf8_muladd_EA(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in6 ^ in7; +        out5 = in0 ^ in7; +        out6 = in0 ^ in1; +        out0 = in1 ^ in2 ^ in3; +        out2 = in2 ^ in4 ^ in5; +        out7 = out6 ^ in2; +        out1 = out0 ^ out6 ^ in4; +        out3 = out7 ^ 
in5 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xEB - TODO confirm. */ static void gf8_muladd_EB(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in4 ^ in5; +        tmp0 = in0 ^ in1; +        out4 = in4 ^ in6 ^ in7; +        out5 = in0 ^ in5 ^ in7; +        out6 = tmp0 ^ in6; +        tmp1 = tmp0 ^ in2; +        out0 = tmp1 ^ in3; +        out7 = tmp1 ^ in7; +        out1 = out0 ^ in4; +        out3 = out0 ^ in5 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xEC - TODO confirm. */ static void gf8_muladd_EC(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t 
*)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out3 = in0 ^ in5; +        out4 = in2 ^ in3 ^ in7; +        out5 = in0 ^ in3 ^ in4; +        out6 = out3 ^ in1 ^ in4; +        out1 = out4 ^ in4; +        out0 = out4 ^ in1 ^ in6; +        out2 = out0 ^ out5 ^ in5; +        out7 = out2 ^ in4 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xED - TODO confirm. */ static void gf8_muladd_ED(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in2 ^ in4; +        tmp1 = in3 ^ in5; +        out4 = tmp0 ^ in3 ^ in7; +        out3 = tmp1 ^ in0; +        out1 = out4 
^ in1; +        out5 = out3 ^ in4; +        out7 = out1 ^ out5 ^ in6; +        out2 = tmp0 ^ out7; +        out0 = tmp1 ^ out7; +        out6 = out2 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xEE - TODO confirm. */ static void gf8_muladd_EE(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in2; +        tmp0 = in0 ^ in1; +        out5 = in0 ^ in3; +        tmp1 = tmp0 ^ in2; +        out6 = tmp0 ^ in4; +        tmp2 = tmp1 ^ out5; +        out7 = tmp1 ^ in5; +        out1 = tmp2 ^ out6 ^ in7; +        out0 = tmp2 ^ in6; +        tmp3 = out7 ^ in1; +        out3 = tmp3 ^ in7; +        out2 = tmp3 ^ in4 ^ in6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        
out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xEF - TODO confirm. */ static void gf8_muladd_EF(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out4 = in2 ^ in4; +        tmp0 = in0 ^ in5; +        tmp1 = in4 ^ in6; +        out5 = tmp0 ^ in3; +        out2 = tmp0 ^ tmp1; +        out6 = tmp1 ^ in0 ^ in1; +        out3 = out5 ^ in2 ^ in7; +        out7 = out3 ^ in1 ^ in3; +        out0 = out4 ^ out6 ^ in3; +        out1 = tmp1 ^ out0 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xF0 - TODO confirm. */ static void gf8_muladd_F0(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        
uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in2; +        tmp1 = in4 ^ in5; +        out2 = tmp0 ^ in6; +        out3 = tmp1 ^ in1; +        tmp2 = tmp1 ^ in7; +        out1 = out2 ^ out3 ^ in3; +        tmp3 = tmp0 ^ tmp2; +        out0 = tmp3 ^ in3; +        out5 = tmp3 ^ in0; +        out4 = out1 ^ out5 ^ in4; +        out7 = out4 ^ in2; +        out6 = tmp2 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xF1 - TODO confirm. */ static void gf8_muladd_F1(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in1 ^ in6; +        tmp0 = in3 ^ in5; +        out3 = tmp0 ^ in1 ^ in4; +        tmp1 = out3 ^ in2; +        out1 = tmp1 ^ in6; +        tmp2 = tmp1 ^ in0; +        tmp3 = out1 ^ in5; +        out0 = tmp2 ^ in7; +        out6 = tmp2 ^ in4; +        out7 = tmp3 ^ in0; +        out5 = tmp0 ^ out0; +        out4 = tmp3 ^ out5 ^ in1; + +        
out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xF2 - TODO confirm. */ static void gf8_muladd_F2(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in4 ^ in5; +        out2 = in2 ^ in6 ^ in7; +        tmp1 = tmp0 ^ in1; +        tmp2 = tmp1 ^ in2; +        out0 = tmp2 ^ in3; +        out3 = tmp2 ^ in7; +        out5 = out3 ^ in0 ^ in4; +        tmp3 = tmp0 ^ out5; +        out7 = tmp3 ^ in3; +        out4 = tmp3 ^ out2; +        out1 = out0 ^ out4 ^ in4; +        out6 = tmp1 ^ out1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xF3 - TODO confirm. */ static void gf8_muladd_F3(void *out, void *in) +{ +  
  unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in6 ^ in7; +        tmp0 = in0 ^ in1; +        out4 = tmp0 ^ in6; +        tmp1 = tmp0 ^ in2; +        out5 = tmp1 ^ in7; +        out6 = tmp1 ^ in3; +        out7 = out6 ^ in4; +        out0 = out7 ^ in5; +        out1 = out0 ^ in6; +        out3 = out0 ^ in0 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xF4 - TODO confirm. */ static void gf8_muladd_F4(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in0 ^ in1 ^ in2; + 
       tmp0 = out2 ^ in3; +        out4 = tmp0 ^ in4; +        out5 = out4 ^ in5; +        out6 = out5 ^ in6; +        out7 = out6 ^ in7; +        out0 = out7 ^ in0; +        out1 = out0 ^ in1; +        out3 = tmp0 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xF5 - TODO confirm. */ static void gf8_muladd_F5(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in0 ^ in1; +        tmp0 = out2 ^ in2; +        out4 = tmp0 ^ in3; +        out5 = out4 ^ in4; +        out6 = out5 ^ in5; +        out7 = out6 ^ in6; +        out0 = out7 ^ in7; +        out1 = out0 ^ in0; +        out3 = tmp0 ^ out0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH 
* 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xF6 - TODO confirm. */ static void gf8_muladd_F6(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in7; +        out2 = tmp0 ^ in2; +        out4 = out2 ^ in1 ^ in4; +        out7 = out4 ^ in3 ^ in5; +        out5 = out7 ^ in4 ^ in7; +        out0 = tmp0 ^ out7 ^ in6; +        tmp1 = out0 ^ in1; +        out6 = out0 ^ in0 ^ in5; +        out3 = tmp1 ^ in3; +        out1 = tmp0 ^ tmp1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xF7 - TODO confirm. */ static void gf8_muladd_F7(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = 
out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in0 ^ in7; +        tmp0 = out2 ^ in1; +        out4 = tmp0 ^ in2; +        out5 = out4 ^ in3 ^ in7; +        out6 = out5 ^ in4; +        out7 = out6 ^ in5; +        out0 = out7 ^ in6; +        out1 = out0 ^ in7; +        out3 = tmp0 ^ out1; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xF8 - TODO confirm. */ static void gf8_muladd_F8(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in4; +        tmp1 = in3 ^ in5; +        tmp2 = tmp0 ^ in6; +        out4 = tmp0 ^ tmp1; +        out1 = tmp1 ^ in2 ^ in4; +        out3 = tmp2 ^ in1; +        out5 = out3 ^ in5; +        out7 = out1 ^ out5 ^ in7; +        out6 = tmp1 ^ out7; +        out0 = tmp2 ^ out7; +        out2 = out6 ^ in0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ 
in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xF9 - TODO confirm. */ static void gf8_muladd_F9(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in3 ^ in5; +        tmp1 = in0 ^ in6; +        out4 = tmp0 ^ in0; +        tmp2 = tmp1 ^ in4; +        tmp3 = tmp1 ^ in2; +        out5 = tmp2 ^ in1; +        out3 = out5 ^ in3; +        tmp4 = tmp3 ^ out3; +        out1 = tmp4 ^ in5; +        out0 = tmp4 ^ in0 ^ in7; +        out6 = tmp0 ^ out0 ^ in4; +        out7 = tmp2 ^ tmp4; +        out2 = tmp3 ^ out6; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xFA - TODO confirm. */ static void gf8_muladd_FA(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +      
  uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in1; +        tmp1 = tmp0 ^ in2; +        tmp2 = tmp0 ^ in5; +        tmp3 = tmp1 ^ in7; +        out5 = tmp2 ^ in6; +        out6 = tmp3 ^ in6; +        out7 = tmp3 ^ in3; +        out3 = out6 ^ in4; +        out2 = tmp1 ^ out5; +        out4 = out2 ^ out3 ^ in1; +        out0 = out4 ^ out7 ^ in5; +        out1 = tmp2 ^ out0; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xFB - TODO confirm. */ static void gf8_muladd_FB(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in5 ^ in6; +        tmp0 = in0 ^ in1; +        out4 = in0 ^ in5 ^ in7; +      
  out5 = tmp0 ^ in6; +        tmp1 = tmp0 ^ in2; +        out6 = tmp1 ^ in7; +        out7 = tmp1 ^ in3; +        out0 = out7 ^ in4; +        out1 = out0 ^ in5; +        out3 = out0 ^ in6 ^ in7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +/* Updates out in place: each pass loads 8 words of out (stride WIDTH), mixes them through a fixed XOR network, XORs in the matching words of in and stores the result; WIDTH passes. NOTE(review): name suggests the network is GF(2^8) multiplication by 0xFC - TODO confirm. */ static void gf8_muladd_FC(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in1 ^ in2; +        tmp1 = in0 ^ in7; +        out2 = tmp0 ^ tmp1 ^ in5; +        out3 = tmp1 ^ in4; +        tmp2 = out2 ^ in6; +        out6 = tmp2 ^ in4; +        out7 = tmp2 ^ in3; +        out4 = out6 ^ in1 ^ in3; +        tmp3 = out4 ^ in0; +        out1 = tmp3 ^ in6; +        out0 = tmp3 ^ in1 ^ in5; +        out5 = tmp0 ^ out4; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        
out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_FD(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in5; +        tmp1 = in1 ^ in7; +        out2 = tmp0 ^ tmp1; +        out6 = out2 ^ in2 ^ in4; +        tmp2 = out6 ^ in0; +        out1 = tmp2 ^ in3; +        out0 = tmp0 ^ out1 ^ in6; +        out5 = out0 ^ in2; +        tmp3 = out5 ^ in1; +        out3 = tmp3 ^ in6; +        out7 = tmp2 ^ tmp3; +        out4 = tmp1 ^ out7; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_FE(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = 
out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        tmp0 = in0 ^ in2; +        out2 = tmp0 ^ in5; +        out3 = tmp0 ^ in4; +        tmp1 = out3 ^ in6; +        out4 = tmp1 ^ in5; +        tmp2 = tmp1 ^ in1; +        out6 = tmp2 ^ in7; +        tmp3 = tmp2 ^ in0; +        out0 = tmp3 ^ in3; +        tmp4 = out0 ^ out4 ^ in7; +        out5 = tmp4 ^ in6; +        out7 = tmp4 ^ in2; +        out1 = tmp3 ^ out5; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void gf8_muladd_FF(void *out, void *in) +{ +    unsigned int i; +    uint64_t *in_ptr = (uint64_t *)in; +    uint64_t *out_ptr = (uint64_t *)out; + +    for (i = 0; i < WIDTH; i++) { +        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; +        uint64_t tmp0, tmp1, tmp2, tmp3; + +        uint64_t in0 = out_ptr[0]; +        uint64_t in1 = out_ptr[WIDTH]; +        uint64_t in2 = out_ptr[WIDTH * 2]; +        uint64_t in3 = out_ptr[WIDTH * 3]; +        uint64_t in4 = out_ptr[WIDTH * 4]; +        uint64_t in5 = out_ptr[WIDTH * 5]; +        uint64_t in6 = out_ptr[WIDTH * 6]; +        uint64_t in7 = out_ptr[WIDTH * 7]; + +        out2 = in0 ^ in5; +        tmp0 = in4 ^ in7; +        tmp1 = out2 ^ in2; +        out4 = tmp1 ^ in6; +        out7 = tmp1 ^ in1 ^ in3; +        out1 = tmp0 ^ out7; +        tmp2 = out1 ^ in5; +        out6 = tmp2 ^ 
in3; +        tmp3 = tmp2 ^ in7; +        out0 = tmp3 ^ in6; +        out3 = tmp3 ^ in1; +        out5 = tmp0 ^ out0 ^ in2; + +        out_ptr[0] = out0 ^ in_ptr[0]; +        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH]; +        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2]; +        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3]; +        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4]; +        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5]; +        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6]; +        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7]; + +        in_ptr++; +        out_ptr++; +    } +} + +static void (*gf8_muladd[])(void *out, void *in) = { +    gf8_muladd_00, gf8_muladd_01, gf8_muladd_02, gf8_muladd_03, +    gf8_muladd_04, gf8_muladd_05, gf8_muladd_06, gf8_muladd_07, +    gf8_muladd_08, gf8_muladd_09, gf8_muladd_0A, gf8_muladd_0B, +    gf8_muladd_0C, gf8_muladd_0D, gf8_muladd_0E, gf8_muladd_0F, +    gf8_muladd_10, gf8_muladd_11, gf8_muladd_12, gf8_muladd_13, +    gf8_muladd_14, gf8_muladd_15, gf8_muladd_16, gf8_muladd_17, +    gf8_muladd_18, gf8_muladd_19, gf8_muladd_1A, gf8_muladd_1B, +    gf8_muladd_1C, gf8_muladd_1D, gf8_muladd_1E, gf8_muladd_1F, +    gf8_muladd_20, gf8_muladd_21, gf8_muladd_22, gf8_muladd_23, +    gf8_muladd_24, gf8_muladd_25, gf8_muladd_26, gf8_muladd_27, +    gf8_muladd_28, gf8_muladd_29, gf8_muladd_2A, gf8_muladd_2B, +    gf8_muladd_2C, gf8_muladd_2D, gf8_muladd_2E, gf8_muladd_2F, +    gf8_muladd_30, gf8_muladd_31, gf8_muladd_32, gf8_muladd_33, +    gf8_muladd_34, gf8_muladd_35, gf8_muladd_36, gf8_muladd_37, +    gf8_muladd_38, gf8_muladd_39, gf8_muladd_3A, gf8_muladd_3B, +    gf8_muladd_3C, gf8_muladd_3D, gf8_muladd_3E, gf8_muladd_3F, +    gf8_muladd_40, gf8_muladd_41, gf8_muladd_42, gf8_muladd_43, +    gf8_muladd_44, gf8_muladd_45, gf8_muladd_46, gf8_muladd_47, +    gf8_muladd_48, gf8_muladd_49, gf8_muladd_4A, gf8_muladd_4B, +    gf8_muladd_4C, gf8_muladd_4D, gf8_muladd_4E, gf8_muladd_4F, +    gf8_muladd_50, gf8_muladd_51, 
gf8_muladd_52, gf8_muladd_53, +    gf8_muladd_54, gf8_muladd_55, gf8_muladd_56, gf8_muladd_57, +    gf8_muladd_58, gf8_muladd_59, gf8_muladd_5A, gf8_muladd_5B, +    gf8_muladd_5C, gf8_muladd_5D, gf8_muladd_5E, gf8_muladd_5F, +    gf8_muladd_60, gf8_muladd_61, gf8_muladd_62, gf8_muladd_63, +    gf8_muladd_64, gf8_muladd_65, gf8_muladd_66, gf8_muladd_67, +    gf8_muladd_68, gf8_muladd_69, gf8_muladd_6A, gf8_muladd_6B, +    gf8_muladd_6C, gf8_muladd_6D, gf8_muladd_6E, gf8_muladd_6F, +    gf8_muladd_70, gf8_muladd_71, gf8_muladd_72, gf8_muladd_73, +    gf8_muladd_74, gf8_muladd_75, gf8_muladd_76, gf8_muladd_77, +    gf8_muladd_78, gf8_muladd_79, gf8_muladd_7A, gf8_muladd_7B, +    gf8_muladd_7C, gf8_muladd_7D, gf8_muladd_7E, gf8_muladd_7F, +    gf8_muladd_80, gf8_muladd_81, gf8_muladd_82, gf8_muladd_83, +    gf8_muladd_84, gf8_muladd_85, gf8_muladd_86, gf8_muladd_87, +    gf8_muladd_88, gf8_muladd_89, gf8_muladd_8A, gf8_muladd_8B, +    gf8_muladd_8C, gf8_muladd_8D, gf8_muladd_8E, gf8_muladd_8F, +    gf8_muladd_90, gf8_muladd_91, gf8_muladd_92, gf8_muladd_93, +    gf8_muladd_94, gf8_muladd_95, gf8_muladd_96, gf8_muladd_97, +    gf8_muladd_98, gf8_muladd_99, gf8_muladd_9A, gf8_muladd_9B, +    gf8_muladd_9C, gf8_muladd_9D, gf8_muladd_9E, gf8_muladd_9F, +    gf8_muladd_A0, gf8_muladd_A1, gf8_muladd_A2, gf8_muladd_A3, +    gf8_muladd_A4, gf8_muladd_A5, gf8_muladd_A6, gf8_muladd_A7, +    gf8_muladd_A8, gf8_muladd_A9, gf8_muladd_AA, gf8_muladd_AB, +    gf8_muladd_AC, gf8_muladd_AD, gf8_muladd_AE, gf8_muladd_AF, +    gf8_muladd_B0, gf8_muladd_B1, gf8_muladd_B2, gf8_muladd_B3, +    gf8_muladd_B4, gf8_muladd_B5, gf8_muladd_B6, gf8_muladd_B7, +    gf8_muladd_B8, gf8_muladd_B9, gf8_muladd_BA, gf8_muladd_BB, +    gf8_muladd_BC, gf8_muladd_BD, gf8_muladd_BE, gf8_muladd_BF, +    gf8_muladd_C0, gf8_muladd_C1, gf8_muladd_C2, gf8_muladd_C3, +    gf8_muladd_C4, gf8_muladd_C5, gf8_muladd_C6, gf8_muladd_C7, +    gf8_muladd_C8, gf8_muladd_C9, gf8_muladd_CA, gf8_muladd_CB, +    gf8_muladd_CC, 
gf8_muladd_CD, gf8_muladd_CE, gf8_muladd_CF, +    gf8_muladd_D0, gf8_muladd_D1, gf8_muladd_D2, gf8_muladd_D3, +    gf8_muladd_D4, gf8_muladd_D5, gf8_muladd_D6, gf8_muladd_D7, +    gf8_muladd_D8, gf8_muladd_D9, gf8_muladd_DA, gf8_muladd_DB, +    gf8_muladd_DC, gf8_muladd_DD, gf8_muladd_DE, gf8_muladd_DF, +    gf8_muladd_E0, gf8_muladd_E1, gf8_muladd_E2, gf8_muladd_E3, +    gf8_muladd_E4, gf8_muladd_E5, gf8_muladd_E6, gf8_muladd_E7, +    gf8_muladd_E8, gf8_muladd_E9, gf8_muladd_EA, gf8_muladd_EB, +    gf8_muladd_EC, gf8_muladd_ED, gf8_muladd_EE, gf8_muladd_EF, +    gf8_muladd_F0, gf8_muladd_F1, gf8_muladd_F2, gf8_muladd_F3, +    gf8_muladd_F4, gf8_muladd_F5, gf8_muladd_F6, gf8_muladd_F7, +    gf8_muladd_F8, gf8_muladd_F9, gf8_muladd_FA, gf8_muladd_FB, +    gf8_muladd_FC, gf8_muladd_FD, gf8_muladd_FE, gf8_muladd_FF +}; + +static uint64_t zero[EC_METHOD_WORD_SIZE * 8] = {0, }; + +void ec_code_c_prepare(ec_gf_t *gf, uint32_t *values, uint32_t count) +{ +    uint32_t i, last, tmp; + +    last = 1; +    for (i = count; i > 0; i--) { +        if (values[i - 1] != 0) { +            tmp = values[i - 1]; +            values[i - 1] = ec_gf_div(gf, tmp, last); +            last = tmp; +        } +    } +} + +void ec_code_c_linear(void *dst, void *src, uint64_t offset, uint32_t *values, +                      uint32_t count) +{ +    src += offset; +    gf8_muladd_00(dst, src); +    while (--count > 0) { +        src += EC_METHOD_CHUNK_SIZE; +        gf8_muladd[*values](dst, src); +        values++; +    } +} + +void ec_code_c_interleaved(void *dst, void **src, uint64_t offset, +                           uint32_t *values, uint32_t count) +{ +    uint32_t i, last, tmp; + +    i = 0; +    while ((last = *values++) == 0) { +        i++; +    } +    gf8_muladd_00(dst, src[i++] + offset); +    while (i < count) { +        tmp = *values++; +        if (tmp != 0) { +            gf8_muladd[last](dst, src[i] + offset); +            last = tmp; +        } +        i++; +    } +    
gf8_muladd[last](dst, zero); +} diff --git a/xlators/cluster/ec/src/ec-code-c.h b/xlators/cluster/ec/src/ec-code-c.h new file mode 100644 index 00000000000..92e8070e514 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-c.h @@ -0,0 +1,24 @@ +/* +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_CODE_C_H__ +#define __EC_CODE_C_H__ + +#include "ec-types.h" + +void ec_code_c_prepare(ec_gf_t *gf, uint32_t *values, uint32_t count); + +void ec_code_c_linear(void *dst, void *src, uint64_t offset, uint32_t *values, +                      uint32_t count); + +void ec_code_c_interleaved(void *dst, void **src, uint64_t offset, +                           uint32_t *values, uint32_t count); + +#endif /* __EC_CODE_C_H__ */ diff --git a/xlators/cluster/ec/src/ec-code-intel.c b/xlators/cluster/ec/src/ec-code-intel.c new file mode 100644 index 00000000000..b9fdcad4421 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-intel.c @@ -0,0 +1,600 @@ +/* +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#include <inttypes.h> +#include <string.h> +#include <errno.h> + +#include "ec-code-intel.h" + +static void +ec_code_intel_init(ec_code_intel_t *intel) +{ +    memset(intel, 0, sizeof(ec_code_intel_t)); +} + +static void +ec_code_intel_prefix(ec_code_intel_t *intel, uint8_t prefix) +{ +    intel->prefix.data[intel->prefix.bytes++] = prefix; +} + +static void +ec_code_intel_rex(ec_code_intel_t *intel, gf_boolean_t w) +{ +    gf_boolean_t present = _gf_false; + +    if (w) { +        intel->rex.w = 1; +        present = _gf_true; +    } +    if (intel->modrm.present) { +        if (intel->modrm.reg > 7) { +            intel->modrm.reg &= 7; +            intel->rex.r = 1; +            present = _gf_true; +        } +        if (intel->sib.present) { +            if (intel->sib.index > 7) { +                intel->sib.index &= 7; +                intel->rex.x = 1; +                present = _gf_true; +            } +            if (intel->sib.base > 7) { +                intel->sib.base &= 7; +                intel->rex.b = 1; +                present = _gf_true; +            } +        } else if (intel->modrm.rm > 7) { +            intel->modrm.rm &= 7; +            intel->rex.b = 1; +            present = _gf_true; +        } +    } else if (intel->reg > 7) { +        intel->reg &= 7; +        intel->rex.b = 1; +        present = _gf_true; +    } +    intel->rex.present = present; +} + +static void +ec_code_intel_vex(ec_code_intel_t *intel, gf_boolean_t w, gf_boolean_t l, +                  ec_code_vex_opcode_t opcode, ec_code_vex_prefix_t prefix, +                  uint32_t reg) +{ +    ec_code_intel_rex(intel, w); +    if (((intel->rex.w == 1) || +         (intel->rex.x == 0) || +         (intel->rex.b == 0)) || +        ((opcode != VEX_OPCODE_NONE) && (opcode != VEX_OPCODE_0F))) { +        intel->rex.present = _gf_false; + +        intel->vex.bytes = 3; +        intel->vex.data[0] = 0xC4; +        intel->vex.data[1] = ((intel->rex.r << 7) | (intel->rex.x << 
6) | +                              (intel->rex.b << 5) | opcode) ^ 0xE0; +        intel->vex.data[2] = (intel->rex.w << 7) | ((~reg & 0x0F) << 3) | +                             (l ? 0x04 : 0x00) | prefix; +    } else { +        intel->vex.bytes = 2; +        intel->vex.data[0] = 0xC5; +        intel->vex.data[1] = (intel->rex.r << 7) | ((~reg & 0x0F) << 3) | +                             (l ? 0x04 : 0x00) | prefix; +    } +} + +static void +ec_code_intel_modrm_reg(ec_code_intel_t *intel, uint32_t rm, uint32_t reg) +{ +    intel->modrm.present = _gf_true; +    intel->modrm.mod = 3; +    intel->modrm.rm = rm; +    intel->modrm.reg = reg; +} + +static void +ec_code_intel_modrm_mem(ec_code_intel_t *intel, uint32_t reg, +                        ec_code_intel_reg_t base, ec_code_intel_reg_t index, +                        uint32_t scale, int32_t offset) +{ +    if (index == REG_SP) { +        intel->invalid = _gf_true; +        return; +    } +    if ((index != REG_NULL) && (scale != 1) && (scale != 2) && (scale != 4) && +        (scale != 8)) { +        intel->invalid = _gf_true; +        return; +    } +    scale >>= 1; +    if (scale == 4) { +        scale = 3; +    } + +    intel->modrm.present = _gf_true; +    intel->modrm.reg = reg; + +    intel->offset.value = offset; +    if ((offset == 0) && (base != REG_BP)) { +        intel->modrm.mod = 0; +        intel->offset.bytes = 0; +    } else if ((offset >= -128) && (offset <= 127)) { +        intel->modrm.mod = 1; +        intel->offset.bytes = 1; +    } else { +        intel->modrm.mod = 2; +        intel->offset.bytes = 4; +    } + +    intel->modrm.rm = base; +    if ((index != REG_NULL) || (base == REG_SP)) { +        intel->modrm.rm = 4; +        intel->sib.present = _gf_true; +        intel->sib.index = index; +        if (index == REG_NULL) { +            intel->sib.index = 4; +        } +        intel->sib.scale = scale; +        intel->sib.base = base; +        if (base == REG_NULL) { +            
intel->sib.base = 5; +            intel->modrm.mod = 0; +            intel->offset.bytes = 4; +        } +    } else if (base == REG_NULL) { +        intel->modrm.mod = 0; +        intel->modrm.rm = 5; +        intel->offset.bytes = 4; +    } +} + +static void +ec_code_intel_op_1(ec_code_intel_t *intel, uint8_t opcode, uint32_t reg) +{ +    intel->reg = reg; +    intel->opcode.bytes = 1; +    intel->opcode.data[0] = opcode; +} + +static void +ec_code_intel_op_2(ec_code_intel_t *intel, uint8_t opcode1, uint8_t opcode2, +                   uint32_t reg) +{ +    intel->reg = reg; +    intel->opcode.bytes = 2; +    intel->opcode.data[0] = opcode1; +    intel->opcode.data[1] = opcode2; +} + +static void +ec_code_intel_immediate_1(ec_code_intel_t *intel, uint32_t value) +{ +    intel->immediate.bytes = 1; +    intel->immediate.value = value; +} + +static void +ec_code_intel_immediate_2(ec_code_intel_t *intel, uint32_t value) +{ +    intel->immediate.bytes = 2; +    intel->immediate.value = value; +} + +static void +ec_code_intel_immediate_4(ec_code_intel_t *intel, uint32_t value) +{ +    intel->immediate.bytes = 4; +    intel->immediate.value = value; +} + +static void +ec_code_intel_emit(ec_code_builder_t *builder, ec_code_intel_t *intel) +{ +    uint8_t insn[15]; +    uint32_t i, count; + +    if (intel->invalid) { +        ec_code_error(builder, EINVAL); +        return; +    } + +    count = 0; +    for (i = 0; i < intel->prefix.bytes; i++) { +        insn[count++] = intel->prefix.data[i]; +    } +    for (i = 0; i < intel->vex.bytes; i++) { +        insn[count++] = intel->vex.data[i]; +    } +    if (intel->rex.present) { +        insn[count++] = 0x40 | +                        (intel->rex.w << 3) | +                        (intel->rex.r << 2) | +                        (intel->rex.x << 1) | +                        (intel->rex.b << 0); +    } +    for (i = 0; i < intel->opcode.bytes; i++) { +        insn[count++] = intel->opcode.data[i]; +    } +    if 
(intel->modrm.present) { +        insn[count++] = (intel->modrm.mod << 6) | +                        (intel->modrm.reg << 3) | +                        (intel->modrm.rm << 0); +        if (intel->sib.present) { +            insn[count++] = (intel->sib.scale << 6) | +                            (intel->sib.index << 3) | +                            (intel->sib.base << 0); +        } +    } +    for (i = 0; i < intel->offset.bytes; i++) { +        insn[count++] = intel->offset.data[i]; +    } +    for (i = 0; i < intel->immediate.bytes; i++) { +        insn[count++] = intel->immediate.data[i]; +    } + +    ec_code_emit(builder, insn, count); +} + +void +ec_code_intel_op_push_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    ec_code_intel_op_1(&intel, 0x50 | (reg & 7), reg); +    ec_code_intel_rex(&intel, _gf_false); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_pop_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    ec_code_intel_op_1(&intel, 0x58 | (reg & 7), reg); +    ec_code_intel_rex(&intel, _gf_false); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_ret(ec_code_builder_t *builder, uint32_t size) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    if (size == 0) { +        ec_code_intel_op_1(&intel, 0xC3, 0); +    } else { +        ec_code_intel_immediate_2(&intel, size); +        ec_code_intel_op_1(&intel, 0xC2, 0); +    } +    ec_code_intel_rex(&intel, _gf_false); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src, +                         ec_code_intel_reg_t dst) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    ec_code_intel_modrm_reg(&intel, dst, src); +    ec_code_intel_op_1(&intel, 0x89, 0); +    ec_code_intel_rex(&intel, 
_gf_true); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_r2m(ec_code_builder_t *builder, ec_code_intel_reg_t src, +                         ec_code_intel_reg_t base, ec_code_intel_reg_t index, +                         uint32_t scale, int32_t offset) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    ec_code_intel_modrm_mem(&intel, src, base, index, scale, offset); +    ec_code_intel_op_1(&intel, 0x89, 0); +    ec_code_intel_rex(&intel, _gf_true); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base, +                         ec_code_intel_reg_t index, uint32_t scale, +                         int32_t offset, ec_code_intel_reg_t dst) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset); +    ec_code_intel_op_1(&intel, 0x8B, 0); +    ec_code_intel_rex(&intel, _gf_true); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_xor_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src, +                         ec_code_intel_reg_t dst) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    ec_code_intel_modrm_reg(&intel, dst, src); +    ec_code_intel_op_1(&intel, 0x31, 0); +    ec_code_intel_rex(&intel, _gf_true); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_xor_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base, +                         ec_code_intel_reg_t index, uint32_t scale, +                         int32_t offset, ec_code_intel_reg_t dst) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset); +    ec_code_intel_op_1(&intel, 0x33, 0); +    ec_code_intel_rex(&intel, _gf_true); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_add_i2r(ec_code_builder_t *builder, int32_t 
value, +                         ec_code_intel_reg_t reg) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    if ((value >= -128) && (value < 128)) { +        ec_code_intel_modrm_reg(&intel, reg, 0); +        ec_code_intel_op_1(&intel, 0x83, 0); +        ec_code_intel_immediate_1(&intel, value); +    } else { +        if (reg == REG_AX) { +            ec_code_intel_op_1(&intel, 0x05, reg); +        } else { +            ec_code_intel_modrm_reg(&intel, reg, 0); +            ec_code_intel_op_1(&intel, 0x81, 0); +        } +        ec_code_intel_immediate_4(&intel, value); +    } +    ec_code_intel_rex(&intel, _gf_true); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_test_i2r(ec_code_builder_t *builder, uint32_t value, +                          ec_code_intel_reg_t reg) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    if (reg == REG_AX) { +        ec_code_intel_op_1(&intel, 0xA9, reg); +    } else { +        ec_code_intel_modrm_reg(&intel, reg, 0); +        ec_code_intel_op_1(&intel, 0xF7, 0); +    } +    ec_code_intel_immediate_4(&intel, value); +    ec_code_intel_rex(&intel, _gf_true); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_jne(ec_code_builder_t *builder, uint32_t address) +{ +    ec_code_intel_t intel; +    int32_t rel; + +    ec_code_intel_init(&intel); + +    rel = address - builder->address - 2; +    if ((rel >= -128) && (rel < 128)) { +        ec_code_intel_op_1(&intel, 0x75, 0); +        ec_code_intel_immediate_1(&intel, rel); +    } else { +        rel -= 4; +        ec_code_intel_op_2(&intel, 0x0F, 0x85, 0); +        ec_code_intel_immediate_4(&intel, rel); +    } +    ec_code_intel_rex(&intel, _gf_false); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_sse2sse(ec_code_builder_t *builder, uint32_t src, +                             uint32_t dst) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    
ec_code_intel_prefix(&intel, 0x66); +    ec_code_intel_modrm_reg(&intel, src, dst); +    ec_code_intel_op_2(&intel, 0x0F, 0x6F, 0); +    ec_code_intel_rex(&intel, _gf_false); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_sse2m(ec_code_builder_t *builder, uint32_t src, +                           ec_code_intel_reg_t base, ec_code_intel_reg_t index, +                           uint32_t scale, int32_t offset) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    ec_code_intel_prefix(&intel, 0x66); +    ec_code_intel_modrm_mem(&intel, src, base, index, scale, offset); +    ec_code_intel_op_2(&intel, 0x0F, 0x7F, 0); +    ec_code_intel_rex(&intel, _gf_false); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_m2sse(ec_code_builder_t *builder, +                           ec_code_intel_reg_t base, ec_code_intel_reg_t index, +                           uint32_t scale, int32_t offset, uint32_t dst) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    ec_code_intel_prefix(&intel, 0x66); +    ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset); +    ec_code_intel_op_2(&intel, 0x0F, 0x6F, 0); +    ec_code_intel_rex(&intel, _gf_false); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_xor_sse2sse(ec_code_builder_t *builder, uint32_t src, +                             uint32_t dst) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    ec_code_intel_prefix(&intel, 0x66); +    ec_code_intel_modrm_reg(&intel, src, dst); +    ec_code_intel_op_2(&intel, 0x0F, 0xEF, 0); +    ec_code_intel_rex(&intel, _gf_false); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_xor_m2sse(ec_code_builder_t *builder, +                           ec_code_intel_reg_t base, ec_code_intel_reg_t index, +                           uint32_t scale, int32_t offset, uint32_t dst) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); 
+ +    ec_code_intel_prefix(&intel, 0x66); +    ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset); +    ec_code_intel_op_2(&intel, 0x0F, 0xEF, 0); +    ec_code_intel_rex(&intel, _gf_false); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_avx2avx(ec_code_builder_t *builder, uint32_t src, +                             uint32_t dst) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    ec_code_intel_modrm_reg(&intel, src, dst); +    ec_code_intel_op_1(&intel, 0x6F, 0); +    ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, +                      VEX_PREFIX_66, VEX_REG_NONE); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_avx2m(ec_code_builder_t *builder, uint32_t src, +                           ec_code_intel_reg_t base, ec_code_intel_reg_t index, +                           uint32_t scale, int32_t offset) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    ec_code_intel_modrm_mem(&intel, src, base, index, scale, offset); +    ec_code_intel_op_1(&intel, 0x7F, 0); +    ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, +                      VEX_PREFIX_66, VEX_REG_NONE); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_mov_m2avx(ec_code_builder_t *builder, +                           ec_code_intel_reg_t base, ec_code_intel_reg_t index, +                           uint32_t scale, int32_t offset, uint32_t dst) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset); +    ec_code_intel_op_1(&intel, 0x6F, 0); +    ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, +                      VEX_PREFIX_66, VEX_REG_NONE); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_xor_avx2avx(ec_code_builder_t *builder, uint32_t src, +                             uint32_t dst) +{ +    ec_code_intel_t intel; + +    
ec_code_intel_init(&intel); + +    ec_code_intel_modrm_reg(&intel, src, dst); +    ec_code_intel_op_1(&intel, 0xEF, 0); +    ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, +                      VEX_PREFIX_66, dst); + +    ec_code_intel_emit(builder, &intel); +} + +void +ec_code_intel_op_xor_m2avx(ec_code_builder_t *builder, +                           ec_code_intel_reg_t base, ec_code_intel_reg_t index, +                           uint32_t scale, int32_t offset, uint32_t dst) +{ +    ec_code_intel_t intel; + +    ec_code_intel_init(&intel); + +    ec_code_intel_modrm_mem(&intel, dst, base, index, scale, offset); +    ec_code_intel_op_1(&intel, 0xEF, 0); +    ec_code_intel_vex(&intel, _gf_false, _gf_true, VEX_OPCODE_0F, +                      VEX_PREFIX_66, dst); + +    ec_code_intel_emit(builder, &intel); +} diff --git a/xlators/cluster/ec/src/ec-code-intel.h b/xlators/cluster/ec/src/ec-code-intel.h new file mode 100644 index 00000000000..903d023f962 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-intel.h @@ -0,0 +1,184 @@ +/* +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#ifndef __EC_CODE_INTEL_H__ +#define __EC_CODE_INTEL_H__ + +#include "ec-code.h" + +#define VEX_REG_NONE 0 + +enum _ec_code_intel_reg; +typedef enum _ec_code_intel_reg ec_code_intel_reg_t; + +enum _ec_code_vex_prefix; +typedef enum _ec_code_vex_prefix ec_code_vex_prefix_t; + +enum _ec_code_vex_opcode; +typedef enum _ec_code_vex_opcode ec_code_vex_opcode_t; + +struct _ec_code_intel_buffer; +typedef struct _ec_code_intel_buffer ec_code_intel_buffer_t; + +struct _ec_code_intel_sib; +typedef struct _ec_code_intel_sib ec_code_intel_sib_t; + +struct _ec_code_intel_modrm; +typedef struct _ec_code_intel_modrm ec_code_intel_modrm_t; + +struct _ec_code_intel_rex; +typedef struct _ec_code_intel_rex ec_code_intel_rex_t; + +struct _ec_code_intel; +typedef struct _ec_code_intel ec_code_intel_t; + +enum _ec_code_intel_reg { +    REG_NULL = -1, +    REG_AX, +    REG_CX, +    REG_DX, +    REG_BX, +    REG_SP, +    REG_BP, +    REG_SI, +    REG_DI, +    REG_8, +    REG_9, +    REG_10, +    REG_11, +    REG_12, +    REG_13, +    REG_14, +    REG_15 +}; + +enum _ec_code_vex_prefix { +    VEX_PREFIX_NONE = 0, +    VEX_PREFIX_66, +    VEX_PREFIX_F3, +    VEX_PREFIX_F2 +}; + +enum _ec_code_vex_opcode { +    VEX_OPCODE_NONE = 0, +    VEX_OPCODE_0F, +    VEX_OPCODE_0F_38, +    VEX_OPCODE_0F_3A +}; + +struct _ec_code_intel_buffer { +    uint32_t bytes; +    union { +        uint8_t  data[4]; +        uint32_t value; +    }; +}; + +struct _ec_code_intel_sib { +    gf_boolean_t present; +    uint32_t     base; +    uint32_t     index; +    uint32_t     scale; +}; + +struct _ec_code_intel_modrm { +    gf_boolean_t present; +    uint32_t     mod; +    uint32_t     rm; +    uint32_t     reg; +}; + +struct _ec_code_intel_rex { +    gf_boolean_t present; +    uint32_t     w; +    uint32_t     r; +    uint32_t     x; +    uint32_t     b; +}; + +struct _ec_code_intel { +    gf_boolean_t           invalid; +    ec_code_intel_buffer_t prefix; +    ec_code_intel_buffer_t opcode; +    
ec_code_intel_buffer_t offset; +    ec_code_intel_buffer_t immediate; +    ec_code_intel_buffer_t vex; +    ec_code_intel_rex_t    rex; +    ec_code_intel_modrm_t  modrm; +    ec_code_intel_sib_t    sib; +    uint32_t               reg; +}; + +void ec_code_intel_op_push_r(ec_code_builder_t *builder, +                             ec_code_intel_reg_t reg); +void ec_code_intel_op_pop_r(ec_code_builder_t *builder, +                            ec_code_intel_reg_t reg); +void ec_code_intel_op_ret(ec_code_builder_t *builder, uint32_t size); + +void ec_code_intel_op_mov_r2r(ec_code_builder_t *builder, +                              ec_code_intel_reg_t src, +                              ec_code_intel_reg_t dst); +void ec_code_intel_op_mov_r2m(ec_code_builder_t *builder, +                              ec_code_intel_reg_t src, +                              ec_code_intel_reg_t base, +                              ec_code_intel_reg_t index, uint32_t scale, +                              int32_t offset); +void ec_code_intel_op_mov_m2r(ec_code_builder_t *builder, +                              ec_code_intel_reg_t base, +                              ec_code_intel_reg_t index, uint32_t scale, +                              int32_t offset, ec_code_intel_reg_t dst); +void ec_code_intel_op_xor_r2r(ec_code_builder_t *builder, +                              ec_code_intel_reg_t src, +                              ec_code_intel_reg_t dst); +void ec_code_intel_op_xor_m2r(ec_code_builder_t *builder, +                              ec_code_intel_reg_t base, +                              ec_code_intel_reg_t index, uint32_t scale, +                              int32_t offset, ec_code_intel_reg_t dst); +void ec_code_intel_op_add_i2r(ec_code_builder_t *builder, int32_t value, +                              ec_code_intel_reg_t reg); +void ec_code_intel_op_test_i2r(ec_code_builder_t *builder, uint32_t value, +                               ec_code_intel_reg_t reg); +void 
ec_code_intel_op_jne(ec_code_builder_t *builder, uint32_t address); + +void ec_code_intel_op_mov_sse2sse(ec_code_builder_t *builder, uint32_t src, +                                  uint32_t dst); +void ec_code_intel_op_mov_sse2m(ec_code_builder_t *builder, uint32_t src, +                                ec_code_intel_reg_t base, +                                ec_code_intel_reg_t index, uint32_t scale, +                                int32_t offset); +void ec_code_intel_op_mov_m2sse(ec_code_builder_t *builder, +                                ec_code_intel_reg_t base, +                                ec_code_intel_reg_t index, uint32_t scale, +                                int32_t offset, uint32_t dst); +void ec_code_intel_op_xor_sse2sse(ec_code_builder_t *builder, uint32_t src, +                                  uint32_t dst); +void ec_code_intel_op_xor_m2sse(ec_code_builder_t *builder, +                                ec_code_intel_reg_t base, +                                ec_code_intel_reg_t index, uint32_t scale, +                                int32_t offset, uint32_t dst); + +void ec_code_intel_op_mov_avx2avx(ec_code_builder_t *builder, uint32_t src, +                                  uint32_t dst); +void ec_code_intel_op_mov_avx2m(ec_code_builder_t *builder, uint32_t src, +                                ec_code_intel_reg_t base, +                                ec_code_intel_reg_t index, uint32_t scale, +                                int32_t offset); +void ec_code_intel_op_mov_m2avx(ec_code_builder_t *builder, +                                ec_code_intel_reg_t base, +                                ec_code_intel_reg_t index, uint32_t scale, +                                int32_t offset, uint32_t dst); +void ec_code_intel_op_xor_avx2avx(ec_code_builder_t *builder, uint32_t src, +                                  uint32_t dst); +void ec_code_intel_op_xor_m2avx(ec_code_builder_t *builder, +                                ec_code_intel_reg_t base, 
+                                ec_code_intel_reg_t index, uint32_t scale, +                                int32_t offset, uint32_t dst); + +#endif /* __EC_CODE_INTEL_H__ */ diff --git a/xlators/cluster/ec/src/ec-code-sse.c b/xlators/cluster/ec/src/ec-code-sse.c new file mode 100644 index 00000000000..6f2c6fa593f --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-sse.c @@ -0,0 +1,108 @@ +/* +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#include <errno.h> + +#include "ec-code-intel.h" + +static void +ec_code_sse_prolog(ec_code_builder_t *builder) +{ +    builder->loop = builder->address; +} + +static void +ec_code_sse_epilog(ec_code_builder_t *builder) +{ +    ec_code_intel_op_add_i2r(builder, 16, REG_DX); +    ec_code_intel_op_add_i2r(builder, 16, REG_DI); +    ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX); +    ec_code_intel_op_jne(builder, builder->loop); + +    ec_code_intel_op_ret(builder, 0); +} + +static void +ec_code_sse_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, +                 uint32_t bit) +{ +    if (builder->linear) { +        ec_code_intel_op_mov_m2sse(builder, REG_SI, REG_DX, 1, +                                   idx * builder->width * builder->bits + +                                   bit * builder->width, +                                   dst); +    } else { +        if (builder->base != idx) { +            ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, +                                     REG_AX); +            builder->base = idx; +        } +        ec_code_intel_op_mov_m2sse(builder, REG_AX, REG_DX, 1, +                                   bit * builder->width, dst); +    
} +} + +static void +ec_code_sse_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit) +{ +    ec_code_intel_op_mov_sse2m(builder, src, REG_DI, REG_NULL, 0, +                               bit * builder->width); +} + +static void +ec_code_sse_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ +    ec_code_intel_op_mov_sse2sse(builder, src, dst); +} + +static void +ec_code_sse_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ +    ec_code_intel_op_xor_sse2sse(builder, src, dst); +} + +static void +ec_code_sse_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, +                 uint32_t bit) +{ +    if (builder->linear) { +       ec_code_intel_op_xor_m2sse(builder, REG_SI, REG_DX, 1, +                                  idx * builder->width * builder->bits + +                                  bit * builder->width, +                                  dst); +    } else { +        if (builder->base != idx) { +            ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, +                                     REG_AX); +            builder->base = idx; +        } +        ec_code_intel_op_xor_m2sse(builder, REG_AX, REG_DX, 1, +                                   bit * builder->width, dst); +    } +} + +static char *ec_code_sse_needed_flags[] = { +    "sse2", +    NULL +}; + +ec_code_gen_t ec_code_gen_sse = { +    .name   = "sse", +    .flags  = ec_code_sse_needed_flags, +    .width  = 16, +    .prolog = ec_code_sse_prolog, +    .epilog = ec_code_sse_epilog, +    .load   = ec_code_sse_load, +    .store  = ec_code_sse_store, +    .copy   = ec_code_sse_copy, +    .xor2   = ec_code_sse_xor2, +    .xor3   = NULL, +    .xorm   = ec_code_sse_xorm +}; diff --git a/xlators/cluster/ec/src/ec-code-sse.h b/xlators/cluster/ec/src/ec-code-sse.h new file mode 100644 index 00000000000..f1acbcf894b --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-sse.h @@ -0,0 +1,18 @@ +/* +  Copyright (c) 2015 DataLab, s.l. 
<http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_CODE_SSE_H__ +#define __EC_CODE_SSE_H__ + +#include "ec-code.h" + +extern ec_code_gen_t ec_code_gen_sse; + +#endif /* __EC_CODE_SSE_H__ */ diff --git a/xlators/cluster/ec/src/ec-code-x64.c b/xlators/cluster/ec/src/ec-code-x64.c new file mode 100644 index 00000000000..e94ddd4b155 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-x64.c @@ -0,0 +1,150 @@ +/* +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#include <errno.h> + +#include "ec-code-intel.h" + +static ec_code_intel_reg_t ec_code_x64_regmap[] = { +    REG_AX, REG_CX, REG_BP, REG_8,  REG_9, REG_10, +    REG_11, REG_12, REG_13, REG_14, REG_15 +}; + +static void +ec_code_x64_prolog(ec_code_builder_t *builder) +{ +    uint32_t i; + +    ec_code_intel_op_push_r(builder, REG_BP); +    if (!builder->linear) { +        ec_code_intel_op_push_r(builder, REG_BX); +    } +    if (builder->regs > 11) { +        ec_code_error(builder, EINVAL); +        return; +    } +    for (i = 7; i < builder->regs; i++) { +        ec_code_intel_op_push_r(builder, ec_code_x64_regmap[i]); +    } + +    builder->loop = builder->address; +} + +static void +ec_code_x64_epilog(ec_code_builder_t *builder) +{ +    uint32_t i; + +    ec_code_intel_op_add_i2r(builder, 8, REG_DX); +    ec_code_intel_op_add_i2r(builder, 8, REG_DI); +    ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX); +    ec_code_intel_op_jne(builder, builder->loop); + +    if (builder->regs > 11) { +        ec_code_error(builder, EINVAL); +    } +    for (i = builder->regs; i > 7; i--) { +        ec_code_intel_op_pop_r(builder, ec_code_x64_regmap[i - 1]); +    } +    if (!builder->linear) { +        ec_code_intel_op_pop_r(builder, REG_BX); +    } +    ec_code_intel_op_pop_r(builder, REG_BP); +    ec_code_intel_op_ret(builder, 0); +} + +static void +ec_code_x64_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, +                 uint32_t bit) +{ +    dst = ec_code_x64_regmap[dst]; + +    if (builder->linear) { +        ec_code_intel_op_mov_m2r(builder, REG_SI, REG_DX, 1, +                                 idx * builder->width * builder->bits + +                                 bit * builder->width, +                                 dst); +    } else { +        if (builder->base != idx) { +            ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, +                                     REG_BX); +            builder->base = idx; +  
      } +        ec_code_intel_op_mov_m2r(builder, REG_BX, REG_DX, 1, +                                 bit * builder->width, dst); +    } +} + +static void +ec_code_x64_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit) +{ +    src = ec_code_x64_regmap[src]; + +    ec_code_intel_op_mov_r2m(builder, src, REG_DI, REG_NULL, 0, +                             bit * builder->width); +} + +static void +ec_code_x64_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ +    dst = ec_code_x64_regmap[dst]; +    src = ec_code_x64_regmap[src]; + +    ec_code_intel_op_mov_r2r(builder, src, dst); +} + +static void +ec_code_x64_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ +    dst = ec_code_x64_regmap[dst]; +    src = ec_code_x64_regmap[src]; + +    ec_code_intel_op_xor_r2r(builder, src, dst); +} + +static void +ec_code_x64_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx, +                 uint32_t bit) +{ +    dst = ec_code_x64_regmap[dst]; + +    if (builder->linear) { +        ec_code_intel_op_xor_m2r(builder, REG_SI, REG_DX, 1, +                                 idx * builder->width * builder->bits + +                                 bit * builder->width, +                                 dst); +    } else { +        if (builder->base != idx) { +            ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8, +                                     REG_BX); +            builder->base = idx; +        } +        ec_code_intel_op_xor_m2r(builder, REG_BX, REG_DX, 1, +                                 bit * builder->width, dst); +    } +} + +static char *ec_code_x64_needed_flags[] = { +    NULL +}; + +ec_code_gen_t ec_code_gen_x64 = { +    .name   = "x64", +    .flags  = ec_code_x64_needed_flags, +    .width  = sizeof(uint64_t), +    .prolog = ec_code_x64_prolog, +    .epilog = ec_code_x64_epilog, +    .load   = ec_code_x64_load, +    .store  = ec_code_x64_store, +    .copy   = ec_code_x64_copy, +    .xor2   = 
ec_code_x64_xor2, +    .xor3   = NULL, +    .xorm   = ec_code_x64_xorm +}; diff --git a/xlators/cluster/ec/src/ec-code-x64.h b/xlators/cluster/ec/src/ec-code-x64.h new file mode 100644 index 00000000000..bd8174e4bf5 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code-x64.h @@ -0,0 +1,18 @@ +/* +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_CODE_X64_H__ +#define __EC_CODE_X64_H__ + +#include "ec-code.h" + +extern ec_code_gen_t ec_code_gen_x64; + +#endif /* __EC_CODE_X64_H__ */ diff --git a/xlators/cluster/ec/src/ec-code.c b/xlators/cluster/ec/src/ec-code.c new file mode 100644 index 00000000000..a1f652779f3 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code.c @@ -0,0 +1,904 @@ +/* +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#include <string.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <ctype.h> + +#include "syscall.h" + +#include "ec-mem-types.h" +#include "ec-code.h" +#include "ec-messages.h" +#include "ec-code-c.h" + +#ifdef USE_EC_DYNAMIC_X64 +#include "ec-code-x64.h" +#endif + +#ifdef USE_EC_DYNAMIC_SSE +#include "ec-code-sse.h" +#endif + +#ifdef USE_EC_DYNAMIC_AVX +#include "ec-code-avx.h" +#endif + +#define EC_PROC_BUFFER_SIZE 4096 + +#define PROC_CPUINFO "/proc/cpuinfo" + +struct _ec_code_proc; +typedef struct _ec_code_proc ec_code_proc_t; + +struct _ec_code_proc { +    int32_t      fd; +    gf_boolean_t eof; +    gf_boolean_t error; +    gf_boolean_t skip; +    ssize_t      size; +    ssize_t      pos; +    char         buffer[EC_PROC_BUFFER_SIZE]; +}; + +static ec_code_gen_t *ec_code_gen_table[] = { +#ifdef USE_EC_DYNAMIC_AVX +    &ec_code_gen_avx, +#endif +#ifdef USE_EC_DYNAMIC_SSE +    &ec_code_gen_sse, +#endif +#ifdef USE_EC_DYNAMIC_X64 +    &ec_code_gen_x64, +#endif +    NULL +}; + +static void +ec_code_arg_set(ec_code_arg_t *arg, uint32_t value) +{ +    arg->value = value; +} + +static void +ec_code_arg_assign(ec_code_builder_t *builder, ec_code_op_t *op, +                   ec_code_arg_t *arg, uint32_t reg) +{ +    arg->value = reg; + +    if (builder->regs <= reg) { +        builder->regs = reg + 1; +    } + +} + +static void +ec_code_arg_use(ec_code_builder_t *builder, ec_code_op_t *op, +                ec_code_arg_t *arg, uint32_t reg) +{ +    arg->value = reg; +} + +static void +ec_code_arg_update(ec_code_builder_t *builder, ec_code_op_t *op, +                   ec_code_arg_t *arg, uint32_t reg) +{ +    arg->value = reg; +} + +static ec_code_op_t * +ec_code_op_next(ec_code_builder_t *builder) +{ +    ec_code_op_t *op; + +    op = &builder->ops[builder->count++]; +    memset(op, 0, sizeof(ec_code_op_t)); + +    return op; +} + +static void +ec_code_load(ec_code_builder_t *builder, uint32_t bit, uint32_t offset) +{ +    
ec_code_op_t *op; + +    op = ec_code_op_next(builder); + +    op->op = EC_GF_OP_LOAD; +    ec_code_arg_assign(builder, op, &op->arg1, builder->map[bit]); +    ec_code_arg_set(&op->arg2, offset); +    ec_code_arg_set(&op->arg3, bit); +} + +static void +ec_code_store(ec_code_builder_t *builder, uint32_t reg, uint32_t bit) +{ +    ec_code_op_t *op; + +    op = ec_code_op_next(builder); + +    op->op = EC_GF_OP_STORE; +    ec_code_arg_use(builder, op, &op->arg1, builder->map[reg]); +    ec_code_arg_set(&op->arg2, 0); +    ec_code_arg_set(&op->arg3, bit); +} + +static void +ec_code_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ +    ec_code_op_t *op; + +    op = ec_code_op_next(builder); + +    op->op = EC_GF_OP_COPY; +    ec_code_arg_assign(builder, op, &op->arg1, builder->map[dst]); +    ec_code_arg_use(builder, op, &op->arg2, builder->map[src]); +    ec_code_arg_set(&op->arg3, 0); +} + +static void +ec_code_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src) +{ +    ec_code_op_t *op; + +    op = ec_code_op_next(builder); + +    op->op = EC_GF_OP_XOR2; +    ec_code_arg_update(builder, op, &op->arg1, builder->map[dst]); +    ec_code_arg_use(builder, op, &op->arg2, builder->map[src]); +    ec_code_arg_set(&op->arg3, 0); +} + +static void +ec_code_xor3(ec_code_builder_t *builder, uint32_t dst, uint32_t src1, +             uint32_t src2) +{ +    ec_code_op_t *op; + +    if (builder->code->gen->xor3 == NULL) { +        ec_code_copy(builder, dst, src1); +        ec_code_xor2(builder, dst, src2); + +        return; +    } + +    op = ec_code_op_next(builder); + +    op->op = EC_GF_OP_XOR3; +    ec_code_arg_assign(builder, op, &op->arg1, builder->map[dst]); +    ec_code_arg_use(builder, op, &op->arg2, builder->map[src1]); +    ec_code_arg_use(builder, op, &op->arg3, builder->map[src2]); +} + +static void +ec_code_xorm(ec_code_builder_t *builder, uint32_t bit, uint32_t offset) +{ +    ec_code_op_t *op; + +    op = ec_code_op_next(builder); + +    
op->op = EC_GF_OP_XORM; +    ec_code_arg_update(builder, op, &op->arg1, builder->map[bit]); +    ec_code_arg_set(&op->arg2, offset); +    ec_code_arg_set(&op->arg3, bit); +} + +static void +ec_code_dup(ec_code_builder_t *builder, ec_gf_op_t *op) +{ +    switch (op->op) { +    case EC_GF_OP_COPY: +        ec_code_copy(builder, op->arg1, op->arg2); +        break; +    case EC_GF_OP_XOR2: +        ec_code_xor2(builder, op->arg1, op->arg2); +        break; +    case EC_GF_OP_XOR3: +        ec_code_xor3(builder, op->arg1, op->arg2, op->arg3); +        break; +    default: +        break; +    } +} + +static void +ec_code_gf_load(ec_code_builder_t *builder, uint32_t offset) +{ +    uint32_t i; + +    for (i = 0; i < builder->code->gf->bits; i++) { +        ec_code_load(builder, i, offset); +    } +} + +static void +ec_code_gf_load_xor(ec_code_builder_t *builder, uint32_t offset) +{ +    uint32_t i; + +    for (i = 0; i < builder->code->gf->bits; i++) { +        ec_code_xorm(builder, i, offset); +    } +} + +static void +ec_code_gf_store(ec_code_builder_t *builder) +{ +    uint32_t i; + +    for (i = 0; i < builder->code->gf->bits; i++) { +        ec_code_store(builder, i, i); +    } +} + +static void +ec_code_gf_clear(ec_code_builder_t *builder) +{ +    uint32_t i; + +    ec_code_xor2(builder, 0, 0); +    for (i = 0; i < builder->code->gf->bits; i++) { +        ec_code_store(builder, 0, i); +    } +} + +static void +ec_code_gf_mul(ec_code_builder_t *builder, uint32_t value) +{ +    ec_gf_mul_t *mul; +    ec_gf_op_t *op; +    uint32_t map[EC_GF_MAX_REGS]; +    int32_t i; + +    mul = builder->code->gf->table[value]; +    for (op = mul->ops; op->op != EC_GF_OP_END; op++) { +        ec_code_dup(builder, op); +    } + +    for (i = 0; i < mul->regs; i++) { +        map[i] = builder->map[mul->map[i]]; +    } +    memcpy(builder->map, map, sizeof(uint32_t) * mul->regs); +} + +static ec_code_builder_t * +ec_code_prepare(ec_code_t *code, uint32_t count, uint32_t width, +        
        gf_boolean_t linear) +{ +    ec_code_builder_t *builder; +    uint32_t i; + +    count *= code->gf->bits + code->gf->max_ops; +    count += code->gf->bits; +    builder = GF_MALLOC(sizeof(ec_code_builder_t) + +                        sizeof(ec_code_op_t) * count, ec_mt_ec_code_builder_t); +    if (builder == NULL) { +        return NULL; +    } + +    builder->address = 0; +    builder->code = code; +    builder->size = 0; +    builder->count = 0; +    builder->regs = 0; +    builder->error = 0; +    builder->bits = code->gf->bits; +    builder->width = width; +    builder->data = NULL; +    builder->linear = linear; +    builder->base = -1; + +    for (i = 0; i < EC_GF_MAX_REGS; i++) { +        builder->map[i] = i; +    } + +    return builder; +} + +static size_t +ec_code_space_size(void) +{ +    return (sizeof(ec_code_space_t) + 15) & ~15; +} + +static size_t +ec_code_chunk_size(void) +{ +    return (sizeof(ec_code_chunk_t) + 15) & ~15; +} + +static ec_code_chunk_t * +ec_code_chunk_from_space(ec_code_space_t *space) +{ +    return (ec_code_chunk_t *)((uintptr_t)space + ec_code_space_size()); +} + +static void * +ec_code_func_from_chunk(ec_code_chunk_t *chunk) +{ +    return (void *)((uintptr_t)chunk + ec_code_chunk_size()); +} + +static ec_code_chunk_t * +ec_code_chunk_from_func(ec_code_func_linear_t func) +{ +    return (ec_code_chunk_t *)((uintptr_t)func - ec_code_chunk_size()); +} + +static ec_code_chunk_t * +ec_code_chunk_split(ec_code_chunk_t *chunk, size_t size) +{ +    ec_code_chunk_t *extra; +    ssize_t avail; + +    avail = chunk->size - size - ec_code_chunk_size(); +    if (avail > 0) { +        extra = (ec_code_chunk_t *)((uintptr_t)chunk + chunk->size - avail); +        extra->size = avail; +        list_add(&extra->list, &chunk->list); +        chunk->size = size; +    } +    list_del_init(&chunk->list); + +    return chunk; +} + +static gf_boolean_t +ec_code_chunk_touch(ec_code_chunk_t *prev, ec_code_chunk_t *next) +{ +    uintptr_t end; + 
+    end = (uintptr_t)prev + ec_code_chunk_size() + prev->size; +    return (end == (uintptr_t)next); +} + +static void +ec_code_chunk_merge(ec_code_chunk_t *chunk) +{ +    ec_code_chunk_t *item; + +    list_for_each_entry(item, &chunk->space->chunks, list) { +        if (ec_code_chunk_touch(item, chunk)) { +            item->size += chunk->size + ec_code_chunk_size(); +            chunk = item; + +            goto check; +        } +        if ((uintptr_t)item > (uintptr_t)chunk) { +            list_add_tail(&chunk->list, &item->list); +            if (ec_code_chunk_touch(chunk, item)) { +                chunk->size += item->size + ec_code_chunk_size(); +                list_del_init(&item->list); +            } + +            goto check; +        } +    } +    list_add_tail(&chunk->list, &chunk->space->chunks); + +check: +    if (chunk->size == EC_CODE_SIZE - ec_code_space_size() - +                                      ec_code_chunk_size()) { +        list_del_init(&chunk->space->list); + +        munmap(chunk->space, chunk->space->size); +    } +} + +static ec_code_chunk_t * +ec_code_space_alloc(ec_code_t *code, size_t size) +{ +    ec_code_space_t *space; +    ec_code_chunk_t *chunk; +    size_t map_size; + +    size = (size + 15) & ~15; +    list_for_each_entry(space, &code->spaces, list) { +        list_for_each_entry(chunk, &space->chunks, list) { +            if (chunk->size >= size) { +                goto out; +            } +        } +    } + +    map_size = EC_CODE_SIZE; +    if (map_size < size) { +        map_size = size; +    } +    space = mmap(NULL, map_size, PROT_EXEC | PROT_READ | PROT_WRITE, +                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +    if (space == NULL) { +        return NULL; +    } +    /* It's not important to check the return value of mlock(). If it fails +     * everything will continue to work normally. 
*/ +    mlock(space, map_size); + +    space->code = code; +    space->size = map_size; +    list_add_tail(&space->list, &code->spaces); +    INIT_LIST_HEAD(&space->chunks); + +    chunk = ec_code_chunk_from_space(space); +    chunk->size = EC_CODE_SIZE - ec_code_space_size() - ec_code_chunk_size(); +    list_add(&chunk->list, &space->chunks); + +out: +    chunk->space = space; + +    return ec_code_chunk_split(chunk, size); +} + +static ec_code_chunk_t * +ec_code_alloc(ec_code_t *code, uint32_t size) +{ +    ec_code_chunk_t *chunk; + +    LOCK(&code->lock); + +    chunk = ec_code_space_alloc(code, size); + +    UNLOCK(&code->lock); + +    return chunk; +} + +static void +ec_code_free(ec_code_chunk_t *chunk) +{ +    gf_lock_t *lock; + +    lock = &chunk->space->code->lock; +    LOCK(lock); + +    ec_code_chunk_merge(chunk); + +    UNLOCK(lock); +} + +static gf_boolean_t +ec_code_write(ec_code_builder_t *builder) +{ +    ec_code_gen_t *gen; +    ec_code_op_t *op; +    uint32_t i; + +    builder->error = 0; +    builder->size = 0; +    builder->address = 0; +    builder->base = -1; + +    gen = builder->code->gen; +    gen->prolog(builder); +    for (i = 0; i < builder->count; i++) { +        op = &builder->ops[i]; +        switch (op->op) { +        case EC_GF_OP_LOAD: +            gen->load(builder, op->arg1.value, op->arg2.value, op->arg3.value); +            break; +        case EC_GF_OP_STORE: +            gen->store(builder, op->arg1.value, op->arg3.value); +            break; +        case EC_GF_OP_COPY: +            gen->copy(builder, op->arg1.value, op->arg2.value); +            break; +        case EC_GF_OP_XOR2: +            gen->xor2(builder, op->arg1.value, op->arg2.value); +            break; +        case EC_GF_OP_XOR3: +            gen->xor3(builder, op->arg1.value, op->arg2.value, op->arg3.value); +            break; +        case EC_GF_OP_XORM: +            gen->xorm(builder, op->arg1.value, op->arg2.value, op->arg3.value); +            break; +     
   default: +            break; +        } +    } +    gen->epilog(builder); + +    return builder->error == 0; +} + +static void * +ec_code_compile(ec_code_builder_t *builder) +{ +    ec_code_chunk_t *chunk; +    void *func; + +    if (!ec_code_write(builder)) { +        return NULL; +    } + +    chunk = ec_code_alloc(builder->code, builder->size); +    if (chunk == NULL) { +        return NULL; +    } +    func = ec_code_func_from_chunk(chunk); +    builder->data = (uint8_t *)func; + +    if (!ec_code_write(builder)) { +        ec_code_free(chunk); + +        return NULL; +    } + +    GF_FREE(builder); + +    return func; +} + +ec_code_t * +ec_code_create(ec_gf_t *gf, ec_code_gen_t *gen) +{ +    ec_code_t *code; + +    code = GF_MALLOC(sizeof(ec_code_t), ec_mt_ec_code_t); +    if (code == NULL) { +        return NULL; +    } +    memset(code, 0, sizeof(ec_code_t)); +    INIT_LIST_HEAD(&code->spaces); +    LOCK_INIT(&code->lock); + +    code->gf = gf; +    code->gen = gen; +    if (gen == NULL) { +        code->width = sizeof(uint64_t); +    } else { +        code->width = gen->width; +    } + +    return code; +} + +void +ec_code_destroy(ec_code_t *code) +{ +    if (!list_empty(&code->spaces)) { +    } + +    LOCK_DESTROY(&code->lock); + +    GF_FREE(code); +} + +static uint32_t +ec_code_value_next(uint32_t *values, uint32_t count, uint32_t *offset) +{ +    uint32_t i, next; + +    next = 0; +    for (i = *offset + 1; i < count; i++) { +        next = values[i]; +        if (next != 0) { +            break; +        } +    } +    *offset = i; + +    return next; +} + +void * +ec_code_build(ec_code_t *code, uint32_t width, uint32_t *values, +              uint32_t count, gf_boolean_t linear) +{ +    ec_code_builder_t *builder; +    uint32_t offset, val, next; + +    if (code->gen == NULL) { +        ec_code_c_prepare(code->gf, values, count); +        if (linear) { +            return ec_code_c_linear; +        } else { +            return ec_code_c_interleaved; 
+        } +    } + +    builder = ec_code_prepare(code, count, width, linear); +    if (builder == NULL) { +        return NULL; +    } + +    offset = -1; +    next = ec_code_value_next(values, count, &offset); +    if (next != 0) { +        ec_code_gf_load(builder, offset); +        do { +            val = next; +            next = ec_code_value_next(values, count, &offset); +            if (next != 0) { +                ec_code_gf_mul(builder, ec_gf_div(code->gf, val, next)); +                ec_code_gf_load_xor(builder, offset); +            } +        } while (next != 0); +        ec_code_gf_mul(builder, val); +        ec_code_gf_store(builder); +    } else { +        ec_code_gf_clear(builder); +    } + +    return ec_code_compile(builder); +} + +ec_code_func_linear_t +ec_code_build_linear(ec_code_t *code, uint32_t width, uint32_t *values, +                     uint32_t count) +{ +    return (ec_code_func_linear_t)ec_code_build(code, width, values, count, +                                                _gf_true); +} + +ec_code_func_interleaved_t +ec_code_build_interleaved(ec_code_t *code, uint32_t width, uint32_t *values, +                          uint32_t count) +{ +    return (ec_code_func_interleaved_t)ec_code_build(code, width, values, +                                                     count, _gf_false); +} + +void +ec_code_release(ec_code_t *code, ec_code_func_t *func) +{ +    if (code->gen != NULL) { +        ec_code_free(ec_code_chunk_from_func(func->linear)); +    } +} + +void +ec_code_error(ec_code_builder_t *builder, int32_t error) +{ +    if (builder->error == 0) { +        builder->error = error; +    } +} + +void +ec_code_emit(ec_code_builder_t *builder, uint8_t *bytes, uint32_t count) +{ +    if (builder->error != 0) { +        return; +    } + +    if (builder->data != NULL) { +        memcpy(builder->data + builder->size, bytes, count); +    } + +    builder->size += count; +    builder->address += count; +} + +static char * 
/*
 * Scan a field up to (not including) the separator 'sep' and strip its
 * trailing whitespace.
 *
 * text:   start of the field; at most *length bytes are examined.
 * length: in: bytes available from 'text'; out: bytes left starting at the
 *         returned position (0 if 'sep' was not found).
 * sep:    character that terminates the field.
 *
 * A NUL byte is written right after the last non-whitespace character of
 * the field, so 'text' becomes a C string holding the trimmed field. If the
 * field holds no such character, the NUL is written at 'text' itself. The
 * buffer must therefore be writable at that position.
 *
 * Returns a pointer to the separator, or to text + *length if there is
 * none.
 */
static char *
ec_code_proc_trim_right(char *text, ssize_t *length, char sep)
{
    char *last = text;
    ssize_t len;

    for (len = *length; (len > 0) && (*text != sep); len--) {
        /* <ctype.h> functions require a value representable as unsigned
         * char; passing a negative plain char is undefined behavior. */
        if (!isspace((unsigned char)*text)) {
            last = text + 1;
        }
        text++;
    }
    *last = 0;
    *length = len;

    return text;
}
break; +        } +    } + +    return text; +} + +static char * +ec_code_proc_split(char *text, ssize_t *length, char sep) +{ +    text = ec_code_proc_trim_right(text, length, sep); +    if (*length == 0) { +        return NULL; +    } +    (*length)--; +    text++; + +    return ec_code_proc_trim_left(text, length); +} + +static uint32_t +ec_code_cpu_check(uint32_t idx, char *list, uint32_t count) +{ +    ec_code_gen_t *gen; +    char **ptr; +    char *table[count]; +    uint32_t i; + +    for (i = 0; i < count; i++) { +        table[i] = list; +        list += strlen(list) + 1; +    } + +    gen = ec_code_gen_table[idx]; +    while (gen != NULL) { +        for (ptr = gen->flags; *ptr != NULL; ptr++) { +            for (i = 0; i < count; i++) { +                if (strcmp(*ptr, table[i]) == 0) { +                    break; +                } +            } +            if (i >= count) { +                gen = ec_code_gen_table[++idx]; +                break; +            } +        } +        if (*ptr == NULL) { +            break; +        } +    } + +    return idx; +} + +ec_code_gen_t * +ec_code_detect(xlator_t *xl, const char *def) +{ +    ec_code_proc_t file; +    ec_code_gen_t *gen = NULL; +    char *line, *data, *list; +    ssize_t length; +    uint32_t count, base, select; + +    if (strcmp(def, "none") == 0) { +        gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION_NONE, +               "Not using any cpu extensions"); + +        return NULL; +    } + +    file.fd = sys_openat(AT_FDCWD, PROC_CPUINFO, O_RDONLY); +    if (file.fd < 0) { +        goto out; +    } +    file.size = file.pos = 0; +    file.eof = file.error = file.skip = _gf_false; + +    select = 0; +    if (strcmp(def, "auto") != 0) { +        while (ec_code_gen_table[select] != NULL) { +            if (strcmp(ec_code_gen_table[select]->name, def) == 0) { +                break; +            } +            select++; +        } +        if (ec_code_gen_table[select] == NULL) { +            
gf_msg(xl->name, GF_LOG_WARNING, EINVAL, EC_MSG_EXTENSION_UNKNOWN, +                   "CPU extension '%s' is not known. Not using any cpu " +                   "extensions", def); + +            return NULL; +        } +    } else { +        def = NULL; +    } + +    while ((line = ec_code_proc_line(&file, &length)) != NULL) { +        data = ec_code_proc_split(line, &length, ':'); +        if ((data != NULL) && (strcmp(line, "flags") == 0)) { +            list = data; +            count = 0; +            while ((data != NULL) && (*data != 0)) { +                count++; +                data = ec_code_proc_split(data, &length, ' '); +            } +            base = select; +            select = ec_code_cpu_check(select, list, count); +            if ((base != select) && (def != NULL)) { +                gf_msg(xl->name, GF_LOG_WARNING, ENOTSUP, +                       EC_MSG_EXTENSION_UNSUPPORTED, +                       "CPU extension '%s' is not supported", def); +                def = NULL; +            } +        } +    } + +    if (file.error) { +        gf_msg(xl->name, GF_LOG_WARNING, 0, EC_MSG_EXTENSION_FAILED, +               "Unable to detemine supported CPU extensions. Not using any " +               "cpu extensions"); + +        gen = NULL; +    } else { +        gen = ec_code_gen_table[select]; +        if (gen == NULL) { +            gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION_NONE, +                   "Not using any cpu extensions"); +        } else { +            gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION, +                   "Using '%s' CPU extensions", gen->name); +        } +    } + +    sys_close(file.fd); + +out: +    return gen; +} diff --git a/xlators/cluster/ec/src/ec-code.h b/xlators/cluster/ec/src/ec-code.h new file mode 100644 index 00000000000..355209c3944 --- /dev/null +++ b/xlators/cluster/ec/src/ec-code.h @@ -0,0 +1,44 @@ +/* +  Copyright (c) 2015 DataLab, s.l. 
<http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_CODE_H__ +#define __EC_CODE_H__ + +#include "xlator.h" +#include "list.h" + +#include "ec-types.h" +#include "ec-galois.h" + +ec_code_gen_t * +ec_code_detect(xlator_t *xl, const char *def); + +ec_code_t * +ec_code_create(ec_gf_t *gf, ec_code_gen_t *gen); + +void +ec_code_destroy(ec_code_t *code); + +ec_code_func_linear_t +ec_code_build_linear(ec_code_t *code, uint32_t width, uint32_t *values, +                     uint32_t count); +ec_code_func_interleaved_t +ec_code_build_interleaved(ec_code_t *code, uint32_t width, uint32_t *values, +                          uint32_t count); +void +ec_code_release(ec_code_t *code, ec_code_func_t *func); + +void +ec_code_error(ec_code_builder_t *builder, int32_t error); + +void +ec_code_emit(ec_code_builder_t *builder, uint8_t *bytes, uint32_t count); + +#endif /* __EC_CODE_H__ */ diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c index 6e4b975c248..f949dbd0c9f 100644 --- a/xlators/cluster/ec/src/ec-combine.c +++ b/xlators/cluster/ec/src/ec-combine.c @@ -13,7 +13,7 @@  #include "libxlator.h"  #include "byte-order.h" -#include "ec-data.h" +#include "ec-types.h"  #include "ec-helpers.h"  #include "ec-common.h"  #include "ec-combine.h" diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 2e6759a2803..fd6bdf7bb9d 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -12,7 +12,7 @@  #include "hashfn.h"  #include "ec-mem-types.h" -#include "ec-data.h" +#include "ec-types.h"  #include "ec-helpers.h"  #include "ec-combine.h"  #include "ec-common.h" diff --git 
a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index 4a2a11f4ccd..965bc1e25ef 100644 --- a/xlators/cluster/ec/src/ec-data.h +++ b/xlators/cluster/ec/src/ec-data.h @@ -11,312 +11,7 @@  #ifndef __EC_DATA_H__  #define __EC_DATA_H__ -#include "xlator.h" - -#include "ec.h" - -struct _ec_config; -typedef struct _ec_config ec_config_t; - -struct _ec_fd; -typedef struct _ec_fd ec_fd_t; - -struct _ec_inode; -typedef struct _ec_inode ec_inode_t; - -union _ec_cbk; -typedef union _ec_cbk ec_cbk_t; - -struct _ec_lock; -typedef struct _ec_lock ec_lock_t; - -struct _ec_lock_link; -typedef struct _ec_lock_link ec_lock_link_t; - -struct _ec_fop_data; -typedef struct _ec_fop_data ec_fop_data_t; - -struct _ec_cbk_data; -typedef struct _ec_cbk_data ec_cbk_data_t; - -struct _ec_heal; -typedef struct _ec_heal ec_heal_t; - -typedef void (* ec_wind_f)(ec_t *, ec_fop_data_t *, int32_t); -typedef int32_t (* ec_handler_f)(ec_fop_data_t *, int32_t); -typedef void (* ec_resume_f)(ec_fop_data_t *, int32_t); - -struct _ec_config -{ -    uint32_t version; -    uint8_t  algorithm; -    uint8_t  gf_word_size; -    uint8_t  bricks; -    uint8_t  redundancy; -    uint32_t chunk_size; -}; - -struct _ec_fd -{ -    loc_t     loc; -    uintptr_t open; -    int32_t   flags; -}; - -struct _ec_inode -{ -    ec_lock_t        *inode_lock; -    gf_boolean_t      have_info; -    gf_boolean_t      have_config; -    gf_boolean_t      have_version; -    gf_boolean_t      have_size; -    ec_config_t       config; -    uint64_t          pre_version[2]; -    uint64_t          post_version[2]; -    uint64_t          pre_size; -    uint64_t          post_size; -    uint64_t          dirty[2]; -    struct list_head  heal; -}; - -typedef int32_t (* fop_heal_cbk_t)(call_frame_t *, void * cookie, xlator_t *, -                                   int32_t, int32_t, uintptr_t, uintptr_t, -                                   uintptr_t, dict_t *); -typedef int32_t (* fop_fheal_cbk_t)(call_frame_t *, void * 
cookie, xlator_t *, -                                    int32_t, int32_t, uintptr_t, uintptr_t, -                                    uintptr_t, dict_t *); - -union _ec_cbk -{ -    fop_access_cbk_t       access; -    fop_create_cbk_t       create; -    fop_discard_cbk_t      discard; -    fop_entrylk_cbk_t      entrylk; -    fop_fentrylk_cbk_t     fentrylk; -    fop_fallocate_cbk_t    fallocate; -    fop_flush_cbk_t        flush; -    fop_fsync_cbk_t        fsync; -    fop_fsyncdir_cbk_t     fsyncdir; -    fop_getxattr_cbk_t     getxattr; -    fop_fgetxattr_cbk_t    fgetxattr; -    fop_heal_cbk_t         heal; -    fop_fheal_cbk_t        fheal; -    fop_inodelk_cbk_t      inodelk; -    fop_finodelk_cbk_t     finodelk; -    fop_link_cbk_t         link; -    fop_lk_cbk_t           lk; -    fop_lookup_cbk_t       lookup; -    fop_mkdir_cbk_t        mkdir; -    fop_mknod_cbk_t        mknod; -    fop_open_cbk_t         open; -    fop_opendir_cbk_t      opendir; -    fop_readdir_cbk_t      readdir; -    fop_readdirp_cbk_t     readdirp; -    fop_readlink_cbk_t     readlink; -    fop_readv_cbk_t        readv; -    fop_removexattr_cbk_t  removexattr; -    fop_fremovexattr_cbk_t fremovexattr; -    fop_rename_cbk_t       rename; -    fop_rmdir_cbk_t        rmdir; -    fop_setattr_cbk_t      setattr; -    fop_fsetattr_cbk_t     fsetattr; -    fop_setxattr_cbk_t     setxattr; -    fop_fsetxattr_cbk_t    fsetxattr; -    fop_stat_cbk_t         stat; -    fop_fstat_cbk_t        fstat; -    fop_statfs_cbk_t       statfs; -    fop_symlink_cbk_t      symlink; -    fop_truncate_cbk_t     truncate; -    fop_ftruncate_cbk_t    ftruncate; -    fop_unlink_cbk_t       unlink; -    fop_writev_cbk_t       writev; -    fop_xattrop_cbk_t      xattrop; -    fop_fxattrop_cbk_t     fxattrop; -    fop_zerofill_cbk_t     zerofill; -    fop_seek_cbk_t         seek; -}; - -struct _ec_lock -{ -    ec_inode_t        *ctx; -    gf_timer_t        *timer; - -    /* List of owners of this lock. 
All fops added to this list are running -     * concurrently. */ -    struct list_head   owners; - -    /* List of fops waiting to be an owner of the lock. Fops are added to this -     * list when the current owner has an incompatible access (shared vs -     * exclusive) or the lock is not acquired yet. */ -    struct list_head   waiting; - -    /* List of fops that will wait until the next unlock/lock cycle. This -     * happens when the currently acquired lock is decided to be released as -     * soon as possible. In this case, all frozen fops will be continued only -     * after the lock is reacquired. */ -    struct list_head   frozen; - -    int32_t            exclusive; -    uintptr_t          mask; -    uintptr_t          good_mask; -    uintptr_t          healing; -    uint32_t           refs_owners;  /* Refs for fops owning the lock */ -    uint32_t           refs_pending; /* Refs assigned to fops being prepared */ -    gf_boolean_t       acquired; -    gf_boolean_t       getting_size; -    gf_boolean_t       release; -    gf_boolean_t       query; -    fd_t              *fd; -    loc_t              loc; -    union -    { -        entrylk_type     type; -        struct gf_flock  flock; -    }; -}; - -struct _ec_lock_link -{ -    ec_lock_t        *lock; -    ec_fop_data_t    *fop; -    struct list_head  owner_list; -    struct list_head  wait_list; -    gf_boolean_t      update[2]; -    loc_t            *base; -    uint64_t          size; -}; - -struct _ec_fop_data -{ -    int32_t            id; -    int32_t            refs; -    int32_t            state; -    int32_t            minimum; -    int32_t            expected; -    int32_t            winds; -    int32_t            jobs; -    int32_t            error; -    ec_fop_data_t     *parent; -    xlator_t          *xl; -    call_frame_t      *req_frame;    /* frame of the calling xlator */ -    call_frame_t      *frame;        /* frame used by this fop */ -    struct list_head   cbk_list;     /* sorted 
list of groups of answers */ -    struct list_head   answer_list;  /* list of answers */ -    struct list_head   pending_list; /* member of ec_t.pending_fops */ -    ec_cbk_data_t     *answer;       /* accepted answer */ -    int32_t            lock_count; -    int32_t            locked; -    ec_lock_link_t     locks[2]; -    int32_t            first_lock; -    gf_lock_t          lock; - -    uint32_t           flags; -    uint32_t           first; -    uintptr_t          mask; -    uintptr_t          healing; /*Dispatch is done but call is successful only -                                  if fop->minimum number of subvolumes succeed -                                  which are not healing*/ -    uintptr_t          remaining; -    uintptr_t          received; /* Mask of responses */ -    uintptr_t          good; - -    uid_t              uid; -    gid_t              gid; - -    ec_wind_f          wind; -    ec_handler_f       handler; -    ec_resume_f        resume; -    ec_cbk_t           cbks; -    void              *data; -    ec_heal_t         *heal; -    struct list_head   healer; - -    uint64_t           user_size; -    uint32_t           head; - -    int32_t            use_fd; - -    dict_t            *xdata; -    dict_t            *dict; -    int32_t            int32; -    uint32_t           uint32; -    uint64_t           size; -    off_t              offset; -    mode_t             mode[2]; -    entrylk_cmd        entrylk_cmd; -    entrylk_type       entrylk_type; -    gf_xattrop_flags_t xattrop_flags; -    dev_t              dev; -    inode_t           *inode; -    fd_t              *fd; -    struct iatt        iatt; -    char              *str[2]; -    loc_t              loc[2]; -    struct gf_flock    flock; -    struct iovec      *vector; -    struct iobref     *buffers; -    gf_seek_what_t     seek; -}; - -struct _ec_cbk_data -{ -    struct list_head list;        // item in the sorted list of groups -    struct list_head answer_list; // item in the 
list of answers -    ec_fop_data_t *  fop; -    ec_cbk_data_t *  next;        // next answer in the same group -    int32_t          idx; -    int32_t          op_ret; -    int32_t          op_errno; -    int32_t          count; -    uintptr_t        mask; -    uint64_t         dirty[2]; - -    dict_t *         xdata; -    dict_t *         dict; -    int32_t          int32; -    uintptr_t        uintptr[3]; -    uint64_t         size; -    uint64_t         version[2]; -    inode_t *        inode; -    fd_t *           fd; -    struct statvfs   statvfs; -    struct iatt      iatt[5]; -    struct gf_flock  flock; -    struct iovec *   vector; -    struct iobref *  buffers; -    char            *str; -    gf_dirent_t      entries; -    off_t            offset; -    gf_seek_what_t   what; -}; - -struct _ec_heal -{ -    struct list_head  list; -    gf_lock_t         lock; -    xlator_t         *xl; -    ec_fop_data_t    *fop; -    void             *data; -    ec_fop_data_t    *lookup; -    loc_t             loc; -    struct iatt       iatt; -    char             *symlink; -    fd_t             *fd; -    int32_t           partial; -    int32_t           done; -    int32_t           error; -    gf_boolean_t      nameheal; -    uintptr_t         available; -    uintptr_t         good; -    uintptr_t         bad; -    uintptr_t         open; -    uintptr_t         fixed; -    uint64_t          offset; -    uint64_t          size; -    uint64_t          total_size; -    uint64_t          version[2]; -    uint64_t          raw_size; -}; +#include "ec-types.h"  ec_cbk_data_t * ec_cbk_data_allocate(call_frame_t * frame, xlator_t * this,                                       ec_fop_data_t * fop, int32_t id, @@ -332,4 +27,6 @@ void ec_fop_data_release(ec_fop_data_t * fop);  void ec_fop_cleanup(ec_fop_data_t *fop); +void ec_pending_fops_completed(ec_t *ec); +  #endif /* __EC_DATA_H__ */ diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c index 
ed53a0416e0..4fe82e3c0b6 100644 --- a/xlators/cluster/ec/src/ec-dir-read.c +++ b/xlators/cluster/ec/src/ec-dir-read.c @@ -11,12 +11,13 @@  #include "xlator.h"  #include "defaults.h" +#include "ec.h" +#include "ec-messages.h"  #include "ec-helpers.h"  #include "ec-common.h"  #include "ec-combine.h"  #include "ec-method.h"  #include "ec-fops.h" -#include "ec-messages.h"  /* FOP: opendir */ diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c index e181170650d..1272e3dfe0d 100644 --- a/xlators/cluster/ec/src/ec-dir-write.c +++ b/xlators/cluster/ec/src/ec-dir-write.c @@ -11,12 +11,13 @@  #include "xlator.h"  #include "defaults.h" +#include "ec.h" +#include "ec-messages.h"  #include "ec-helpers.h"  #include "ec-common.h"  #include "ec-combine.h"  #include "ec-method.h"  #include "ec-fops.h" -#include "ec-messages.h"  int  ec_dir_write_cbk (call_frame_t *frame, xlator_t *this, diff --git a/xlators/cluster/ec/src/ec-fops.h b/xlators/cluster/ec/src/ec-fops.h index 8d938427a18..13f419b5a30 100644 --- a/xlators/cluster/ec/src/ec-fops.h +++ b/xlators/cluster/ec/src/ec-fops.h @@ -13,7 +13,7 @@  #include "xlator.h" -#include "ec-data.h" +#include "ec-types.h"  #include "ec-common.h"  void ec_access(call_frame_t * frame, xlator_t * this, uintptr_t target, diff --git a/xlators/cluster/ec/src/ec-galois.c b/xlators/cluster/ec/src/ec-galois.c new file mode 100644 index 00000000000..7dbbac09713 --- /dev/null +++ b/xlators/cluster/ec/src/ec-galois.c @@ -0,0 +1,185 @@ +/* +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#include <string.h> + +#include "mem-pool.h" +#include "list.h" + +#include "ec-mem-types.h" +#include "ec-gf8.h" + +static ec_gf_t * +ec_gf_alloc(uint32_t bits, uint32_t mod) +{ +    ec_gf_t *gf; + +    gf = GF_MALLOC(sizeof(ec_gf_t), ec_mt_ec_gf_t); +    if (gf == NULL) { +        goto failed; +    } + +    gf->bits = bits; +    gf->size = 1 << bits; +    gf->mod = mod; + +    gf->log = GF_MALLOC(sizeof(uint32_t) * (gf->size * 2 - 1), +                        gf_common_mt_int); +    if (gf->log == NULL) { +        goto failed_gf; +    } +    gf->pow = GF_MALLOC(sizeof(uint32_t) * (gf->size * 2 - 1), +                        gf_common_mt_int); +    if (gf->pow == NULL) { +        goto failed_log; +    } + +    return gf; + +failed_log: +    GF_FREE(gf->log); +failed_gf: +    GF_FREE(gf); +failed: +    return NULL; +} + +static void +ec_gf_init_tables(ec_gf_t *gf) +{ +    uint32_t i, tmp; + +    memset(gf->log, -1, sizeof(uint32_t) * gf->size); + +    gf->pow[0] = 1; +    gf->log[0] = gf->size; +    gf->log[1] = 0; +    for (i = 1; i < gf->size; i++) { +        tmp = gf->pow[i - 1] << 1; +        if (tmp >= gf->size) { +            tmp ^= gf->mod; +        } +        gf->pow[i + gf->size - 1] = gf->pow[i] = tmp; +        gf->log[tmp + gf->size - 1] = gf->log[tmp] = i; +    } +} + +ec_gf_t * +ec_gf_prepare(uint32_t bits, uint32_t mod) +{ +    ec_gf_mul_t **tbl; +    ec_gf_t *gf; +    uint32_t i, j; + +    if (bits != 8) { +        return NULL; +    } + +    tbl = ec_gf8_mul; +    if (mod == 0) { +        mod = 0x11d; +    } + +    gf = ec_gf_alloc(bits, mod); +    if (gf == NULL) { +        return NULL; +    } +    ec_gf_init_tables(gf); + +    gf->table = tbl; +    gf->min_ops = bits * bits; +    gf->max_ops = 0; +    gf->avg_ops = 0; +    for (i = 1; i < gf->size; i++) { +        for (j = 0; tbl[i]->ops[j].op != EC_GF_OP_END; j++) { +        } +        if (gf->max_ops < j) { +            gf->max_ops = j; +        } +        if (gf->min_ops > j) { +          
  gf->min_ops = j; +        } +        gf->avg_ops += j; +    } +    gf->avg_ops /= gf->size; + +    return gf; +} + +void +ec_gf_destroy(ec_gf_t *gf) +{ +    GF_FREE(gf->pow); +    GF_FREE(gf->log); +    GF_FREE(gf); +} + +uint32_t +ec_gf_add(ec_gf_t *gf, uint32_t a, uint32_t b) +{ +    if ((a >= gf->size) || (b >= gf->size)) { +        return gf->size; +    } + +    return a ^ b; +} + +uint32_t +ec_gf_mul(ec_gf_t *gf, uint32_t a, uint32_t b) +{ +    if ((a >= gf->size) || (b >= gf->size)) { +        return gf->size; +    } + +    if ((a != 0) && (b != 0)) { +        return gf->pow[gf->log[a] + gf->log[b]]; +    } + +    return 0; +} + +uint32_t +ec_gf_div(ec_gf_t *gf, uint32_t a, uint32_t b) +{ +    if ((a >= gf->size) || (b >= gf->size)) { +        return gf->size; +    } + +    if (b != 0) { +        if (a != 0) { +            return gf->pow[gf->size - 1 + gf->log[a] - gf->log[b]]; +        } + +        return 0; +    } + +    return gf->size; +} + +uint32_t +ec_gf_exp(ec_gf_t *gf, uint32_t a, uint32_t b) +{ +    uint32_t r; + +    if ((a >= gf->size) || ((a == 0) && (b == 0))) { +        return gf->size; +    } + +    r = 1; +    while (b != 0) { +        if ((b & 1) != 0) { +            r = ec_gf_mul(gf, r, a); +        } +        a = ec_gf_mul(gf, a, a); +        b >>= 1; +    } + +    return r; +} diff --git a/xlators/cluster/ec/src/ec-galois.h b/xlators/cluster/ec/src/ec-galois.h new file mode 100644 index 00000000000..02e6b6c1bc2 --- /dev/null +++ b/xlators/cluster/ec/src/ec-galois.h @@ -0,0 +1,26 @@ +/* +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#ifndef __EC_GALOIS_H__ +#define __EC_GALOIS_H__ + +#include <inttypes.h> + +#include "ec-types.h" + +ec_gf_t *ec_gf_prepare(uint32_t bits, uint32_t mod); +void ec_gf_destroy(ec_gf_t *gf); + +uint32_t ec_gf_add(ec_gf_t *gf, uint32_t a, uint32_t b); +uint32_t ec_gf_mul(ec_gf_t *gf, uint32_t a, uint32_t b); +uint32_t ec_gf_div(ec_gf_t *gf, uint32_t a, uint32_t b); +uint32_t ec_gf_exp(ec_gf_t *gf, uint32_t a, uint32_t b); + +#endif /* __EC_GALOIS_H__ */ diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c index 0ad514908aa..d67420469a8 100644 --- a/xlators/cluster/ec/src/ec-generic.c +++ b/xlators/cluster/ec/src/ec-generic.c @@ -10,14 +10,15 @@  #include "xlator.h"  #include "defaults.h" +#include "byte-order.h" +#include "ec.h" +#include "ec-messages.h"  #include "ec-helpers.h"  #include "ec-common.h"  #include "ec-combine.h"  #include "ec-method.h"  #include "ec-fops.h" -#include "ec-messages.h" -#include "byte-order.h"  /* FOP: flush */ diff --git a/xlators/cluster/ec/src/ec-gf.c b/xlators/cluster/ec/src/ec-gf.c deleted file mode 100644 index 1ae8928f20b..00000000000 --- a/xlators/cluster/ec/src/ec-gf.c +++ /dev/null @@ -1,11635 +0,0 @@ -/* -  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> -  This file is part of GlusterFS. - -  This file is licensed to you under your choice of the GNU Lesser -  General Public License, version 3 or any later version (LGPLv3 or -  later), or the GNU General Public License, version 2 (GPLv2), in all -  cases as published by the Free Software Foundation. 
-*/ - -#include <inttypes.h> -#include <string.h> - -#include "ec-gf.h" - -static void gf8_muladd_00(uint8_t * out, uint8_t * in, unsigned int width) -{ -    memcpy(out, in, sizeof(uint64_t) * 8 * width); -} - -static void gf8_muladd_01(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        out_ptr[0] ^= in_ptr[0]; -        out_ptr[width] ^= in_ptr[width]; -        out_ptr[width * 2] ^= in_ptr[width * 2]; -        out_ptr[width * 3] ^= in_ptr[width * 3]; -        out_ptr[width * 4] ^= in_ptr[width * 4]; -        out_ptr[width * 5] ^= in_ptr[width * 5]; -        out_ptr[width * 6] ^= in_ptr[width * 6]; -        out_ptr[width * 7] ^= in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_02(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in7; -        out1 = in0; -        out7 = in6; -        out5 = in4; -        out6 = in5; -        out3 = in2 ^ in7; -        out4 = in3 ^ in7; -        out2 = in1 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 
5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_03(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in0 ^ in7; -        tmp0 = in2 ^ in7; -        out1 = in0 ^ in1; -        out7 = in6 ^ in7; -        out5 = in4 ^ in5; -        out6 = in5 ^ in6; -        out4 = in3 ^ in4 ^ in7; -        out2 = tmp0 ^ in1; -        out3 = tmp0 ^ in3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_04(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -  
      uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in6; -        out1 = in7; -        out7 = in5; -        out6 = in4; -        tmp0 = in6 ^ in7; -        out2 = in0 ^ in6; -        out5 = in3 ^ in7; -        out3 = tmp0 ^ in1; -        out4 = tmp0 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_05(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in0 ^ in6; -        out1 = in1 ^ in7; -        out7 = in5 ^ in7; -        out6 = in4 ^ in6; -        out2 = out0 ^ in2; -        out3 = out1 ^ in3 ^ in6; -        out5 = out7 ^ in3; -        out4 = out6 ^ in2 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        
out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_06(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in6 ^ in7; -        tmp0 = in1 ^ in6; -        out1 = in0 ^ in7; -        out7 = in5 ^ in6; -        out6 = in4 ^ in5; -        out4 = in2 ^ in3 ^ in6; -        out5 = in3 ^ in4 ^ in7; -        out3 = tmp0 ^ in2; -        out2 = tmp0 ^ out1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_07(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; 
- -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in6; -        tmp1 = in5 ^ in6; -        tmp2 = in0 ^ in7; -        tmp3 = tmp0 ^ in3; -        out6 = tmp1 ^ in4; -        out7 = tmp1 ^ in7; -        out0 = tmp2 ^ in6; -        out1 = tmp2 ^ in1; -        out3 = tmp3 ^ in1; -        out4 = tmp3 ^ in4; -        out5 = out4 ^ out7 ^ in2; -        out2 = tmp0 ^ out1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_08(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in5; -        out1 = in6; -        out7 = in4; -        out6 = in3 ^ in7; -        out3 = in0 ^ in5 ^ in6; -        out5 = in2 ^ in6 ^ in7; -        out2 = in5 ^ in7; -        out4 = 
out2 ^ in1 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_09(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in0 ^ in5; -        tmp0 = in3 ^ in6; -        out1 = in1 ^ in6; -        out7 = in4 ^ in7; -        out2 = in2 ^ in5 ^ in7; -        out3 = tmp0 ^ out0; -        out6 = tmp0 ^ in7; -        out4 = out1 ^ out7 ^ in5; -        out5 = out2 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_0A(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -  
  uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in5 ^ in7; -        out1 = in0 ^ in6; -        out7 = in4 ^ in6; -        out2 = in1 ^ in5; -        out6 = out0 ^ in3; -        out3 = out0 ^ out1 ^ in2; -        out5 = out7 ^ in2 ^ in7; -        out4 = out2 ^ in3 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_0B(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in5; -        tmp1 = in0 ^ in6; -        tmp2 = in4 ^ 
in7; -        out0 = in0 ^ in5 ^ in7; -        out2 = tmp0 ^ in1; -        out1 = tmp1 ^ in1; -        out6 = tmp1 ^ out0 ^ in3; -        out7 = tmp2 ^ in6; -        out4 = tmp2 ^ out6 ^ in1; -        out3 = out6 ^ in0 ^ in2; -        out5 = tmp0 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_0C(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in5 ^ in6; -        out1 = in6 ^ in7; -        out7 = in4 ^ in5; -        tmp0 = in1 ^ in5; -        tmp1 = in0 ^ in7; -        out5 = in2 ^ in3 ^ in6; -        out6 = in3 ^ in4 ^ in7; -        out2 = tmp1 ^ out0; -        out4 = tmp0 ^ in2; -        out3 = tmp0 ^ tmp1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 
5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_0D(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in4 ^ in5; -        tmp1 = in5 ^ in6; -        out1 = in1 ^ in6 ^ in7; -        out7 = tmp0 ^ in7; -        out4 = tmp0 ^ in1 ^ in2; -        out0 = tmp1 ^ in0; -        tmp2 = tmp1 ^ in3; -        out6 = tmp2 ^ out7; -        out2 = out0 ^ in2 ^ in7; -        out3 = out0 ^ out1 ^ in3; -        out5 = tmp2 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_0E(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        
uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in1; -        tmp1 = in2 ^ in5; -        tmp2 = in5 ^ in6; -        out1 = in0 ^ in6 ^ in7; -        out3 = tmp0 ^ tmp1; -        out2 = tmp0 ^ tmp2; -        tmp3 = tmp1 ^ in3; -        out7 = tmp2 ^ in4; -        out0 = tmp2 ^ in7; -        out4 = tmp3 ^ in1 ^ in7; -        out5 = tmp3 ^ out7; -        out6 = out0 ^ out5 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_0F(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in6 ^ in7; -        tmp1 = tmp0 ^ in1; -        tmp2 = tmp0 ^ in5; -        out1 = tmp1 ^ in0; -        out7 = tmp2 ^ in4; -        out0 = 
tmp2 ^ in0; -        out6 = out7 ^ in3; -        out5 = out6 ^ in2 ^ in7; -        tmp3 = tmp1 ^ out0 ^ in2; -        out4 = tmp1 ^ out5; -        out2 = tmp3 ^ in6; -        out3 = tmp3 ^ in3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_10(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in4; -        out1 = in5; -        out7 = in3 ^ in7; -        tmp0 = in6 ^ in7; -        out2 = in4 ^ in6; -        tmp1 = out2 ^ in5; -        out6 = tmp0 ^ in2; -        out3 = tmp0 ^ tmp1; -        out5 = out2 ^ out3 ^ in1; -        out4 = tmp1 ^ in0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        
out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_11(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out7 = in3; -        out0 = in0 ^ in4; -        out1 = in1 ^ in5; -        out6 = in2 ^ in7; -        out4 = in0 ^ in5 ^ in6; -        out5 = in1 ^ in6 ^ in7; -        out2 = in2 ^ in4 ^ in6; -        out3 = in3 ^ in4 ^ in5 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_12(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        
uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in4 ^ in7; -        out1 = in0 ^ in5; -        out3 = in2 ^ in4 ^ in5; -        tmp0 = out0 ^ in6; -        out2 = tmp0 ^ in1; -        tmp1 = tmp0 ^ in3; -        out6 = tmp0 ^ out3; -        out5 = out2 ^ in5; -        out7 = tmp1 ^ in4; -        out4 = tmp1 ^ out1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_13(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out7 = in3 ^ in6; -        tmp0 = in0 ^ in5; -        tmp1 = in4 ^ in7; -        out6 = in2 ^ in5 ^ in7; -        out4 = tmp0 ^ out7 ^ in7; -        out1 = tmp0 ^ in1; -        out0 = tmp1 ^ in0; -        out5 = tmp1 ^ in1 ^ in6; -        out3 = tmp1 ^ out6 ^ in3; -        out2 = out5 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 
2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_14(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in4 ^ in6; -        out1 = in5 ^ in7; -        out2 = in0 ^ in4; -        tmp0 = out0 ^ in5; -        out7 = out1 ^ in3; -        tmp1 = out1 ^ in2; -        out3 = tmp0 ^ in1; -        out6 = tmp0 ^ tmp1; -        out4 = tmp1 ^ out2; -        out5 = out3 ^ in3 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_15(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -      
  uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out7 = in3 ^ in5; -        tmp0 = in0 ^ in4; -        out1 = in1 ^ in5 ^ in7; -        out5 = in1 ^ in3 ^ in6; -        out0 = tmp0 ^ in6; -        out2 = tmp0 ^ in2; -        out3 = out5 ^ in4 ^ in5; -        out6 = out2 ^ in0 ^ in7; -        out4 = tmp0 ^ out6 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_16(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in5; -        tmp1 = in4 ^ in7; -        tmp2 = in2 ^ in3 ^ in4; -        out1 = tmp0 ^ in7; -        out4 = tmp0 
^ tmp2; -        out0 = tmp1 ^ in6; -        tmp3 = tmp1 ^ in1; -        out6 = out0 ^ in2 ^ in5; -        out2 = tmp3 ^ in0; -        out3 = out6 ^ in1; -        out7 = tmp2 ^ out6; -        out5 = tmp3 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_17(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in5; -        tmp1 = in3 ^ in6; -        tmp2 = tmp0 ^ in4; -        out4 = tmp0 ^ in0 ^ in3; -        out7 = tmp1 ^ in5; -        tmp3 = tmp1 ^ in1; -        out6 = tmp2 ^ in7; -        out5 = tmp3 ^ in4; -        out3 = tmp3 ^ out6; -        out0 = out3 ^ out4 ^ in1; -        out2 = out3 ^ out7 ^ in0; -        out1 = tmp2 ^ out2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        
out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_18(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in4 ^ in5; -        out1 = in5 ^ in6; -        tmp0 = in4 ^ in7; -        out5 = in1 ^ in2 ^ in5; -        out6 = in2 ^ in3 ^ in6; -        out2 = tmp0 ^ out1; -        out7 = tmp0 ^ in3; -        tmp1 = tmp0 ^ in0; -        out3 = tmp1 ^ in6; -        out4 = tmp1 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_19(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = 
out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out5 = in1 ^ in2; -        out7 = in3 ^ in4; -        tmp0 = in0 ^ in7; -        out6 = in2 ^ in3; -        out1 = in1 ^ in5 ^ in6; -        out0 = in0 ^ in4 ^ in5; -        out4 = tmp0 ^ in1; -        tmp1 = tmp0 ^ in6; -        out2 = tmp1 ^ out0 ^ in2; -        out3 = tmp1 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_1A(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in4 ^ in5; -        tmp1 = in5 ^ in6; -        tmp2 = tmp0 ^ in1; -        out0 = tmp0 ^ in7; -        out1 = tmp1 ^ in0; -        tmp3 = tmp1 ^ in3; -        out5 = tmp2 ^ in2; -        out2 = tmp2 ^ in6; -        
out7 = tmp3 ^ out0; -        out6 = tmp3 ^ in2; -        out4 = tmp3 ^ out2 ^ in0; -        out3 = tmp0 ^ out1 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_1B(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in4; -        tmp1 = in2 ^ in5; -        tmp2 = in3 ^ in6; -        out5 = tmp0 ^ in1; -        tmp3 = tmp0 ^ in0; -        out6 = tmp1 ^ in3; -        out0 = tmp1 ^ tmp3 ^ in7; -        out7 = tmp2 ^ in4; -        tmp4 = out5 ^ in6; -        out3 = tmp2 ^ tmp3; -        out2 = tmp4 ^ in5; -        out4 = tmp4 ^ out3; -        out1 = tmp3 ^ out2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = 
out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_1C(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in3; -        tmp1 = in4 ^ in6; -        tmp2 = in5 ^ in7; -        out6 = tmp0 ^ tmp1; -        out0 = tmp1 ^ in5; -        out1 = tmp2 ^ in6; -        tmp3 = tmp2 ^ in1; -        tmp4 = tmp2 ^ in4; -        out2 = tmp4 ^ in0; -        out7 = tmp4 ^ in3; -        out5 = tmp0 ^ tmp3; -        out3 = tmp3 ^ out2; -        out4 = out3 ^ in2 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_1D(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, 
tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in3; -        tmp1 = in0 ^ in4; -        tmp2 = in3 ^ in4; -        tmp3 = in2 ^ in7; -        out3 = tmp0 ^ tmp1; -        out5 = tmp0 ^ tmp3; -        tmp4 = tmp1 ^ in5; -        out6 = tmp2 ^ in2; -        out7 = tmp2 ^ in5; -        out2 = tmp3 ^ tmp4; -        out4 = out3 ^ out6 ^ in6; -        out0 = tmp4 ^ in6; -        out1 = out2 ^ out4 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_1E(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in4; -        tmp1 = in2 ^ in7; -        tmp2 = tmp0 ^ in1; -        out3 = tmp1 ^ 
tmp2; -        out2 = tmp2 ^ in5; -        out4 = out3 ^ in3 ^ in6; -        tmp3 = out4 ^ in7; -        out6 = tmp3 ^ out2 ^ in4; -        out7 = tmp1 ^ out6; -        out0 = out7 ^ in3; -        out1 = tmp0 ^ out0; -        out5 = tmp3 ^ out1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_1F(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in4 ^ in6; -        tmp1 = tmp0 ^ in5; -        out7 = tmp1 ^ in3; -        out0 = tmp1 ^ in0 ^ in7; -        out6 = out7 ^ in2 ^ in6; -        out1 = out0 ^ in1 ^ in4; -        out4 = out0 ^ out6 ^ in1; -        out3 = tmp0 ^ out4; -        out2 = out4 ^ out7 ^ in7; -        out5 = out3 ^ in0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ 
in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_20(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out1 = in4; -        out0 = in3 ^ in7; -        tmp0 = in3 ^ in4; -        tmp1 = in6 ^ in7; -        out2 = out0 ^ in5; -        out4 = tmp0 ^ in5; -        out3 = tmp0 ^ tmp1; -        out7 = tmp1 ^ in2; -        out6 = tmp1 ^ in1 ^ in5; -        out5 = out2 ^ out3 ^ in0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_21(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = 
out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out1 = in1 ^ in4; -        tmp0 = in4 ^ in6; -        out4 = in3 ^ in5; -        out7 = in2 ^ in6; -        out0 = in0 ^ in3 ^ in7; -        out6 = in1 ^ in5 ^ in7; -        out3 = tmp0 ^ in7; -        out5 = tmp0 ^ in0; -        out2 = out4 ^ in2 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_22(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in3; -        out1 = in0 ^ in4; -        out7 = in2 ^ in7; -        out4 = in4 ^ in5 ^ in7; -        out5 = in0 ^ in5 ^ in6; -        out6 = in1 ^ in6 ^ in7; -        out3 = in2 ^ in3 ^ in4 ^ in6; -        out2 = in1 ^ in3 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -  
      out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_23(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out7 = in2; -        out0 = in0 ^ in3; -        out4 = in5 ^ in7; -        out5 = in0 ^ in6; -        out6 = in1 ^ in7; -        out3 = in2 ^ in4 ^ in6; -        out1 = in0 ^ in1 ^ in4; -        out2 = out4 ^ out6 ^ in2 ^ in3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_24(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, 
out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out1 = in4 ^ in7; -        tmp0 = in3 ^ in4; -        out0 = in3 ^ in6 ^ in7; -        out3 = tmp0 ^ in1; -        tmp1 = out0 ^ in5; -        out6 = tmp1 ^ out3; -        out2 = tmp1 ^ in0; -        out7 = tmp1 ^ in2 ^ in3; -        out5 = out2 ^ in4; -        out4 = tmp0 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_25(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in1 ^ in4; -        tmp0 = in2 ^ in5; -        out1 = out3 ^ in7; -        out7 = tmp0 ^ in6; -        out6 = out1 ^ in5; -        out4 = out7 ^ in3 
^ in7; -        out2 = out4 ^ in0; -        out0 = tmp0 ^ out2; -        out5 = out0 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_26(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in3 ^ in6; -        tmp0 = in4 ^ in7; -        out7 = in2 ^ in5 ^ in7; -        tmp1 = out0 ^ in0 ^ in5; -        out1 = tmp0 ^ in0; -        tmp2 = tmp0 ^ in6; -        out2 = tmp1 ^ in1; -        out5 = tmp1 ^ in7; -        out6 = tmp2 ^ in1; -        out4 = tmp2 ^ out7; -        out3 = out0 ^ out6 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -   
     in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_27(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out7 = in2 ^ in5; -        out0 = in0 ^ in3 ^ in6; -        out6 = in1 ^ in4 ^ in7; -        out4 = out7 ^ in6; -        out2 = out0 ^ out7 ^ in1; -        out5 = out0 ^ in7; -        out1 = out6 ^ in0; -        out3 = out6 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_28(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        
uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in3; -        out1 = in4 ^ in6; -        out0 = in3 ^ in5 ^ in7; -        tmp0 = out1 ^ in7; -        tmp1 = out0 ^ in4; -        out7 = tmp0 ^ in2; -        tmp2 = tmp0 ^ in1; -        out3 = tmp1 ^ in0; -        out6 = tmp1 ^ tmp2; -        out4 = tmp2 ^ in3; -        out5 = out3 ^ in2 ^ in3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_29(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in2 ^ in3; -        tmp0 = in1 ^ in3; -        tmp1 = in4 ^ in6; -        tmp2 = in0 ^ in4 ^ in7; -        out6 = tmp0 ^ in5; -        out4 = tmp0 ^ in6 ^ in7; -        out1 = tmp1 ^ in1; -        out7 = tmp1 ^ in2; -        out3 = tmp2 ^ in5; -        out5 = tmp2 ^ in2; -        out0 = out3 ^ in3 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ 
in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_2A(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in3 ^ in5; -        tmp0 = in1 ^ in3; -        tmp1 = in0 ^ in4; -        out7 = in2 ^ in4 ^ in7; -        out3 = tmp1 ^ out0 ^ in2; -        out2 = tmp0 ^ in7; -        out6 = tmp0 ^ in6; -        out1 = tmp1 ^ in6; -        out5 = tmp1 ^ out7 ^ in5; -        out4 = out1 ^ in0 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_2B(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 
0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in1 ^ in6; -        out7 = in2 ^ in4; -        tmp0 = in0 ^ in5; -        tmp1 = in2 ^ in7; -        out6 = in1 ^ in3; -        out1 = out4 ^ in0 ^ in4; -        out3 = tmp0 ^ out7; -        out0 = tmp0 ^ in3; -        out5 = tmp1 ^ in0; -        out2 = tmp1 ^ out6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_2C(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in5; -        tmp1 = in2 ^ in3 ^ in4; -        tmp2 = tmp0 ^ in6; -     
   out4 = tmp1 ^ in1; -        out5 = tmp1 ^ in0 ^ in5; -        tmp3 = tmp2 ^ in4; -        out6 = tmp2 ^ out4; -        out7 = tmp3 ^ in7; -        out2 = tmp3 ^ out5; -        out3 = out6 ^ in0; -        out0 = tmp1 ^ out7; -        out1 = tmp0 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_2D(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in3; -        out4 = tmp0 ^ in1; -        tmp1 = tmp0 ^ in0; -        out2 = tmp1 ^ in6; -        out5 = tmp1 ^ in4; -        tmp2 = out2 ^ in2; -        tmp3 = tmp2 ^ in5; -        out0 = tmp3 ^ in7; -        out7 = tmp3 ^ out5; -        out6 = out4 ^ out7 ^ in6; -        out3 = tmp2 ^ out6; -        out1 = out0 ^ out6 ^ in0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = 
out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_2E(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in4 ^ in7; -        out0 = in3 ^ in5 ^ in6; -        tmp1 = tmp0 ^ in0; -        tmp2 = tmp0 ^ in2; -        out1 = tmp1 ^ in6; -        out4 = tmp2 ^ in1; -        out7 = tmp2 ^ in5; -        out3 = out0 ^ out4 ^ in0; -        out2 = out3 ^ out7 ^ in7; -        out6 = tmp1 ^ out2; -        out5 = tmp1 ^ out7 ^ in3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_2F(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, 
out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in3; -        tmp1 = in2 ^ in5; -        out4 = in1 ^ in2 ^ in7; -        out6 = in1 ^ in3 ^ in4; -        out5 = tmp0 ^ in2; -        tmp2 = tmp0 ^ in6; -        out7 = tmp1 ^ in4; -        out0 = tmp2 ^ in5; -        out2 = tmp2 ^ out4; -        out1 = tmp2 ^ out6 ^ in7; -        out3 = tmp1 ^ out1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_30(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out1 = in4 ^ in5; -        tmp0 = in3 ^ in6; -        tmp1 = in4 ^ in7; -        out6 = in1 ^ in2 ^ in5; -        
out3 = tmp0 ^ in5; -        out4 = tmp0 ^ in0; -        out7 = tmp0 ^ in2; -        out0 = tmp1 ^ in3; -        out2 = tmp1 ^ out3; -        out5 = tmp1 ^ in0 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_31(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in5 ^ in6; -        tmp0 = in4 ^ in5; -        tmp1 = in0 ^ in3 ^ in4; -        tmp2 = out3 ^ in2; -        out1 = tmp0 ^ in1; -        out0 = tmp1 ^ in7; -        out4 = tmp1 ^ in6; -        out6 = tmp2 ^ in1; -        out2 = tmp2 ^ out0 ^ in0; -        out5 = out1 ^ in0 ^ in7; -        out7 = tmp0 ^ out2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ 
in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_32(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in3 ^ in4; -        out7 = in2 ^ in3; -        tmp0 = in5 ^ in6; -        tmp1 = in0 ^ in7; -        out6 = in1 ^ in2; -        out1 = in0 ^ in4 ^ in5; -        out2 = tmp0 ^ out0 ^ in1; -        out3 = tmp0 ^ out7 ^ in7; -        out4 = tmp1 ^ in6; -        out5 = tmp1 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_33(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        
uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in3; -        tmp1 = in0 ^ in4; -        tmp2 = in1 ^ in5; -        out6 = in1 ^ in2 ^ in6; -        out7 = tmp0 ^ in7; -        out0 = tmp1 ^ in3; -        out1 = tmp1 ^ tmp2; -        tmp3 = tmp2 ^ in7; -        tmp4 = tmp2 ^ in4 ^ in6; -        out5 = tmp3 ^ in0; -        out3 = tmp3 ^ out6; -        out4 = tmp4 ^ out5; -        out2 = tmp0 ^ tmp4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_34(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in3 ^ in4; -        tmp1 = in4 ^ in5; -        tmp2 = tmp0 ^ in1; -        tmp3 = tmp0 ^ in6; -        out1 = tmp1 ^ in7; -        tmp4 = tmp1 ^ in2; -        out5 = tmp2 ^ in0; -        
out3 = tmp2 ^ out1; -        out0 = tmp3 ^ in7; -        out7 = tmp3 ^ tmp4; -        out6 = tmp4 ^ in1; -        out2 = out3 ^ out5 ^ in3; -        out4 = tmp4 ^ out2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_35(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in6; -        tmp1 = in5 ^ in7; -        out7 = tmp0 ^ tmp1 ^ in3; -        out3 = tmp1 ^ in1; -        out1 = out3 ^ in4; -        tmp2 = out1 ^ in7; -        out5 = tmp2 ^ in0 ^ in3; -        out6 = tmp0 ^ tmp2; -        out0 = out3 ^ out5 ^ in6; -        out4 = tmp0 ^ out0; -        out2 = out4 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ 
in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_36(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in0 ^ in2; -        tmp0 = in1 ^ in3; -        out0 = in3 ^ in4 ^ in6; -        out6 = in1 ^ in2 ^ in4; -        out5 = tmp0 ^ in0; -        tmp1 = out5 ^ in5; -        out2 = tmp1 ^ in4; -        out3 = tmp1 ^ out4; -        out1 = tmp0 ^ out2 ^ in7; -        out7 = out3 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_37(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t 
in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in2; -        tmp1 = in2 ^ in4; -        tmp2 = tmp0 ^ in6; -        out3 = tmp0 ^ in5; -        out4 = tmp1 ^ in0; -        out6 = tmp2 ^ in4; -        out1 = out3 ^ out4 ^ in7; -        tmp3 = out4 ^ in1 ^ in3; -        out7 = tmp3 ^ out1; -        out2 = tmp3 ^ in5; -        out5 = tmp1 ^ out2; -        out0 = tmp2 ^ tmp3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_38(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in0 ^ in3; -        tmp0 = in3 ^ in4; -        tmp1 = in5 ^ in7; -        tmp2 = out3 ^ in1; -        out2 = tmp0 ^ in6; -        out0 = tmp0 ^ tmp1; -        out4 = tmp1 ^ tmp2; -        out7 = out2 ^ in2; -        out1 = out2 ^ in3 
^ in5; -        out6 = out4 ^ in0 ^ in2; -        out5 = tmp2 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_39(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in0; -        tmp0 = in1 ^ in5; -        tmp1 = tmp0 ^ in4; -        out1 = tmp1 ^ in6; -        out5 = out1 ^ in0 ^ in2; -        tmp2 = tmp0 ^ out5; -        out2 = tmp2 ^ in0 ^ in3; -        out7 = out2 ^ in7; -        out6 = tmp1 ^ out7; -        out4 = tmp2 ^ out6; -        out0 = out4 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        
out_ptr++; -    } -} - -static void gf8_muladd_3A(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in1; -        tmp1 = in0 ^ in2; -        tmp2 = in3 ^ in4; -        tmp3 = in1 ^ in6; -        tmp4 = in3 ^ in7; -        out4 = tmp0 ^ in5; -        out5 = tmp1 ^ tmp3; -        out3 = tmp1 ^ tmp4; -        out0 = tmp2 ^ in5; -        out7 = tmp2 ^ in2; -        tmp5 = tmp3 ^ in4; -        out2 = tmp4 ^ tmp5; -        out1 = tmp5 ^ out4; -        out6 = tmp0 ^ out3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_3B(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        
uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in6; -        tmp1 = in2 ^ in7; -        tmp2 = tmp0 ^ in3; -        out3 = tmp1 ^ in0; -        out6 = tmp1 ^ tmp2; -        out2 = out6 ^ in4; -        out7 = tmp0 ^ out2; -        out0 = out3 ^ out7 ^ in5; -        out5 = out0 ^ out2 ^ in7; -        out1 = tmp2 ^ out0; -        out4 = out1 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_3C(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in3; -        tmp1 = in2 ^ in7; -        tmp2 = in1 ^ in6 ^ in7; -        out2 = tmp0 ^ in4; -        out3 = tmp0 ^ tmp2; -        out4 = tmp1 ^ out3 ^ in5; -        out5 = tmp2 ^ out2 ^ in2; -        out1 = out4 ^ out5 ^ in6; -        out0 = 
out1 ^ in3; -        out7 = tmp1 ^ out0; -        out6 = tmp2 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_3D(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in2; -        tmp1 = tmp0 ^ in3; -        out2 = tmp1 ^ in4; -        tmp2 = out2 ^ in5; -        out4 = tmp2 ^ in1 ^ in6; -        out5 = out4 ^ in7; -        out6 = out5 ^ in0; -        out7 = out6 ^ in1; -        out0 = tmp0 ^ out7; -        out1 = tmp1 ^ out5; -        out3 = tmp2 ^ out6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        
out_ptr++; -    } -} - -static void gf8_muladd_3E(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in3 ^ in5; -        tmp1 = tmp0 ^ in4; -        out0 = tmp1 ^ in6; -        out7 = tmp1 ^ in2; -        out6 = out7 ^ in1 ^ in5 ^ in7; -        out2 = out6 ^ in0 ^ in2; -        out4 = out0 ^ out6 ^ in0; -        out5 = tmp0 ^ out4; -        out3 = out5 ^ in7; -        out1 = out3 ^ out6 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_3F(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = 
out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in1; -        out3 = tmp0 ^ in2 ^ in6; -        tmp1 = out3 ^ in5 ^ in7; -        out4 = tmp1 ^ in4; -        out5 = tmp1 ^ in3; -        out1 = out4 ^ in2; -        out7 = out1 ^ out3 ^ in3; -        out2 = tmp0 ^ out7 ^ in5; -        tmp2 = out2 ^ in0; -        out6 = tmp2 ^ in6; -        out0 = tmp1 ^ tmp2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_40(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out1 = in3 ^ in7; -        tmp0 = in3 ^ in4; -        tmp1 = in6 ^ in7; -        out4 = tmp0 ^ in2; -        out5 = tmp0 ^ in5; -        out0 = tmp1 ^ in2; -        out7 = tmp1 ^ in1 ^ in5; -        out2 = out0 ^ in4; -        out3 = out2 ^ out5 ^ in7; -        out6 = out3 ^ out4 ^ in0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = 
out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_41(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in2 ^ in3; -        tmp0 = in5 ^ in6; -        tmp1 = in6 ^ in7; -        out5 = in3 ^ in4; -        out1 = in1 ^ in3 ^ in7; -        out6 = in0 ^ in4 ^ in5; -        out3 = tmp0 ^ in2; -        out7 = tmp0 ^ in1; -        out2 = tmp1 ^ in4; -        out0 = tmp1 ^ in0 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_42(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    
uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in2 ^ in6; -        out5 = in3 ^ in5; -        out1 = in0 ^ in3 ^ in7; -        out7 = in1 ^ in5 ^ in7; -        out4 = in2 ^ in4 ^ in7; -        out6 = in0 ^ in4 ^ in6; -        out2 = out0 ^ in1 ^ in4; -        out3 = out5 ^ in6 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_43(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out5 = in3; -        out7 = in1 ^ in5; -        out4 = in2 ^ in7; -        out6 = in0 ^ in4; -        out0 = in0 ^ in2 ^ in6; -     
   out3 = in5 ^ in6 ^ in7; -        out2 = in1 ^ in4 ^ in6; -        out1 = in0 ^ in1 ^ in3 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_44(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out1 = in3; -        out0 = in2 ^ in7; -        tmp0 = in4 ^ in7; -        out7 = in1 ^ in6 ^ in7; -        out6 = in0 ^ in5 ^ in6; -        out4 = tmp0 ^ in3 ^ in6; -        out3 = out0 ^ in1 ^ in3 ^ in5; -        out2 = out0 ^ in0 ^ in4; -        out5 = tmp0 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - 
-static void gf8_muladd_45(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out1 = in1 ^ in3; -        out7 = in1 ^ in6; -        out5 = in4 ^ in7; -        out6 = in0 ^ in5; -        out0 = in0 ^ in2 ^ in7; -        out4 = in3 ^ in6 ^ in7; -        out2 = out5 ^ in0; -        out3 = out0 ^ out6 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_46(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        
out0 = in2; -        out1 = in0 ^ in3; -        out7 = in1 ^ in7; -        out4 = in4 ^ in6; -        out5 = in5 ^ in7; -        out6 = in0 ^ in6; -        out3 = in1 ^ in3 ^ in5; -        out2 = out4 ^ out6 ^ in1 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_47(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in6; -        out7 = in1; -        out5 = in7; -        out6 = in0; -        tmp0 = in0 ^ in1; -        out3 = in1 ^ in5; -        out0 = in0 ^ in2; -        out1 = tmp0 ^ in3; -        out2 = tmp0 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ 
in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_48(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in3; -        out1 = in3 ^ in6 ^ in7; -        out3 = tmp0 ^ in0; -        out0 = tmp0 ^ out1 ^ in5; -        tmp1 = out0 ^ in4; -        out2 = tmp1 ^ in7; -        out5 = tmp1 ^ in3; -        out4 = out5 ^ in1; -        out7 = tmp0 ^ out4; -        out6 = tmp1 ^ out3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_49(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; - 
       uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in0 ^ in2; -        tmp0 = in2 ^ in5; -        out2 = in4 ^ in5 ^ in6; -        tmp1 = tmp0 ^ out2 ^ in3; -        out7 = out2 ^ in1; -        out5 = tmp1 ^ in7; -        out4 = out5 ^ out7 ^ in6; -        out1 = tmp0 ^ out4; -        out6 = out1 ^ out7 ^ in0; -        out0 = tmp1 ^ out6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_4A(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in6; -        tmp1 = in3 ^ in7; -        out0 = tmp0 ^ in5; -        out3 = tmp1 ^ in0; -        out5 = tmp1 ^ out0; -        out4 = out0 ^ in1 ^ in4; -        out1 = out3 ^ in6; -        out2 = out4 ^ in7; -        out6 = out1 ^ in4; -        out7 = tmp0 ^ out2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ 
in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_4B(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in0 ^ in7; -        tmp0 = in1 ^ in5; -        tmp1 = in2 ^ in6; -        tmp2 = out3 ^ in3; -        out7 = tmp0 ^ in4; -        out4 = tmp0 ^ tmp1; -        tmp3 = tmp1 ^ in0; -        out6 = tmp2 ^ in4; -        out5 = tmp2 ^ tmp3; -        out1 = tmp2 ^ in1 ^ in6; -        out2 = out7 ^ in6 ^ in7; -        out0 = tmp3 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_4C(uint8_t * out, uint8_t * in, unsigned int width) -{ -    
unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out1 = in3 ^ in6; -        tmp0 = in2 ^ in5; -        tmp1 = out1 ^ in5 ^ in7; -        out0 = tmp0 ^ in7; -        tmp2 = tmp0 ^ in4; -        out6 = tmp1 ^ in0; -        out2 = tmp2 ^ in0; -        out5 = tmp2 ^ in6; -        out3 = tmp0 ^ out6 ^ in1; -        out7 = out0 ^ out5 ^ in1; -        out4 = tmp1 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_4D(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = 
out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in5; -        tmp1 = in1 ^ in6; -        out4 = in1 ^ in3 ^ in5; -        tmp2 = tmp0 ^ in7; -        out2 = tmp0 ^ in4; -        out1 = tmp1 ^ in3; -        out7 = tmp1 ^ in4; -        out0 = tmp2 ^ in2; -        out6 = tmp2 ^ in3; -        out5 = out7 ^ in1 ^ in2; -        out3 = tmp1 ^ out0 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_4E(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in2 ^ in5; -        out7 = in1 ^ in4 ^ in7; -        out1 = in0 ^ in3 ^ in6; -        out5 = out0 ^ in6; -        out4 = out7 ^ in5; -        out3 = out1 ^ in1; -        out6 = out1 ^ in7; -        out2 = out4 ^ in0 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 
4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_4F(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out5 = in2 ^ in6; -        out7 = in1 ^ in4; -        out3 = in0 ^ in1 ^ in6; -        out4 = in1 ^ in5 ^ in7; -        out0 = in0 ^ in2 ^ in5; -        out6 = in0 ^ in3 ^ in7; -        out1 = out3 ^ in3; -        out2 = out4 ^ in0 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_50(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -     
   uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in2 ^ in7; -        tmp0 = in3 ^ in5; -        out0 = out2 ^ in4 ^ in6; -        out1 = tmp0 ^ in7; -        tmp1 = tmp0 ^ in6; -        out3 = out0 ^ in3; -        out7 = tmp1 ^ in1; -        tmp2 = tmp1 ^ in0; -        out5 = out3 ^ in1 ^ in2; -        out4 = tmp2 ^ in2; -        out6 = tmp2 ^ out3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_51(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in7; -        out3 = in2 ^ in4 ^ in6 ^ in7; -        out0 = out3 ^ in0; -        out6 = out0 ^ in5; -        out4 = out6 ^ in3 ^ in7; -        out1 = out0 ^ out4 ^ in1; -        out7 = out1 ^ in6; -        out5 = out7 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        
out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_52(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in1 ^ in2; -        tmp0 = in2 ^ in4; -        tmp1 = in3 ^ in5; -        tmp2 = in3 ^ in6; -        tmp3 = in0 ^ in7; -        out0 = tmp0 ^ in6; -        out6 = tmp0 ^ tmp3; -        out7 = tmp1 ^ in1; -        out1 = tmp1 ^ tmp3; -        out3 = tmp2 ^ in4; -        out5 = tmp2 ^ in1 ^ in7; -        out4 = tmp2 ^ out1 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_53(uint8_t * out, uint8_t * in, unsigned int 
width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in1; -        out3 = in4 ^ in6; -        out0 = out3 ^ in0 ^ in2; -        out6 = out0 ^ in7; -        out4 = out6 ^ in5; -        out7 = out0 ^ out4 ^ in1 ^ in3; -        out1 = out7 ^ in0; -        out5 = out7 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_54(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out1 = in3 ^ in5; -        tmp0 
= in1 ^ in3; -        tmp1 = in2 ^ in4; -        tmp2 = in0 ^ in7; -        out5 = in1 ^ in4 ^ in6; -        out4 = tmp2 ^ out1; -        out7 = tmp0 ^ in6; -        out3 = tmp0 ^ tmp1; -        out0 = tmp1 ^ in7; -        tmp3 = tmp2 ^ in2; -        out2 = tmp3 ^ in6; -        out6 = tmp3 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_55(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in3; -        tmp1 = in1 ^ in4; -        tmp2 = in6 ^ in7; -        out7 = tmp0 ^ tmp2; -        out1 = tmp0 ^ in5; -        out3 = tmp1 ^ in2; -        out5 = tmp1 ^ in5 ^ in6; -        out2 = tmp2 ^ in0; -        out4 = out5 ^ out7 ^ in0; -        out6 = out2 ^ in2 ^ in5; -        out0 = out5 ^ out6 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        
out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_56(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in2 ^ in4; -        tmp0 = in0 ^ in2; -        out4 = in0 ^ in5; -        out7 = in1 ^ in3; -        out5 = in1 ^ in6; -        out6 = tmp0 ^ in7; -        out2 = tmp0 ^ out5; -        out1 = out4 ^ in3; -        out3 = out7 ^ in4 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_57(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        
uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in5; -        tmp1 = in1 ^ in7; -        out0 = in0 ^ in2 ^ in4; -        out5 = in1 ^ in5 ^ in6; -        out4 = tmp0 ^ in4; -        out1 = tmp0 ^ in1 ^ in3; -        out2 = tmp0 ^ out5; -        out3 = tmp1 ^ in4; -        out7 = tmp1 ^ in3; -        out6 = tmp1 ^ out2 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_58(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in2 ^ in5; -        tmp0 = in2 ^ in3 ^ in4; -        out5 = tmp0 ^ in1; -        out6 = tmp0 ^ in0 ^ in5; -        out3 = out6 ^ in7; -        tmp1 = out2 ^ out5; -        out7 = tmp1 ^ in6; -        
out4 = tmp1 ^ out3 ^ in3; -        out0 = out4 ^ out7 ^ in0; -        out1 = tmp0 ^ out0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_59(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in5; -        tmp0 = in0 ^ in5 ^ in7; -        out3 = tmp0 ^ in2 ^ in4; -        out0 = out3 ^ in6; -        tmp1 = out0 ^ in7; -        out6 = tmp1 ^ in3; -        out5 = out6 ^ in0 ^ in1 ^ in6; -        out4 = tmp0 ^ out5; -        out1 = tmp1 ^ out4; -        out7 = out1 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; 
-    } -} - -static void gf8_muladd_5A(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in2; -        tmp1 = in2 ^ in5; -        out5 = tmp0 ^ in3; -        out4 = tmp0 ^ in0; -        tmp2 = tmp1 ^ in4; -        out2 = tmp1 ^ in1 ^ in7; -        out7 = tmp2 ^ out5; -        out6 = out4 ^ out7 ^ in5; -        out0 = tmp2 ^ in6; -        out1 = out0 ^ out6 ^ in7; -        out3 = tmp1 ^ out6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_5B(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -  
      uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in3; -        tmp1 = in0 ^ in4; -        tmp2 = in1 ^ in5; -        out5 = tmp0 ^ tmp2; -        tmp3 = tmp1 ^ in6; -        out3 = tmp1 ^ in5; -        out2 = tmp2 ^ in7; -        tmp4 = out3 ^ in2; -        out7 = out2 ^ in3 ^ in4; -        out0 = tmp4 ^ in6; -        out6 = tmp0 ^ tmp3; -        out4 = tmp2 ^ tmp4; -        out1 = tmp3 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_5C(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in3 ^ in6; -        tmp1 = in0 ^ in2 ^ in5; -        out1 = tmp0 ^ in5; -        tmp2 = tmp0 ^ in1; -        out2 = tmp1 ^ in6; -        out6 = tmp1 ^ in3; -        out4 = tmp2 ^ in0; -        out7 = tmp2 ^ in4; -        out3 = tmp1 ^ out7; -        out0 = out3 ^ out4 ^ in7; -        
out5 = out0 ^ in1 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_5D(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in1; -        tmp1 = in0 ^ in6; -        out2 = tmp1 ^ in5; -        tmp2 = out2 ^ in3; -        out6 = tmp2 ^ in2; -        out1 = tmp0 ^ tmp2; -        tmp3 = out1 ^ in4 ^ in5; -        out4 = tmp3 ^ in0; -        out7 = tmp3 ^ in7; -        tmp4 = out4 ^ out6; -        out5 = tmp4 ^ in7; -        out0 = tmp0 ^ out5; -        out3 = tmp1 ^ tmp4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        
in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_5E(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in5; -        tmp1 = in3 ^ in5; -        tmp2 = in1 ^ in7; -        out7 = in1 ^ in3 ^ in4; -        out0 = tmp0 ^ in4; -        tmp3 = tmp1 ^ in0; -        out5 = tmp2 ^ in2; -        out1 = tmp3 ^ in6; -        out6 = tmp0 ^ tmp3; -        tmp4 = tmp2 ^ out1; -        out3 = tmp4 ^ in4; -        out4 = tmp1 ^ tmp4; -        out2 = tmp0 ^ out4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_5F(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        
uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in5; -        tmp1 = in0 ^ in6; -        tmp2 = tmp0 ^ in7; -        tmp3 = tmp1 ^ in3; -        out2 = tmp1 ^ tmp2; -        out5 = tmp2 ^ in2; -        out6 = tmp3 ^ in2; -        out3 = out2 ^ in4; -        out4 = out3 ^ in5; -        out1 = tmp0 ^ tmp3; -        out7 = tmp3 ^ out4; -        out0 = out4 ^ out5 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_60(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in2 ^ in5; -        tmp0 = in3 ^ in6; -        out1 = in3 ^ in4 ^ in7; -        out7 = out4 ^ in1; -        tmp1 = out4 ^ in4; -        out0 = tmp0 ^ in2; -        out5 = tmp0 ^ in0; -        out2 = tmp0 ^ tmp1; -        out3 = tmp1 ^ 
in7; -        out6 = out3 ^ out7 ^ in0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_61(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in5; -        out4 = tmp0 ^ in4; -        tmp1 = out4 ^ in3; -        out3 = tmp1 ^ in7; -        out2 = tmp1 ^ in2 ^ in6; -        out1 = tmp0 ^ out3 ^ in1; -        out0 = out2 ^ out4 ^ in0; -        out7 = tmp1 ^ out1; -        out6 = out0 ^ out1 ^ in2; -        out5 = tmp0 ^ out0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void 
gf8_muladd_62(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in4 ^ in5; -        tmp0 = in0 ^ in3 ^ in4; -        out1 = tmp0 ^ in7; -        out5 = tmp0 ^ in6; -        tmp1 = out1 ^ in0; -        tmp2 = tmp1 ^ out3; -        out4 = tmp2 ^ in2; -        tmp3 = tmp2 ^ in1; -        out0 = out4 ^ in5 ^ in6; -        out7 = tmp3 ^ out0; -        out6 = tmp0 ^ tmp3; -        out2 = tmp1 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_63(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; 
-        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in3 ^ in4; -        tmp1 = in1 ^ in7; -        out3 = tmp0 ^ in5; -        tmp2 = out3 ^ in6; -        out4 = out3 ^ in2 ^ in7; -        out5 = tmp2 ^ in0; -        tmp3 = out5 ^ in3; -        out0 = tmp3 ^ out4; -        out2 = tmp1 ^ tmp2; -        out6 = tmp1 ^ tmp3; -        tmp4 = tmp0 ^ out2; -        out1 = tmp4 ^ out5; -        out7 = tmp4 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_64(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in2 ^ in3; -        out1 = in3 ^ in4; -        out7 = in1 ^ in2; -        tmp0 = in4 ^ in5; -        tmp1 = in0 ^ in7; -        out4 = in5 ^ in6 ^ in7; -        out2 = tmp0 ^ out0 ^ in0; -        out3 = tmp0 ^ out7 ^ in6; -        out5 = tmp1 ^ in6; -        out6 = tmp1 ^ in1; - -       
 out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_65(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in3; -        tmp1 = in4 ^ in5; -        tmp2 = in6 ^ in7; -        out7 = in1 ^ in2 ^ in7; -        out1 = in1 ^ in3 ^ in4; -        out0 = tmp0 ^ in2; -        out2 = tmp0 ^ tmp1; -        out4 = tmp1 ^ tmp2; -        tmp3 = tmp2 ^ in0; -        out3 = out4 ^ out7 ^ in3; -        out5 = tmp3 ^ in5; -        out6 = tmp3 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void 
gf8_muladd_66(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in2; -        tmp1 = in2 ^ in3; -        tmp2 = in0 ^ in4; -        out7 = tmp0 ^ in6; -        out0 = tmp1 ^ in7; -        out1 = tmp2 ^ in3; -        tmp3 = tmp2 ^ in6; -        tmp4 = out1 ^ in5; -        out5 = tmp3 ^ in7; -        out4 = tmp3 ^ tmp4; -        out2 = tmp0 ^ tmp4 ^ in7; -        out6 = tmp1 ^ out2 ^ in4; -        out3 = tmp3 ^ out6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_67(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        
uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in3; -        tmp1 = tmp0 ^ in1; -        tmp2 = tmp0 ^ in7; -        out1 = tmp1 ^ in4; -        out0 = tmp2 ^ in2; -        tmp3 = out1 ^ in7; -        out2 = tmp3 ^ in5; -        out3 = out2 ^ in0 ^ in6; -        out7 = tmp1 ^ out0 ^ in6; -        out5 = tmp1 ^ out3; -        out4 = tmp2 ^ out5; -        out6 = tmp3 ^ out4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_68(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in3 ^ in4; -        tmp1 = in2 ^ in3 ^ in5; -        tmp2 = tmp0 ^ in1; -        tmp3 = tmp0 ^ in6; -        out0 = tmp1 ^ in6; -        out6 = tmp2 ^ in0; -        out7 = tmp1 ^ tmp2; -        out1 = tmp3 ^ in7; -        out2 = out1 ^ in2; -        out4 = tmp2 
^ out2; -        out3 = out4 ^ out6 ^ in3; -        out5 = tmp3 ^ out3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_69(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in6 ^ in7; -        out2 = tmp0 ^ in3 ^ in4; -        out1 = out2 ^ in1; -        out3 = out2 ^ in0 ^ in2; -        out4 = out1 ^ in2 ^ in3; -        out6 = out1 ^ in0 ^ in7; -        out7 = out4 ^ in5 ^ in6; -        out5 = out4 ^ out6 ^ in5; -        out0 = tmp0 ^ out5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void 
gf8_muladd_6A(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in6; -        out3 = in0 ^ in4 ^ in6; -        tmp1 = tmp0 ^ in3; -        out4 = tmp1 ^ in1; -        tmp2 = tmp1 ^ in7; -        out2 = out4 ^ in4; -        out0 = tmp2 ^ in5; -        out5 = tmp2 ^ out3; -        out7 = out2 ^ in3 ^ in5; -        out1 = tmp0 ^ out5; -        out6 = tmp1 ^ out7 ^ in0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_6B(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -  
      uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in4 ^ in6; -        out2 = tmp0 ^ in1 ^ in3; -        out4 = out2 ^ in2; -        tmp1 = out2 ^ in0; -        out7 = out4 ^ in3 ^ in5 ^ in7; -        out1 = tmp1 ^ in7; -        out3 = tmp1 ^ in1; -        out6 = tmp1 ^ in5; -        out0 = tmp1 ^ out7 ^ in6; -        out5 = tmp0 ^ out0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_6C(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in1; -        tmp0 = in2 ^ in3; -        out5 = in0 ^ in2; -        out1 = in3 ^ in4 ^ in6; -        tmp1 = out5 ^ in1; -        out0 = tmp0 ^ in5; -        out6 = tmp0 ^ tmp1; -        out3 = tmp1 ^ in4; -        out7 = out3 ^ in0; -        out2 = out6 ^ out7 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ 
in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_6D(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in1 ^ in4; -        tmp0 = in0 ^ in2; -        tmp1 = out4 ^ in3; -        out7 = out4 ^ in2 ^ in7; -        out5 = tmp0 ^ in5; -        out3 = tmp0 ^ tmp1; -        out1 = tmp1 ^ in6; -        out0 = out5 ^ in3; -        out2 = out3 ^ out7 ^ in4; -        out6 = out1 ^ in0 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_6E(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i 
< width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in3; -        tmp1 = in0 ^ in4; -        out4 = tmp0 ^ in7; -        out6 = tmp0 ^ in0 ^ in5; -        out5 = tmp1 ^ in2; -        tmp2 = tmp1 ^ in3; -        out3 = tmp2 ^ out4; -        out1 = tmp2 ^ in6; -        out2 = tmp0 ^ out5; -        out0 = out2 ^ out3 ^ in5; -        out7 = out1 ^ out2 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_6F(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in3 ^ in7; -        tmp1 = tmp0 ^ 
in4; -        tmp2 = tmp0 ^ in0 ^ in2; -        out4 = tmp1 ^ in1; -        out0 = tmp2 ^ in5; -        out3 = out4 ^ in0; -        out2 = out3 ^ in7; -        out1 = out2 ^ in6; -        out6 = out1 ^ in4 ^ in5; -        out7 = tmp2 ^ out1; -        out5 = tmp1 ^ out0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_70(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in2; -        tmp0 = in2 ^ in4; -        out2 = in2 ^ in3 ^ in5; -        tmp1 = tmp0 ^ in6; -        tmp2 = out2 ^ in7; -        out0 = tmp1 ^ in3; -        out4 = tmp1 ^ in0; -        out7 = tmp2 ^ in1; -        out6 = out4 ^ in1; -        out5 = out7 ^ in0 ^ in2; -        out1 = tmp0 ^ tmp2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -       
 out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_71(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in3 ^ in5; -        out3 = in2 ^ in3; -        tmp0 = in0 ^ in2; -        tmp1 = out2 ^ in1; -        out4 = tmp0 ^ in6; -        tmp2 = tmp0 ^ in1; -        out7 = tmp1 ^ in2; -        out1 = tmp1 ^ in4 ^ in7; -        out0 = out4 ^ in3 ^ in4; -        out6 = tmp2 ^ in4; -        out5 = tmp2 ^ out3 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_72(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t 
tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in7; -        tmp0 = in0 ^ in4; -        tmp1 = tmp0 ^ in3 ^ in7; -        out1 = tmp1 ^ in5; -        out5 = out1 ^ in1; -        tmp2 = tmp0 ^ out5; -        out2 = tmp2 ^ in2; -        out7 = out2 ^ in6; -        out6 = tmp1 ^ out7; -        out4 = tmp2 ^ out6; -        out0 = out4 ^ in0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_73(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in3 ^ in7; -        out2 = out3 ^ in1 ^ in5; -        out1 = out2 ^ in0 ^ in4; -        out5 = out1 ^ in5; -        out6 = out1 ^ out3 ^ in2; -        out0 = out2 ^ out6 ^ in6; -        out7 = out0 
^ out1 ^ in3; -        out4 = out0 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_74(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in3 ^ in4; -        tmp1 = in1 ^ in2 ^ in6; -        out4 = in0 ^ in4 ^ in7; -        out5 = in0 ^ in1 ^ in5; -        out0 = tmp0 ^ in2; -        out1 = tmp0 ^ in5; -        out3 = tmp1 ^ in7; -        out6 = tmp1 ^ in0; -        out2 = tmp1 ^ out5 ^ in3; -        out7 = out3 ^ in3 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void 
gf8_muladd_75(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in0 ^ in7; -        tmp0 = in1 ^ in3; -        out5 = in0 ^ in1; -        out7 = tmp0 ^ in2; -        tmp1 = tmp0 ^ in4; -        out6 = out5 ^ in2; -        tmp2 = out7 ^ in6; -        out1 = tmp1 ^ in5; -        out0 = tmp1 ^ out6; -        out3 = tmp2 ^ in7; -        out2 = tmp2 ^ out6 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_76(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -    
    uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in1 ^ in6; -        tmp0 = in0 ^ in5; -        tmp1 = in3 ^ in7; -        tmp2 = tmp0 ^ in4; -        tmp3 = tmp1 ^ in2; -        out5 = tmp2 ^ in1; -        out1 = tmp2 ^ in3; -        out0 = tmp3 ^ in4; -        out4 = out1 ^ in5; -        out7 = tmp3 ^ out3; -        out2 = tmp0 ^ out7; -        out6 = tmp1 ^ out2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_77(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in0 ^ in3; -        tmp0 = in1 ^ in4; -        tmp1 = in1 ^ in6; -        tmp2 = out4 ^ in5; -        out5 = tmp0 ^ in0; -        out1 = tmp0 ^ tmp2; -        out3 = tmp1 ^ in3; -        out2 = tmp1 ^ tmp2 ^ in7; -        out7 = out3 ^ in2; -        tmp3 = out7 ^ in6; -        out6 = tmp2 ^ tmp3; -        out0 = tmp3 ^ out5 ^ in7; - -        out_ptr[0] = out0 
^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_78(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in3; -        tmp1 = in2 ^ in7; -        tmp2 = in0 ^ in5 ^ in6; -        out2 = tmp1 ^ in3; -        out3 = tmp2 ^ in2; -        out5 = out3 ^ in1 ^ in3; -        out0 = tmp0 ^ out3 ^ in4; -        out1 = tmp1 ^ out0; -        out4 = out1 ^ out5 ^ in5; -        out7 = tmp0 ^ out4; -        out6 = tmp2 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_79(uint8_t * out, uint8_t * in, unsigned 
int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in3 ^ in7; -        tmp0 = in3 ^ in4; -        tmp1 = in1 ^ in5; -        tmp2 = tmp1 ^ in2; -        out4 = tmp2 ^ in0 ^ in7; -        tmp3 = out4 ^ in5; -        out5 = tmp3 ^ out2 ^ in6; -        out7 = tmp0 ^ tmp2; -        out6 = tmp0 ^ tmp3; -        out3 = tmp1 ^ out5; -        out0 = out3 ^ in4; -        out1 = tmp3 ^ out0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_7A(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = 
out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in2; -        out2 = tmp0 ^ in3; -        tmp1 = out2 ^ in4; -        out4 = tmp1 ^ in0 ^ in5; -        out5 = out4 ^ in6; -        out6 = out5 ^ in7; -        out7 = out6 ^ in0; -        out0 = out7 ^ in1; -        out1 = tmp0 ^ out6; -        out3 = tmp1 ^ out6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_7B(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in1 ^ in3; -        tmp0 = in0 ^ in5; -        out4 = tmp0 ^ out2 ^ in2; -        tmp1 = out4 ^ in4; -        out6 = tmp1 ^ in7; -        out5 = tmp1 ^ in5 ^ in6; -        out0 = out6 ^ in1 ^ in6; -        tmp2 = out0 ^ in2; -        out1 = tmp2 ^ in1; -        out3 = tmp2 ^ in4; -        out7 = tmp0 ^ out5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 
^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_7C(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in3 ^ in5; -        tmp1 = tmp0 ^ in4; -        out0 = tmp1 ^ in2; -        out1 = tmp1 ^ in6; -        out7 = out0 ^ in1 ^ in5 ^ in7; -        out5 = out1 ^ out7 ^ in0; -        out3 = out5 ^ in6; -        out6 = tmp0 ^ out5; -        out2 = out6 ^ in1; -        out4 = out2 ^ out7 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_7D(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for 
(i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in2; -        tmp1 = tmp0 ^ in3; -        tmp2 = tmp0 ^ in6; -        out7 = tmp1 ^ in4; -        tmp3 = tmp2 ^ in0; -        out5 = tmp3 ^ in7; -        out4 = tmp3 ^ in2 ^ in5; -        out2 = tmp1 ^ out5; -        out6 = tmp2 ^ out2; -        out0 = out4 ^ out7 ^ in6; -        out1 = tmp3 ^ out0; -        out3 = out6 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_7E(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 
= in3 ^ in4; -        tmp1 = in0 ^ in5; -        out1 = tmp0 ^ tmp1 ^ in6; -        out3 = tmp1 ^ in1; -        out4 = out1 ^ in1 ^ in7; -        tmp2 = out4 ^ in3; -        out5 = tmp2 ^ in2; -        out6 = tmp0 ^ out5; -        out7 = tmp1 ^ out4 ^ in2; -        out2 = out6 ^ in5 ^ in7; -        out0 = tmp2 ^ out2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_7F(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in7; -        tmp1 = tmp0 ^ in3 ^ in5; -        tmp2 = tmp1 ^ in0; -        out0 = tmp2 ^ in4; -        out6 = tmp2 ^ in1; -        out3 = tmp0 ^ out6; -        tmp3 = out3 ^ in6; -        out1 = tmp3 ^ in4; -        out2 = tmp3 ^ in5; -        out4 = tmp3 ^ in7; -        out5 = tmp1 ^ out1; -        out7 = out0 ^ out4 ^ in3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 
3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_80(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in3; -        tmp1 = in4 ^ in5; -        out1 = in2 ^ in6 ^ in7; -        out5 = tmp0 ^ in4; -        tmp2 = tmp0 ^ in1; -        out6 = tmp1 ^ in3; -        out7 = tmp1 ^ in0 ^ in6; -        out4 = tmp2 ^ in7; -        out3 = tmp2 ^ out6; -        out2 = out3 ^ out5 ^ in6; -        out0 = out2 ^ in3 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_81(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < 
width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in4 ^ in6; -        tmp1 = tmp0 ^ in3; -        out6 = tmp1 ^ in5; -        out5 = out6 ^ in2 ^ in6; -        out3 = out5 ^ in1; -        out2 = tmp0 ^ out3; -        out1 = out3 ^ out6 ^ in7; -        out4 = tmp1 ^ out1; -        out7 = out2 ^ out4 ^ in0; -        out0 = out7 ^ in1 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_82(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in1 ^ in2; -        tmp0 = in6 ^ in7; -        out5 = in2 ^ in3; -        out6 
= in3 ^ in4; -        out7 = in0 ^ in4 ^ in5; -        out0 = in1 ^ in5 ^ in6; -        out1 = tmp0 ^ in0 ^ in2; -        out2 = tmp0 ^ in3 ^ in5; -        out3 = tmp0 ^ out0 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_83(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in1; -        tmp1 = in2 ^ in5; -        tmp2 = in3 ^ in6; -        out4 = in1 ^ in2 ^ in4; -        out0 = tmp0 ^ in5 ^ in6; -        out5 = tmp1 ^ in3; -        tmp3 = tmp1 ^ in7; -        out6 = tmp2 ^ in4; -        out2 = tmp2 ^ tmp3; -        tmp4 = tmp3 ^ out4; -        out1 = tmp3 ^ out0; -        out3 = tmp4 ^ in3; -        out7 = tmp0 ^ tmp4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        
out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_84(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out1 = in2 ^ in6; -        out6 = in3 ^ in5; -        out0 = in1 ^ in5 ^ in7; -        out7 = in0 ^ in4 ^ in6; -        out4 = in1 ^ in3 ^ in6; -        out5 = in2 ^ in4 ^ in7; -        out2 = out6 ^ in0 ^ in1; -        out3 = out5 ^ in5 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_85(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -      
  uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in6; -        tmp1 = in3 ^ in6; -        tmp2 = tmp0 ^ in4; -        out1 = tmp0 ^ in2; -        out6 = tmp1 ^ in5; -        out4 = tmp2 ^ in3; -        tmp3 = out1 ^ out6; -        out2 = tmp3 ^ in0; -        out3 = tmp2 ^ tmp3 ^ in7; -        out7 = out2 ^ out3 ^ in1; -        out5 = tmp1 ^ out3; -        out0 = tmp2 ^ out7 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_86(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out6 = in3; -        out7 = in0 ^ in4; -        out0 = in1 ^ in5; -        out5 = in2 ^ in7; -        out3 = in4 ^ in5 ^ in6; -        out1 = in0 ^ in2 ^ in6; -        out4 = in1 ^ in6 ^ in7; -        out2 = in0 ^ in3 ^ in5 ^ in7; - -        out_ptr[0] = out0 
^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_87(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out6 = in3 ^ in6; -        tmp0 = in0 ^ in1; -        out7 = in0 ^ in4 ^ in7; -        out5 = in2 ^ in5 ^ in7; -        out3 = out6 ^ in4 ^ in5; -        out0 = tmp0 ^ in5; -        tmp1 = tmp0 ^ in6; -        out2 = out5 ^ in0 ^ in3; -        out1 = tmp1 ^ in2; -        out4 = tmp1 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_88(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -   
 uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out1 = in2 ^ in7; -        tmp0 = in5 ^ in6; -        out0 = in1 ^ in6 ^ in7; -        out6 = in4 ^ in5 ^ in7; -        out3 = out0 ^ out1 ^ in0 ^ in4; -        out7 = tmp0 ^ in0; -        tmp1 = tmp0 ^ in3; -        out2 = out0 ^ in3; -        out4 = tmp1 ^ in2; -        out5 = tmp1 ^ out6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_89(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = 
out_ptr[width * 7]; - -        tmp0 = in0 ^ in7; -        tmp1 = in2 ^ in7; -        tmp2 = tmp0 ^ in6; -        out1 = tmp1 ^ in1; -        out7 = tmp2 ^ in5; -        out0 = tmp2 ^ in1; -        out2 = out1 ^ in3 ^ in6; -        out6 = out7 ^ in0 ^ in4; -        out5 = out6 ^ in3; -        out3 = tmp0 ^ out2 ^ in4; -        out4 = tmp1 ^ out5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_8A(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in1 ^ in6; -        out7 = in0 ^ in5; -        out2 = in3 ^ in6; -        out6 = in4 ^ in7; -        out1 = in0 ^ in2 ^ in7; -        out3 = out0 ^ out6 ^ in0; -        out4 = out1 ^ out7 ^ in6; -        out5 = out2 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ 
in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_8B(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in1; -        tmp1 = in3 ^ in6; -        tmp2 = in5 ^ in7; -        tmp3 = tmp0 ^ in7; -        out0 = tmp0 ^ in6; -        out2 = tmp1 ^ in2; -        out5 = tmp1 ^ tmp2; -        out7 = tmp2 ^ in0; -        tmp4 = tmp3 ^ in4; -        out1 = tmp3 ^ in2; -        out6 = tmp4 ^ out0; -        out4 = out6 ^ in2 ^ in5; -        out3 = tmp1 ^ tmp4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_8C(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, 
out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out1 = in2; -        out0 = in1 ^ in7; -        out7 = in0 ^ in6; -        out5 = in4 ^ in6; -        out6 = in5 ^ in7; -        out2 = out0 ^ in0 ^ in3; -        out3 = out5 ^ out7 ^ in2 ^ in7; -        out4 = out6 ^ in3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_8D(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out1 = in1 ^ in2; -        tmp0 = in6 ^ in7; -        out0 = in0 ^ in1 ^ in7; -        out5 = in4 ^ in5 ^ in6; -        out6 = tmp0 ^ in5; -        out7 = tmp0 ^ in0; -        out4 = tmp0 ^ out5 ^ in3; -        out2 = out0 ^ in2 ^ in3; -        out3 = out2 ^ in1 
^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_8E(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in1; -        out4 = in5; -        out7 = in0; -        out5 = in6; -        out6 = in7; -        out3 = in0 ^ in4; -        out1 = in0 ^ in2; -        out2 = in0 ^ in3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_8F(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < 
width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in0 ^ in1; -        tmp0 = in0 ^ in3; -        out4 = in4 ^ in5; -        out7 = in0 ^ in7; -        out5 = in5 ^ in6; -        out6 = in6 ^ in7; -        out1 = out0 ^ in2; -        out2 = tmp0 ^ in2; -        out3 = tmp0 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_90(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in2; -        tmp1 = in2 ^ in6 ^ in7; -        out3 = tmp0 ^ in7; -        out1 = tmp1 ^ in5; -        tmp2 = out1 ^ in4; -     
   out6 = tmp2 ^ in3; -        out5 = out6 ^ in1; -        out4 = out5 ^ in0; -        out0 = tmp0 ^ tmp2; -        out7 = tmp0 ^ out4; -        out2 = tmp1 ^ out5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_91(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in4; -        tmp1 = tmp0 ^ in3 ^ in5; -        out2 = tmp1 ^ in1; -        out6 = tmp1 ^ in7; -        tmp2 = out2 ^ in5 ^ in7; -        out3 = tmp2 ^ in4; -        out5 = tmp2 ^ in6; -        out1 = tmp1 ^ out5 ^ in2; -        tmp3 = out1 ^ in0; -        out4 = tmp3 ^ in3; -        out0 = tmp0 ^ tmp3; -        out7 = tmp2 ^ tmp3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -       
 out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_92(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in1; -        tmp0 = in4 ^ in5; -        tmp1 = tmp0 ^ in1; -        out2 = tmp0 ^ in3 ^ in7; -        out0 = tmp1 ^ in6; -        out7 = out2 ^ in0; -        out4 = out0 ^ in0 ^ in2; -        out5 = out4 ^ out7 ^ in5; -        out6 = tmp1 ^ out5; -        out1 = out6 ^ out7 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_93(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = 
out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in1 ^ in3; -        tmp0 = in2 ^ in7; -        tmp1 = out3 ^ in6; -        tmp2 = tmp0 ^ in4; -        out5 = tmp0 ^ tmp1; -        out6 = tmp2 ^ in3; -        out2 = out6 ^ in5; -        out0 = out2 ^ out5 ^ in0; -        out7 = tmp1 ^ out0; -        out1 = tmp2 ^ out0; -        out4 = out1 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_94(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in2 ^ in6; -        tmp0 = in1 ^ in4 ^ in5; -        out1 = out3 ^ in5; -        out5 = tmp0 ^ out3; -        out0 = tmp0 ^ in7; -        out4 = tmp0 ^ in0 ^ in3; -        out6 = out1 ^ in3 ^ in7; -        out2 = out4 ^ in6; -        out7 = out0 
^ out2 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_95(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in3; -        out3 = tmp0 ^ in6; -        tmp1 = tmp0 ^ in7; -        tmp2 = out3 ^ in0; -        out6 = tmp1 ^ in5; -        tmp3 = tmp2 ^ in4; -        out7 = tmp3 ^ in2; -        tmp4 = tmp3 ^ in5; -        out2 = tmp4 ^ in1; -        tmp5 = out2 ^ in6; -        out0 = tmp1 ^ tmp5; -        out1 = tmp5 ^ out7; -        out4 = tmp2 ^ out1; -        out5 = tmp4 ^ out4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 
7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_96(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in6 ^ in7; -        tmp0 = in1 ^ in5; -        tmp1 = in5 ^ in6; -        out6 = out3 ^ in2 ^ in3; -        out0 = tmp0 ^ in4; -        tmp2 = tmp1 ^ in2; -        out4 = out0 ^ in0 ^ in7; -        out1 = tmp2 ^ in0; -        out5 = tmp2 ^ in1; -        out7 = tmp0 ^ out4 ^ in3; -        out2 = tmp1 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_97(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        
uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in4; -        tmp1 = in2 ^ in6; -        out3 = in3 ^ in6 ^ in7; -        out7 = tmp0 ^ in3; -        tmp2 = tmp0 ^ in5; -        out5 = tmp1 ^ in1; -        out6 = tmp1 ^ out3; -        out0 = tmp2 ^ in1; -        out2 = tmp2 ^ out3 ^ in2; -        tmp3 = out0 ^ in4; -        out4 = tmp3 ^ in7; -        out1 = tmp1 ^ tmp3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_98(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in5 ^ in7; -        tmp1 = in1 ^ in4 ^ in7; -        out1 = tmp0 ^ in2; -        out0 = tmp1 ^ in6; -        out2 = tmp1 ^ in3; -        out6 = out0 ^ out1 ^ in1; -        out5 = tmp0 ^ out2; -        out3 = tmp1 ^ out6 ^ in0; -        out7 = out0 ^ out5 ^ in0; -        out4 = 
out6 ^ out7 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_99(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in3; -        out5 = in1 ^ in3 ^ in4; -        out6 = in2 ^ in4 ^ in5; -        out4 = tmp0 ^ in2; -        tmp1 = tmp0 ^ in6; -        tmp2 = out5 ^ in7; -        out7 = tmp1 ^ in5; -        out0 = tmp1 ^ tmp2; -        out2 = tmp2 ^ in2; -        out3 = out0 ^ out6 ^ in3; -        out1 = tmp1 ^ out3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void 
gf8_muladd_9A(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in3 ^ in4; -        tmp0 = in0 ^ in5; -        tmp1 = in1 ^ in6; -        out5 = in1 ^ in3 ^ in5; -        tmp2 = tmp0 ^ in7; -        out3 = tmp0 ^ tmp1; -        out0 = tmp1 ^ in4; -        out7 = tmp2 ^ in3; -        out1 = tmp2 ^ in2; -        out6 = out0 ^ in1 ^ in2; -        out4 = out1 ^ in4 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_9B(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        
uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out5 = in1 ^ in3; -        tmp0 = in3 ^ in5; -        out6 = in2 ^ in4; -        out4 = in0 ^ in2 ^ in7; -        out7 = tmp0 ^ in0; -        out2 = out6 ^ in3; -        out1 = out4 ^ in1 ^ in5; -        out3 = out7 ^ in1 ^ in6; -        out0 = tmp0 ^ out3 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_9C(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out1 = in2 ^ in5; -        tmp0 = in0 ^ in3 ^ in6; -        out3 = out1 ^ in0; -        out6 = out1 ^ in6; -        out7 = tmp0 ^ in7; -        out4 = out7 ^ in4; -        out2 = out4 ^ in1; -        out0 = tmp0 ^ out2; -        out5 = out0 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -   
     out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_9D(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out6 = in2 ^ in5; -        tmp0 = in0 ^ in3; -        out5 = in1 ^ in4 ^ in7; -        out1 = out6 ^ in1; -        out3 = tmp0 ^ out6; -        out7 = tmp0 ^ in6; -        out0 = out5 ^ in0; -        out4 = out7 ^ in7; -        out2 = out5 ^ out7 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_9E(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -      
  uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in1 ^ in4; -        tmp0 = in0 ^ in5; -        out6 = in2 ^ in6; -        out7 = in0 ^ in3 ^ in7; -        out4 = in0 ^ in4 ^ in6; -        out5 = in1 ^ in5 ^ in7; -        out1 = tmp0 ^ in2; -        out3 = tmp0 ^ in7; -        out2 = out4 ^ in3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_9F(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out6 = in2; -        out7 = in0 ^ in3; -        tmp0 = in0 ^ in1; -        out4 = in0 ^ in6; -        out5 = in1 ^ in7; -        out1 = tmp0 ^ in2 ^ in5; -        out2 = out7 ^ in2 ^ in4 ^ in6; -        out3 = out7 ^ in5 ^ in7; -        out0 = tmp0 ^ 
in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_A0(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in6; -        out2 = tmp0 ^ in7; -        tmp1 = tmp0 ^ in5; -        out6 = out2 ^ in3 ^ in4; -        out0 = tmp1 ^ in3; -        tmp2 = out0 ^ in2; -        out3 = tmp2 ^ in7; -        tmp3 = tmp2 ^ in1; -        out5 = tmp3 ^ in0; -        out4 = tmp3 ^ out6; -        out7 = out5 ^ out6 ^ in1; -        out1 = tmp1 ^ out4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static 
void gf8_muladd_A1(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in5; -        tmp1 = tmp0 ^ in1; -        tmp2 = tmp0 ^ in4; -        out4 = tmp1 ^ in7; -        out7 = tmp2 ^ in0; -        out6 = tmp2 ^ out4 ^ in3; -        out3 = out4 ^ in6; -        out2 = out3 ^ in5; -        out1 = out2 ^ in4; -        out5 = out1 ^ out6 ^ in0; -        out0 = tmp1 ^ out5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_A2(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        
uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in6; -        tmp0 = in1 ^ in3 ^ in5; -        out3 = tmp0 ^ in6; -        out4 = tmp0 ^ in2 ^ in4; -        out0 = out3 ^ in7; -        out6 = out0 ^ in4; -        out1 = out0 ^ out4 ^ in0; -        out7 = out1 ^ in5; -        out5 = out7 ^ in3 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_A3(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in2 ^ in6; -        out3 = in1 ^ in5 ^ in6; -        tmp0 = out2 ^ in0; -        out4 = out2 ^ out3 ^ in3; -        tmp1 = tmp0 ^ in4; -        out0 = tmp0 ^ out4 ^ in7; -        out5 = tmp1 ^ in3; -        out7 = tmp1 ^ in5; -        out1 = tmp1 ^ in1 ^ in7; -        out6 = tmp1 ^ out0 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -   
     out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_A4(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in3; -        tmp1 = in2 ^ in4; -        tmp2 = in2 ^ in5; -        tmp3 = in0 ^ in7; -        out0 = tmp0 ^ in5; -        out6 = tmp0 ^ in6 ^ in7; -        out1 = tmp1 ^ in6; -        out7 = tmp1 ^ tmp3; -        out3 = tmp2 ^ in3; -        tmp4 = tmp2 ^ out1; -        out2 = tmp3 ^ in1; -        out5 = tmp4 ^ out7; -        out4 = tmp4 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_A5(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t 
*)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in2 ^ in5; -        tmp0 = in1 ^ in6; -        tmp1 = in0 ^ in1; -        tmp2 = in2 ^ in4; -        out6 = in1 ^ in3 ^ in7; -        out4 = tmp0 ^ in5; -        out1 = tmp0 ^ tmp2; -        out0 = tmp1 ^ in3 ^ in5; -        out2 = tmp1 ^ in2 ^ in7; -        out7 = tmp2 ^ in0; -        out5 = tmp0 ^ out2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_A6(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in0; -   
     out3 = in3 ^ in5 ^ in7; -        out1 = in0 ^ in2 ^ in4 ^ in6; -        out0 = out3 ^ in1; -        out7 = out1 ^ in7; -        out6 = out0 ^ in6; -        out5 = out7 ^ in5; -        out4 = out6 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_A7(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in0 ^ in2; -        out3 = in5 ^ in7; -        out7 = out2 ^ in4 ^ in6; -        out6 = out3 ^ in1 ^ in3; -        out1 = out7 ^ in1; -        out5 = out7 ^ in7; -        out0 = out6 ^ in0; -        out4 = out6 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        
in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_A8(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in4; -        tmp1 = in1 ^ in6; -        tmp2 = in0 ^ in2 ^ in7; -        out1 = tmp0 ^ in7; -        out4 = tmp0 ^ in6; -        out0 = tmp1 ^ in3; -        out2 = tmp1 ^ in5; -        out6 = tmp1 ^ in4; -        out7 = tmp2 ^ in5; -        out3 = tmp2 ^ out0 ^ in6; -        out5 = out7 ^ in2 ^ in3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_A9(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = 
out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in2 ^ in6; -        out6 = in1 ^ in4; -        out7 = in0 ^ in2 ^ in5; -        out5 = in0 ^ in3 ^ in7; -        out2 = out4 ^ in1 ^ in5; -        out1 = out6 ^ in2 ^ in7; -        out0 = out2 ^ out7 ^ in3; -        out3 = out1 ^ in0 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_AA(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in2; -        tmp1 = in1 ^ in3; -        tmp2 = in6 ^ in7; -        out1 = tmp0 ^ in4 ^ in7; -        out3 = tmp1 ^ in0; -        out0 = tmp1 ^ tmp2; -        out2 = tmp2 ^ in5; -        out7 = tmp0 ^ out2; -        out6 = out1 ^ out7 ^ in1; -        out5 = out0 ^ out6 ^ in0; -        out4 = out5 ^ out7 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        
out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_AB(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in0 ^ in1; -        tmp0 = in1 ^ in4; -        tmp1 = in0 ^ in7; -        out6 = tmp0 ^ in5; -        out1 = tmp0 ^ tmp1 ^ in2; -        out5 = tmp1 ^ in3 ^ in4; -        out0 = tmp0 ^ out5 ^ in6; -        out4 = out0 ^ out3 ^ in2; -        out2 = out4 ^ in3 ^ in5; -        out7 = tmp1 ^ out2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_AC(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * 
out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in1 ^ in3; -        out1 = in2 ^ in4; -        tmp0 = in0 ^ in2; -        out4 = in4 ^ in7; -        out5 = in0 ^ in5; -        out6 = in1 ^ in6; -        out7 = tmp0 ^ in7; -        out3 = tmp0 ^ in3 ^ in6; -        out2 = out5 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_AD(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in7; -        out5 = in0; -        out6 = in1; -        out7 = in0 ^ in2; -        out0 = in0 ^ in1 ^ in3; -      
  out2 = out7 ^ in1 ^ in5; -        out1 = in1 ^ in2 ^ in4; -        out3 = out7 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_AE(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in3 ^ in4; -        tmp0 = in0 ^ in4; -        tmp1 = in0 ^ in7; -        out0 = in1 ^ in3 ^ in7; -        out1 = tmp0 ^ in2; -        out5 = tmp0 ^ in5; -        tmp2 = tmp1 ^ in6; -        out2 = tmp1 ^ in5; -        out3 = tmp2 ^ in3; -        out7 = tmp2 ^ in2; -        out6 = tmp2 ^ out2 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        
in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_AF(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in3; -        tmp0 = in0 ^ in7; -        out5 = in0 ^ in4; -        out6 = in1 ^ in5; -        out7 = in0 ^ in2 ^ in6; -        out0 = tmp0 ^ in1 ^ in3; -        out3 = tmp0 ^ in6; -        out2 = tmp0 ^ in2 ^ in5; -        out1 = out5 ^ in1 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_B0(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        
uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in4; -        tmp1 = in3 ^ in6; -        out2 = tmp0 ^ in7; -        tmp2 = tmp0 ^ tmp1; -        out0 = tmp2 ^ in5; -        out3 = tmp2 ^ in2; -        out6 = out3 ^ in6; -        tmp3 = out6 ^ in0 ^ in1; -        out7 = tmp3 ^ in5; -        out5 = tmp3 ^ out2; -        out1 = out0 ^ out5 ^ in0; -        out4 = tmp1 ^ out5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_B1(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in4; -        out2 = tmp0 ^ in2 ^ in7; -        tmp1 = out2 ^ in6; -        out1 = tmp1 ^ in5; -        out3 = tmp1 ^ in7; -        out4 = tmp1 ^ in0; -        out6 = out3 ^ in3; -        out0 = out6 ^ in0 ^ in2 ^ in5; -        out5 = tmp1 ^ out0 ^ in1; -        out7 = tmp0 ^ out5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ 
in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_B2(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in4; -        tmp0 = in4 ^ in7; -        tmp1 = in1 ^ in3 ^ in6; -        out3 = tmp0 ^ tmp1; -        tmp2 = tmp1 ^ in0; -        out0 = out3 ^ in5; -        out4 = tmp2 ^ in2; -        tmp3 = out4 ^ in6; -        out5 = tmp0 ^ tmp3; -        out1 = tmp3 ^ out0; -        tmp4 = out1 ^ in7; -        out7 = tmp4 ^ in3; -        out6 = tmp2 ^ tmp4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_B3(uint8_t * out, uint8_t * in, unsigned 
int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in2 ^ in4; -        tmp0 = in0 ^ in5; -        tmp1 = in1 ^ in6; -        out3 = tmp1 ^ in4 ^ in7; -        tmp2 = tmp0 ^ out3; -        out0 = tmp2 ^ in3; -        out1 = tmp2 ^ in2; -        out5 = out0 ^ in2 ^ in6; -        out7 = tmp1 ^ out5; -        out4 = out7 ^ in1 ^ in5 ^ in7; -        out6 = tmp0 ^ out4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_B4(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        
uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in0 ^ in1; -        out5 = out4 ^ in2; -        tmp0 = out4 ^ in4; -        out6 = out5 ^ in0 ^ in3; -        out7 = tmp0 ^ out6; -        out2 = tmp0 ^ in6 ^ in7; -        out3 = out7 ^ in0 ^ in7; -        out0 = out5 ^ out7 ^ in5; -        out1 = out0 ^ out6 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_B5(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in1; -        tmp1 = in2 ^ in4; -        out4 = tmp0 ^ in4; -        out3 = tmp1 ^ in7; -        tmp2 = out4 ^ in5; -        out7 = out3 ^ in0 ^ in3; -        out0 = tmp2 ^ in3; -        out2 = tmp0 ^ out3 ^ in6; -        out5 = tmp1 ^ tmp2; -        out6 = out2 ^ out7 ^ in2; -        out1 = tmp0 ^ out0 ^ out6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -  
      out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_B6(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in3 ^ in4; -        tmp0 = in1 ^ in2; -        tmp1 = in0 ^ in4; -        tmp2 = in3 ^ in5; -        tmp3 = out3 ^ in1 ^ in7; -        out5 = tmp0 ^ tmp1; -        out6 = tmp0 ^ tmp2; -        out2 = tmp1 ^ in6; -        out4 = tmp1 ^ tmp3; -        out0 = tmp3 ^ in5; -        out1 = out2 ^ in2 ^ in5; -        out7 = tmp2 ^ out1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_B7(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr 
= (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in4; -        tmp0 = in0 ^ in4; -        out2 = tmp0 ^ in2 ^ in6; -        tmp1 = out2 ^ in7; -        out1 = out2 ^ in1 ^ in5; -        out7 = tmp1 ^ in3; -        out5 = out1 ^ in6; -        out6 = tmp0 ^ out1 ^ in3; -        out0 = tmp1 ^ out6; -        out4 = out0 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_B8(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in4; -        tmp1 = in2 ^ in5; - 
       out2 = tmp0 ^ in5; -        out4 = tmp1 ^ in0; -        tmp2 = tmp1 ^ in7; -        out6 = tmp2 ^ out2; -        out7 = out4 ^ in3; -        out1 = tmp2 ^ in4; -        out3 = tmp0 ^ out7; -        out0 = out3 ^ out4 ^ in6; -        out5 = out0 ^ in0 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_B9(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in2; -        tmp1 = in4 ^ in5; -        out4 = tmp0 ^ tmp1; -        tmp2 = tmp0 ^ in3 ^ in7; -        out3 = out4 ^ in1; -        out7 = tmp2 ^ in5; -        out2 = out3 ^ in0; -        out1 = out2 ^ in7; -        out6 = out1 ^ in5 ^ in6; -        out0 = tmp2 ^ out6; -        out5 = tmp1 ^ out0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -    
    out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_BA(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in5 ^ in7; -        out2 = tmp0 ^ in4; -        tmp1 = out2 ^ in2; -        out1 = tmp1 ^ in0; -        out6 = tmp1 ^ in1; -        out4 = out1 ^ in3 ^ in4; -        tmp2 = out4 ^ out6; -        out7 = out4 ^ in6 ^ in7; -        out5 = tmp2 ^ in6; -        out3 = tmp0 ^ tmp2; -        out0 = out6 ^ out7 ^ in0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_BB(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        
uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in2 ^ in4 ^ in5 ^ in7; -        tmp0 = out2 ^ in1; -        out4 = out2 ^ in0 ^ in3; -        out1 = tmp0 ^ in0; -        out6 = tmp0 ^ in6; -        out3 = out1 ^ in2; -        tmp1 = out4 ^ out6 ^ in4; -        out0 = tmp1 ^ in7; -        out5 = tmp1 ^ in5; -        out7 = tmp0 ^ tmp1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_BC(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in2; -        tmp1 = in2 ^ in4; -        out0 = in1 ^ in3 ^ in4; -        out6 = in1 ^ in2 ^ in7; -        out7 = tmp0 ^ in3; -        out5 = tmp0 ^ out6 ^ 
in6; -        out1 = tmp1 ^ in5; -        tmp2 = out1 ^ out5 ^ in1; -        out3 = tmp2 ^ in3; -        out4 = tmp1 ^ tmp2; -        out2 = tmp2 ^ out6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_BD(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in3; -        tmp1 = in1 ^ in4; -        out0 = tmp0 ^ tmp1; -        out7 = tmp0 ^ in2 ^ in7; -        out1 = tmp1 ^ in2 ^ in5; -        tmp2 = out1 ^ in0; -        out2 = tmp2 ^ in6; -        out3 = out2 ^ in1 ^ in7; -        out4 = out3 ^ in2; -        out5 = tmp1 ^ out4; -        out6 = tmp2 ^ out4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 
6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_BE(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in3 ^ in6; -        out4 = tmp0 ^ in5; -        out7 = tmp0 ^ in2; -        out3 = out4 ^ in4; -        out1 = out3 ^ out7 ^ in0; -        out2 = out3 ^ in3 ^ in7; -        out0 = out2 ^ out4 ^ in1; -        out5 = tmp0 ^ out0; -        out6 = out1 ^ out5 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_BF(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -      
  uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in4; -        out3 = tmp0 ^ in5 ^ in6; -        out4 = out3 ^ in3; -        tmp1 = out3 ^ in7; -        out2 = tmp1 ^ in2; -        out5 = tmp1 ^ in1; -        tmp2 = out2 ^ in5; -        out7 = tmp2 ^ in3 ^ in4; -        tmp3 = tmp0 ^ out5; -        out0 = tmp3 ^ out4; -        out1 = tmp2 ^ tmp3; -        out6 = tmp3 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_C0(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out5 = in2 ^ in5; -        tmp0 = in1 ^ in4; -        tmp1 = in3 ^ in6; -        out0 = out5 ^ in1; -        out4 = tmp0 ^ in7; -        out3 = tmp0 ^ tmp1; -        out1 = tmp1 ^ in2; -        out6 = tmp1 ^ in0; -        out7 = out4 ^ in0; -        out2 = out4 ^ out5 ^ in3; - -  
      out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_C1(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out5 = in2; -        tmp0 = in0 ^ in1; -        out4 = in1 ^ in7; -        out6 = in0 ^ in3; -        out3 = in1 ^ in4 ^ in6; -        tmp1 = tmp0 ^ in2; -        out7 = tmp0 ^ in4; -        out0 = tmp1 ^ in5; -        out1 = tmp1 ^ out6 ^ in6; -        out2 = out6 ^ out7 ^ in5 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_C2(uint8_t * out, uint8_t * in, unsigned int width) -{ -    
unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in1 ^ in3 ^ in4; -        tmp0 = in0 ^ in3 ^ in6; -        out5 = in2 ^ in4 ^ in5; -        tmp1 = out4 ^ in7; -        out1 = tmp0 ^ in2; -        out6 = tmp0 ^ in5; -        out2 = out5 ^ in3; -        out7 = tmp0 ^ tmp1; -        out3 = tmp1 ^ in2 ^ in6; -        out0 = tmp1 ^ out2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_C3(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        
uint64_t in7 = out_ptr[width * 7]; - -        out4 = in1 ^ in3; -        tmp0 = in0 ^ in2; -        tmp1 = in3 ^ in5; -        out5 = in2 ^ in4; -        tmp2 = tmp0 ^ out4; -        out2 = tmp1 ^ in4; -        out6 = tmp1 ^ in0; -        out0 = tmp1 ^ tmp2 ^ in7; -        out1 = tmp2 ^ in6; -        out7 = out1 ^ out5 ^ in3; -        out3 = tmp0 ^ out7 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_C4(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in3 ^ in7; -        out3 = tmp0 ^ in4; -        tmp1 = tmp0 ^ in2; -        out1 = tmp1 ^ in6; -        out5 = tmp1 ^ in5; -        out4 = out1 ^ out3 ^ in1; -        out0 = out4 ^ in4 ^ in5; -        out2 = out0 ^ out3 ^ in0; -        out7 = out1 ^ out2 ^ in7; -        out6 = tmp1 ^ out0 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = 
out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_C5(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in4 ^ in7; -        tmp0 = in3 ^ in7; -        out4 = in1 ^ in2 ^ in6; -        out6 = in0 ^ in3 ^ in4; -        out5 = tmp0 ^ in2; -        out1 = tmp0 ^ out4; -        out0 = out4 ^ in0 ^ in5; -        out2 = out0 ^ out5 ^ in4; -        out7 = tmp0 ^ out2 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_C6(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, 
out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in5 ^ in6; -        tmp1 = in1 ^ in7; -        tmp2 = tmp0 ^ in0; -        tmp3 = tmp0 ^ tmp1; -        tmp4 = tmp2 ^ in4; -        out0 = tmp3 ^ in2; -        out6 = tmp4 ^ in3; -        out2 = out6 ^ in2; -        out7 = tmp1 ^ tmp4; -        out3 = tmp2 ^ out2; -        tmp5 = out3 ^ in5; -        out5 = tmp5 ^ in7; -        out4 = tmp3 ^ tmp5; -        out1 = tmp4 ^ out5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_C7(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in2 ^ in4; -        tmp0 = 
in3 ^ in5; -        tmp1 = out3 ^ in7; -        out6 = tmp0 ^ in0 ^ in4; -        out5 = tmp1 ^ in3; -        out2 = out6 ^ in6; -        out7 = out2 ^ in1 ^ in3; -        out0 = tmp1 ^ out7; -        out1 = tmp0 ^ out0; -        out4 = out1 ^ in0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_C8(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out0 = in1 ^ in2; -        out1 = in2 ^ in3; -        tmp0 = in5 ^ in6; -        tmp1 = in0 ^ in7; -        out2 = out1 ^ in1 ^ in4; -        out4 = tmp0 ^ in4; -        out5 = tmp0 ^ in7; -        out6 = tmp1 ^ in6; -        out7 = tmp1 ^ in1; -        out3 = out2 ^ in0 ^ in2 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 
5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_C9(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in5 ^ in6; -        out7 = in0 ^ in1; -        tmp0 = in1 ^ in3; -        out5 = in6 ^ in7; -        out6 = in0 ^ in7; -        out0 = out7 ^ in2; -        out3 = out7 ^ in4 ^ in5; -        out1 = tmp0 ^ in2; -        out2 = tmp0 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_CA(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = 
out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in7; -        tmp1 = in2 ^ in7; -        tmp2 = tmp0 ^ in6; -        out0 = tmp1 ^ in1; -        tmp3 = tmp1 ^ in3; -        out6 = tmp2 ^ in5; -        out7 = tmp2 ^ in1; -        out2 = tmp3 ^ in4; -        out5 = out6 ^ in0 ^ in4; -        out4 = out5 ^ in3; -        out1 = tmp0 ^ tmp3; -        out3 = tmp3 ^ out5 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_CB(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in4 ^ in7; -        tmp1 = in5 ^ in7; -        out7 = in0 ^ in1 ^ in6; -        out5 = tmp0 ^ in6; -        out2 = tmp0 ^ in3; -        out6 = tmp1 ^ in0; -        out4 = tmp1 ^ in3 ^ in6; -        tmp2 = out5 ^ out7 ^ in2; -        out1 = 
tmp2 ^ out2; -        out0 = tmp2 ^ in4; -        out3 = tmp2 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_CC(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in3 ^ in5; -        tmp1 = in1 ^ in6; -        out1 = in2 ^ in3 ^ in7; -        out5 = tmp0 ^ in6; -        out0 = tmp1 ^ in2; -        tmp2 = out5 ^ in0 ^ in7; -        out3 = tmp2 ^ in4; -        out6 = tmp0 ^ out3; -        out7 = tmp1 ^ tmp2 ^ in3; -        tmp3 = out1 ^ out6; -        out4 = tmp2 ^ tmp3; -        out2 = tmp3 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ 
in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_CD(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out5 = in3 ^ in6; -        tmp0 = in0 ^ in1; -        tmp1 = in2 ^ in7; -        out6 = in0 ^ in4 ^ in7; -        out2 = tmp0 ^ out5 ^ in4; -        out7 = tmp0 ^ in5; -        out0 = tmp0 ^ in2 ^ in6; -        out4 = tmp1 ^ in5; -        out1 = tmp1 ^ in1 ^ in3; -        out3 = out6 ^ in5 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_CE(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = 
out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in5; -        tmp1 = tmp0 ^ in3; -        out4 = tmp1 ^ in4; -        tmp2 = out4 ^ in6; -        out3 = tmp2 ^ in0; -        out5 = tmp2 ^ in2; -        out2 = out3 ^ in5 ^ in7; -        out6 = tmp1 ^ out2; -        out7 = out2 ^ out4 ^ in1; -        out1 = tmp2 ^ out6; -        out0 = tmp0 ^ out7 ^ in0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_CF(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in3 ^ in6; -        tmp1 = in0 ^ in1 ^ in5; -        out4 = in2 ^ in3 ^ in5; -        out5 = tmp0 ^ in4; -        out7 = tmp1 ^ in6; -        out1 = tmp1 ^ out4 ^ in7; -        tmp2 = out5 ^ in0; -        out2 = tmp2 ^ in7; -        out3 = tmp2 ^ out4; -        out6 = tmp0 ^ out2 ^ in5; -        out0 = 
tmp0 ^ out1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_D0(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in3; -        tmp1 = in1 ^ in4; -        tmp2 = in2 ^ in5; -        out7 = tmp0 ^ tmp1; -        out0 = tmp1 ^ tmp2; -        tmp3 = tmp2 ^ in3; -        out1 = tmp3 ^ in6; -        tmp4 = out1 ^ in1; -        out2 = tmp4 ^ in7; -        out3 = out2 ^ in2; -        out4 = tmp0 ^ out3; -        out5 = tmp3 ^ out3; -        out6 = tmp4 ^ out4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        
out_ptr++; -    } -} - -static void gf8_muladd_D1(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in3 ^ in5 ^ in6; -        tmp1 = tmp0 ^ in1; -        out1 = tmp1 ^ in2; -        out2 = tmp1 ^ in7; -        out3 = out2 ^ in3; -        out5 = out3 ^ in2; -        tmp2 = out3 ^ in0; -        out4 = tmp2 ^ in4; -        out7 = tmp0 ^ out4; -        out6 = tmp2 ^ out1 ^ in6; -        out0 = out2 ^ out6 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_D2(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        
uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in5 ^ in6; -        out2 = tmp0 ^ in2 ^ in3; -        out1 = out2 ^ in0; -        out3 = out2 ^ in1; -        out4 = out1 ^ in1 ^ in2; -        out6 = out1 ^ in6 ^ in7; -        out7 = out4 ^ in4 ^ in5; -        out5 = out4 ^ out6 ^ in4; -        out0 = tmp0 ^ out5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_D3(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in3 ^ in5 ^ in6; -        tmp0 = out2 ^ in2; -        tmp1 = tmp0 ^ in1; -        out1 = tmp1 ^ in0; -        out3 = tmp1 ^ in3; -        out4 = out1 ^ in2 ^ in4; -        tmp2 = out4 ^ in5; -        out7 = tmp2 ^ in7; -        out0 = tmp0 ^ out7; -        tmp3 = out0 ^ in0; -        out5 = tmp3 ^ in6; -        out6 = tmp2 ^ tmp3; - -        out_ptr[0] = out0 ^ in_ptr[0]; 
-        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_D4(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in3 ^ in5; -        tmp0 = in1 ^ in5; -        tmp1 = tmp0 ^ in2; -        out4 = tmp1 ^ in0; -        tmp2 = tmp1 ^ in6; -        out2 = out4 ^ in3 ^ in7; -        out0 = tmp2 ^ in4; -        out5 = tmp2 ^ out3; -        out1 = tmp0 ^ out5 ^ in7; -        out6 = tmp0 ^ out2 ^ in4; -        out7 = tmp1 ^ out6 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_D5(uint8_t * out, uint8_t * in, unsigned int width) 
-{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in5; -        tmp0 = in0 ^ in4; -        tmp1 = tmp0 ^ in1 ^ in5; -        out4 = tmp1 ^ in2; -        out0 = out4 ^ in6; -        tmp2 = tmp0 ^ out0; -        out5 = tmp2 ^ in3; -        out1 = out5 ^ in7; -        out6 = tmp1 ^ out1; -        out7 = tmp2 ^ out6; -        out2 = out7 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_D6(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 
6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in2 ^ in4 ^ in6; -        out5 = tmp0 ^ in3; -        out0 = tmp0 ^ in5 ^ in7; -        out3 = out0 ^ out5 ^ in2; -        tmp1 = out3 ^ in0; -        out1 = tmp1 ^ in6; -        out2 = tmp1 ^ in7; -        out4 = tmp1 ^ in1; -        out6 = tmp1 ^ in4; -        out7 = tmp0 ^ out2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_D7(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in3; -        out3 = in2 ^ in5 ^ in7; -        out2 = tmp0 ^ in5; -        tmp1 = tmp0 ^ out3 ^ in1; -        out1 = tmp1 ^ in6; -        out4 = tmp1 ^ in4; -        tmp2 = out1 ^ in4; -        out6 = tmp2 ^ in1; -        out7 = tmp2 ^ in2; -        out0 = tmp2 ^ in3; -        out5 = tmp2 ^ in0 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width 
* 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_D8(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in0; -        out5 = in1; -        tmp0 = in1 ^ in2; -        out6 = in0 ^ in2; -        out0 = tmp0 ^ in4; -        tmp1 = tmp0 ^ in3; -        out7 = tmp1 ^ out6; -        out2 = tmp1 ^ in6; -        out3 = out7 ^ in7; -        out1 = tmp1 ^ in1 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_D9(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, 
out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in0 ^ in4; -        out5 = in1 ^ in5; -        out2 = in1 ^ in3 ^ in6; -        out3 = in0 ^ in1 ^ in7; -        out6 = in0 ^ in2 ^ in6; -        out0 = out4 ^ in1 ^ in2; -        out1 = out5 ^ in2 ^ in3; -        out7 = out3 ^ in3; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_DA(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out5 = in1 ^ in4; -        tmp0 = in2 ^ in7; -        tmp1 = in0 ^ in2 ^ in3; -        out0 = tmp0 ^ out5; -        out4 = tmp0 ^ tmp1; -        out2 = tmp0 ^ in3 ^ in6; -        out1 = tmp1 ^ in5; -        out3 = tmp1 ^ in1; -        
out6 = out1 ^ in3; -        out7 = out3 ^ in2 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_DB(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in1; -        tmp1 = in1 ^ in5; -        tmp2 = in3 ^ in7; -        out3 = tmp0 ^ in2; -        out5 = tmp1 ^ in4; -        out6 = tmp1 ^ out3 ^ in6; -        out2 = tmp2 ^ in6; -        tmp3 = tmp2 ^ in4; -        tmp4 = out3 ^ in3; -        out4 = tmp3 ^ in0; -        out1 = tmp4 ^ in5; -        out0 = tmp3 ^ tmp4; -        out7 = tmp0 ^ out2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ 
in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_DC(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in2; -        tmp1 = in0 ^ in3; -        out6 = tmp0 ^ in4; -        tmp2 = tmp0 ^ in7; -        out3 = tmp1 ^ in6; -        tmp3 = tmp1 ^ in1; -        out1 = tmp1 ^ tmp2 ^ in5; -        out4 = tmp2 ^ in6; -        out2 = tmp3 ^ in2; -        out7 = tmp3 ^ in5; -        out5 = tmp2 ^ out2; -        out0 = out2 ^ out3 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_DD(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -       
 uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in0 ^ in6; -        out2 = in0 ^ in1 ^ in3; -        out6 = out3 ^ in2 ^ in4; -        out7 = out2 ^ in5 ^ in7; -        out0 = out6 ^ in1; -        out4 = out6 ^ in7; -        out5 = out7 ^ in0; -        out1 = out5 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_DE(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in3 ^ in6; -        tmp1 = in3 ^ in4 ^ in7; -        out4 = tmp0 ^ in0; -        out5 = tmp1 ^ in1; -        out3 = out4 ^ in7; -        out2 = out3 ^ in6; -        out1 = out2 ^ in5; -        out6 = tmp1 ^ out1; -        out0 = tmp0 ^ out5; -        out7 = out0 ^ out1 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        
out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_DF(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in0 ^ in3 ^ in7; -        tmp0 = out2 ^ in1 ^ in5; -        out1 = tmp0 ^ in2; -        out7 = tmp0 ^ in6; -        out5 = tmp0 ^ in0 ^ in4; -        tmp1 = out1 ^ out5 ^ in6; -        out4 = tmp1 ^ in3; -        out6 = tmp1 ^ in5; -        tmp2 = tmp1 ^ in7; -        out0 = tmp2 ^ in1; -        out3 = tmp2 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_E0(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t 
*)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in1 ^ in7; -        tmp0 = in2 ^ in4; -        out4 = out3 ^ in3 ^ in5; -        out2 = tmp0 ^ in1; -        tmp1 = tmp0 ^ in6; -        out0 = out4 ^ in2; -        out6 = out4 ^ in0; -        out1 = tmp1 ^ in3; -        out5 = tmp1 ^ in0; -        out7 = out5 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_E1(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in1 ^ in4; 
-        tmp0 = in1 ^ in7; -        out3 = tmp0 ^ in3; -        tmp1 = out3 ^ in5; -        out4 = tmp1 ^ in4; -        tmp2 = tmp1 ^ in0; -        out0 = tmp2 ^ in2; -        out6 = tmp2 ^ in6; -        tmp3 = out0 ^ out4 ^ in6; -        out5 = tmp3 ^ in5; -        out7 = tmp0 ^ tmp3; -        out1 = tmp2 ^ out5 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_E2(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in1 ^ in2; -        out4 = in1 ^ in5; -        out2 = in2 ^ in4 ^ in7; -        out5 = in0 ^ in2 ^ in6; -        out0 = out3 ^ in3 ^ in5; -        out7 = out3 ^ in0 ^ in4; -        out6 = out2 ^ out7 ^ in3; -        out1 = out5 ^ in3 ^ in4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ 
in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_E3(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in4 ^ in7; -        tmp0 = in1 ^ in3; -        out3 = tmp0 ^ in2; -        tmp1 = out3 ^ in0; -        out0 = tmp1 ^ in5; -        tmp2 = tmp1 ^ in4; -        out1 = tmp2 ^ in6; -        tmp3 = tmp2 ^ in3; -        out7 = tmp3 ^ in7; -        out6 = out1 ^ out2 ^ in2; -        tmp4 = tmp0 ^ out0; -        out5 = tmp4 ^ in6; -        out4 = tmp3 ^ tmp4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_E4(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, 
out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in6; -        tmp0 = in0 ^ in4; -        tmp1 = tmp0 ^ in2 ^ in6; -        out2 = tmp1 ^ in1; -        out7 = out2 ^ in5; -        tmp2 = tmp0 ^ out7; -        out4 = tmp2 ^ in3; -        out0 = out4 ^ in7; -        out6 = tmp1 ^ out0; -        out5 = tmp2 ^ out6; -        out1 = out5 ^ in0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_E5(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in3 ^ in6; -        tmp0 = in0 ^ in1; -        tmp1 = in5 ^ in7; -        out2 = tmp0 ^ in4 ^ in6; -        tmp2 = tmp1 ^ out2; -   
     out6 = tmp2 ^ in3; -        out7 = tmp2 ^ in2; -        out0 = out6 ^ in2 ^ in4; -        out5 = out6 ^ in1 ^ in2; -        out1 = tmp0 ^ out5 ^ in5; -        out4 = tmp1 ^ out1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_E6(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out3 = in2 ^ in6 ^ in7; -        out2 = out3 ^ in0 ^ in4; -        out4 = out3 ^ in1 ^ in5; -        out1 = out2 ^ in3; -        out7 = out2 ^ out4 ^ in2; -        out0 = out4 ^ in3 ^ in7; -        out5 = out1 ^ in4; -        out6 = out0 ^ out2 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        
in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_E7(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in3; -        out3 = tmp0 ^ in6 ^ in7; -        tmp1 = out3 ^ in0; -        out5 = tmp1 ^ in5; -        tmp2 = tmp1 ^ in4; -        tmp3 = out5 ^ in7; -        out1 = tmp2 ^ in1; -        out0 = tmp3 ^ in1; -        out6 = out1 ^ in2; -        out2 = tmp0 ^ tmp2; -        tmp4 = tmp3 ^ out6; -        out4 = tmp4 ^ in6; -        out7 = tmp4 ^ in0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_E8(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        
uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in3 ^ in6; -        tmp0 = in4 ^ in7; -        out1 = in2 ^ in3 ^ in4; -        out5 = tmp0 ^ in0; -        tmp1 = tmp0 ^ in1; -        tmp2 = tmp1 ^ in5; -        out0 = tmp1 ^ out1; -        out2 = tmp2 ^ in2; -        out6 = tmp2 ^ out5; -        tmp3 = out6 ^ in6; -        out3 = tmp3 ^ in7; -        out7 = tmp3 ^ in2 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_E9(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in1; -        tmp1 = in3 ^ in6; -        tmp2 = tmp0 ^ in6; -        out4 = tmp1 ^ in4; -        out6 = tmp2 ^ in5; -        out7 = tmp2 ^ in2 ^ in7; -        out3 = out6 ^ in3 ^ in7; -        out0 = tmp1 ^ out7; -        
out2 = out3 ^ out4 ^ in0; -        out5 = tmp0 ^ out2; -        out1 = out0 ^ out5 ^ in5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_EA(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in6 ^ in7; -        out5 = in0 ^ in7; -        out6 = in0 ^ in1; -        out0 = in1 ^ in2 ^ in3; -        out2 = in2 ^ in4 ^ in5; -        out7 = out6 ^ in2; -        out1 = out0 ^ out6 ^ in4; -        out3 = out7 ^ in5 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_EB(uint8_t * out, uint8_t * in, unsigned int 
width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in4 ^ in5; -        tmp0 = in0 ^ in1; -        out4 = in4 ^ in6 ^ in7; -        out5 = in0 ^ in5 ^ in7; -        out6 = tmp0 ^ in6; -        tmp1 = tmp0 ^ in2; -        out0 = tmp1 ^ in3; -        out7 = tmp1 ^ in7; -        out1 = out0 ^ in4; -        out3 = out0 ^ in5 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_EC(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 
7]; - -        out3 = in0 ^ in5; -        out4 = in2 ^ in3 ^ in7; -        out5 = in0 ^ in3 ^ in4; -        out6 = out3 ^ in1 ^ in4; -        out1 = out4 ^ in4; -        out0 = out4 ^ in1 ^ in6; -        out2 = out0 ^ out5 ^ in5; -        out7 = out2 ^ in4 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_ED(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in2 ^ in4; -        tmp1 = in3 ^ in5; -        out4 = tmp0 ^ in3 ^ in7; -        out3 = tmp1 ^ in0; -        out1 = out4 ^ in1; -        out5 = out3 ^ in4; -        out7 = out1 ^ out5 ^ in6; -        out2 = tmp0 ^ out7; -        out0 = tmp1 ^ out7; -        out6 = out2 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ 
in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_EE(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in2; -        tmp0 = in0 ^ in1; -        out5 = in0 ^ in3; -        tmp1 = tmp0 ^ in2; -        out6 = tmp0 ^ in4; -        tmp2 = tmp1 ^ out5; -        out7 = tmp1 ^ in5; -        out1 = tmp2 ^ out6 ^ in7; -        out0 = tmp2 ^ in6; -        tmp3 = out7 ^ in1; -        out3 = tmp3 ^ in7; -        out2 = tmp3 ^ in4 ^ in6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_EF(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, 
tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out4 = in2 ^ in4; -        tmp0 = in0 ^ in5; -        tmp1 = in4 ^ in6; -        out5 = tmp0 ^ in3; -        out2 = tmp0 ^ tmp1; -        out6 = tmp1 ^ in0 ^ in1; -        out3 = out5 ^ in2 ^ in7; -        out7 = out3 ^ in1 ^ in3; -        out0 = out4 ^ out6 ^ in3; -        out1 = tmp1 ^ out0 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_F0(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in2; -        tmp1 = in4 ^ in5; -        out2 = tmp0 ^ in6; -        out3 = tmp1 ^ in1; -        tmp2 = tmp1 ^ in7; -        out1 = out2 ^ out3 ^ in3; -        
tmp3 = tmp0 ^ tmp2; -        out0 = tmp3 ^ in3; -        out5 = tmp3 ^ in0; -        out4 = out1 ^ out5 ^ in4; -        out7 = out4 ^ in2; -        out6 = tmp2 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_F1(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in1 ^ in6; -        tmp0 = in3 ^ in5; -        out3 = tmp0 ^ in1 ^ in4; -        tmp1 = out3 ^ in2; -        out1 = tmp1 ^ in6; -        tmp2 = tmp1 ^ in0; -        tmp3 = out1 ^ in5; -        out0 = tmp2 ^ in7; -        out6 = tmp2 ^ in4; -        out7 = tmp3 ^ in0; -        out5 = tmp0 ^ out0; -        out4 = tmp3 ^ out5 ^ in1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        
out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_F2(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in4 ^ in5; -        out2 = in2 ^ in6 ^ in7; -        tmp1 = tmp0 ^ in1; -        tmp2 = tmp1 ^ in2; -        out0 = tmp2 ^ in3; -        out3 = tmp2 ^ in7; -        out5 = out3 ^ in0 ^ in4; -        tmp3 = tmp0 ^ out5; -        out7 = tmp3 ^ in3; -        out4 = tmp3 ^ out2; -        out1 = out0 ^ out4 ^ in4; -        out6 = tmp1 ^ out1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_F3(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -      
  uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in6 ^ in7; -        tmp0 = in0 ^ in1; -        out4 = tmp0 ^ in6; -        tmp1 = tmp0 ^ in2; -        out5 = tmp1 ^ in7; -        out6 = tmp1 ^ in3; -        out7 = out6 ^ in4; -        out0 = out7 ^ in5; -        out1 = out0 ^ in6; -        out3 = out0 ^ in0 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_F4(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in0 ^ in1 ^ in2; -        tmp0 = out2 ^ in3; -        out4 = tmp0 ^ in4; -        out5 = out4 ^ in5; -        out6 = out5 ^ in6; -        out7 = out6 ^ in7; -        out0 = out7 ^ in0; -        out1 = out0 ^ in1; -        
out3 = tmp0 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_F5(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in0 ^ in1; -        tmp0 = out2 ^ in2; -        out4 = tmp0 ^ in3; -        out5 = out4 ^ in4; -        out6 = out5 ^ in5; -        out7 = out6 ^ in6; -        out0 = out7 ^ in7; -        out1 = out0 ^ in0; -        out3 = tmp0 ^ out0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_F6(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    
uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in7; -        out2 = tmp0 ^ in2; -        out4 = out2 ^ in1 ^ in4; -        out7 = out4 ^ in3 ^ in5; -        out5 = out7 ^ in4 ^ in7; -        out0 = tmp0 ^ out7 ^ in6; -        tmp1 = out0 ^ in1; -        out6 = out0 ^ in0 ^ in5; -        out3 = tmp1 ^ in3; -        out1 = tmp0 ^ tmp1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_F7(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = 
out_ptr[width * 7]; - -        out2 = in0 ^ in7; -        tmp0 = out2 ^ in1; -        out4 = tmp0 ^ in2; -        out5 = out4 ^ in3 ^ in7; -        out6 = out5 ^ in4; -        out7 = out6 ^ in5; -        out0 = out7 ^ in6; -        out1 = out0 ^ in7; -        out3 = tmp0 ^ out1; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_F8(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in4; -        tmp1 = in3 ^ in5; -        tmp2 = tmp0 ^ in6; -        out4 = tmp0 ^ tmp1; -        out1 = tmp1 ^ in2 ^ in4; -        out3 = tmp2 ^ in1; -        out5 = out3 ^ in5; -        out7 = out1 ^ out5 ^ in7; -        out6 = tmp1 ^ out7; -        out0 = tmp2 ^ out7; -        out2 = out6 ^ in0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ 
in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_F9(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in3 ^ in5; -        tmp1 = in0 ^ in6; -        out4 = tmp0 ^ in0; -        tmp2 = tmp1 ^ in4; -        tmp3 = tmp1 ^ in2; -        out5 = tmp2 ^ in1; -        out3 = out5 ^ in3; -        tmp4 = tmp3 ^ out3; -        out1 = tmp4 ^ in5; -        out0 = tmp4 ^ in0 ^ in7; -        out6 = tmp0 ^ out0 ^ in4; -        out7 = tmp2 ^ tmp4; -        out2 = tmp3 ^ out6; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_FA(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) 
-    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in1; -        tmp1 = tmp0 ^ in2; -        tmp2 = tmp0 ^ in5; -        tmp3 = tmp1 ^ in7; -        out5 = tmp2 ^ in6; -        out6 = tmp3 ^ in6; -        out7 = tmp3 ^ in3; -        out3 = out6 ^ in4; -        out2 = tmp1 ^ out5; -        out4 = out2 ^ out3 ^ in1; -        out0 = out4 ^ out7 ^ in5; -        out1 = tmp2 ^ out0; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_FB(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in5 ^ in6; -        tmp0 = 
in0 ^ in1; -        out4 = in0 ^ in5 ^ in7; -        out5 = tmp0 ^ in6; -        tmp1 = tmp0 ^ in2; -        out6 = tmp1 ^ in7; -        out7 = tmp1 ^ in3; -        out0 = out7 ^ in4; -        out1 = out0 ^ in5; -        out3 = out0 ^ in6 ^ in7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_FC(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in1 ^ in2; -        tmp1 = in0 ^ in7; -        out2 = tmp0 ^ tmp1 ^ in5; -        out3 = tmp1 ^ in4; -        tmp2 = out2 ^ in6; -        out6 = tmp2 ^ in4; -        out7 = tmp2 ^ in3; -        out4 = out6 ^ in1 ^ in3; -        tmp3 = out4 ^ in0; -        out1 = tmp3 ^ in6; -        out0 = tmp3 ^ in1 ^ in5; -        out5 = tmp0 ^ out4; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ 
in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_FD(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in5; -        tmp1 = in1 ^ in7; -        out2 = tmp0 ^ tmp1; -        out6 = out2 ^ in2 ^ in4; -        tmp2 = out6 ^ in0; -        out1 = tmp2 ^ in3; -        out0 = tmp0 ^ out1 ^ in6; -        out5 = out0 ^ in2; -        tmp3 = out5 ^ in1; -        out3 = tmp3 ^ in6; -        out7 = tmp2 ^ tmp3; -        out4 = tmp1 ^ out7; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_FE(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, 
out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        tmp0 = in0 ^ in2; -        out2 = tmp0 ^ in5; -        out3 = tmp0 ^ in4; -        tmp1 = out3 ^ in6; -        out4 = tmp1 ^ in5; -        tmp2 = tmp1 ^ in1; -        out6 = tmp2 ^ in7; -        tmp3 = tmp2 ^ in0; -        out0 = tmp3 ^ in3; -        tmp4 = out0 ^ out4 ^ in7; -        out5 = tmp4 ^ in6; -        out7 = tmp4 ^ in2; -        out1 = tmp3 ^ out5; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -static void gf8_muladd_FF(uint8_t * out, uint8_t * in, unsigned int width) -{ -    unsigned int i; -    uint64_t * in_ptr = (uint64_t *)in; -    uint64_t * out_ptr = (uint64_t *)out; - -    for (i = 0; i < width; i++) -    { -        uint64_t out0, out1, out2, out3, out4, out5, out6, out7; -        uint64_t tmp0, tmp1, tmp2, tmp3; - -        uint64_t in0 = out_ptr[0]; -        uint64_t in1 = out_ptr[width]; -        uint64_t in2 = out_ptr[width * 2]; -        uint64_t in3 = out_ptr[width * 3]; -        uint64_t in4 = out_ptr[width * 4]; -        uint64_t in5 = out_ptr[width * 5]; -        uint64_t in6 = out_ptr[width * 6]; -        uint64_t in7 = out_ptr[width * 7]; - -        out2 = in0 ^ in5; -        
tmp0 = in4 ^ in7; -        tmp1 = out2 ^ in2; -        out4 = tmp1 ^ in6; -        out7 = tmp1 ^ in1 ^ in3; -        out1 = tmp0 ^ out7; -        tmp2 = out1 ^ in5; -        out6 = tmp2 ^ in3; -        tmp3 = tmp2 ^ in7; -        out0 = tmp3 ^ in6; -        out3 = tmp3 ^ in1; -        out5 = tmp0 ^ out0 ^ in2; - -        out_ptr[0] = out0 ^ in_ptr[0]; -        out_ptr[width] = out1 ^ in_ptr[width]; -        out_ptr[width * 2] = out2 ^ in_ptr[width * 2]; -        out_ptr[width * 3] = out3 ^ in_ptr[width * 3]; -        out_ptr[width * 4] = out4 ^ in_ptr[width * 4]; -        out_ptr[width * 5] = out5 ^ in_ptr[width * 5]; -        out_ptr[width * 6] = out6 ^ in_ptr[width * 6]; -        out_ptr[width * 7] = out7 ^ in_ptr[width * 7]; - -        in_ptr++; -        out_ptr++; -    } -} - -void (* ec_gf_muladd[])(uint8_t * out, uint8_t * in, unsigned int width) = -{ -    gf8_muladd_00, gf8_muladd_01, gf8_muladd_02, gf8_muladd_03, -    gf8_muladd_04, gf8_muladd_05, gf8_muladd_06, gf8_muladd_07, -    gf8_muladd_08, gf8_muladd_09, gf8_muladd_0A, gf8_muladd_0B, -    gf8_muladd_0C, gf8_muladd_0D, gf8_muladd_0E, gf8_muladd_0F, -    gf8_muladd_10, gf8_muladd_11, gf8_muladd_12, gf8_muladd_13, -    gf8_muladd_14, gf8_muladd_15, gf8_muladd_16, gf8_muladd_17, -    gf8_muladd_18, gf8_muladd_19, gf8_muladd_1A, gf8_muladd_1B, -    gf8_muladd_1C, gf8_muladd_1D, gf8_muladd_1E, gf8_muladd_1F, -    gf8_muladd_20, gf8_muladd_21, gf8_muladd_22, gf8_muladd_23, -    gf8_muladd_24, gf8_muladd_25, gf8_muladd_26, gf8_muladd_27, -    gf8_muladd_28, gf8_muladd_29, gf8_muladd_2A, gf8_muladd_2B, -    gf8_muladd_2C, gf8_muladd_2D, gf8_muladd_2E, gf8_muladd_2F, -    gf8_muladd_30, gf8_muladd_31, gf8_muladd_32, gf8_muladd_33, -    gf8_muladd_34, gf8_muladd_35, gf8_muladd_36, gf8_muladd_37, -    gf8_muladd_38, gf8_muladd_39, gf8_muladd_3A, gf8_muladd_3B, -    gf8_muladd_3C, gf8_muladd_3D, gf8_muladd_3E, gf8_muladd_3F, -    gf8_muladd_40, gf8_muladd_41, gf8_muladd_42, gf8_muladd_43, -    gf8_muladd_44, 
gf8_muladd_45, gf8_muladd_46, gf8_muladd_47, -    gf8_muladd_48, gf8_muladd_49, gf8_muladd_4A, gf8_muladd_4B, -    gf8_muladd_4C, gf8_muladd_4D, gf8_muladd_4E, gf8_muladd_4F, -    gf8_muladd_50, gf8_muladd_51, gf8_muladd_52, gf8_muladd_53, -    gf8_muladd_54, gf8_muladd_55, gf8_muladd_56, gf8_muladd_57, -    gf8_muladd_58, gf8_muladd_59, gf8_muladd_5A, gf8_muladd_5B, -    gf8_muladd_5C, gf8_muladd_5D, gf8_muladd_5E, gf8_muladd_5F, -    gf8_muladd_60, gf8_muladd_61, gf8_muladd_62, gf8_muladd_63, -    gf8_muladd_64, gf8_muladd_65, gf8_muladd_66, gf8_muladd_67, -    gf8_muladd_68, gf8_muladd_69, gf8_muladd_6A, gf8_muladd_6B, -    gf8_muladd_6C, gf8_muladd_6D, gf8_muladd_6E, gf8_muladd_6F, -    gf8_muladd_70, gf8_muladd_71, gf8_muladd_72, gf8_muladd_73, -    gf8_muladd_74, gf8_muladd_75, gf8_muladd_76, gf8_muladd_77, -    gf8_muladd_78, gf8_muladd_79, gf8_muladd_7A, gf8_muladd_7B, -    gf8_muladd_7C, gf8_muladd_7D, gf8_muladd_7E, gf8_muladd_7F, -    gf8_muladd_80, gf8_muladd_81, gf8_muladd_82, gf8_muladd_83, -    gf8_muladd_84, gf8_muladd_85, gf8_muladd_86, gf8_muladd_87, -    gf8_muladd_88, gf8_muladd_89, gf8_muladd_8A, gf8_muladd_8B, -    gf8_muladd_8C, gf8_muladd_8D, gf8_muladd_8E, gf8_muladd_8F, -    gf8_muladd_90, gf8_muladd_91, gf8_muladd_92, gf8_muladd_93, -    gf8_muladd_94, gf8_muladd_95, gf8_muladd_96, gf8_muladd_97, -    gf8_muladd_98, gf8_muladd_99, gf8_muladd_9A, gf8_muladd_9B, -    gf8_muladd_9C, gf8_muladd_9D, gf8_muladd_9E, gf8_muladd_9F, -    gf8_muladd_A0, gf8_muladd_A1, gf8_muladd_A2, gf8_muladd_A3, -    gf8_muladd_A4, gf8_muladd_A5, gf8_muladd_A6, gf8_muladd_A7, -    gf8_muladd_A8, gf8_muladd_A9, gf8_muladd_AA, gf8_muladd_AB, -    gf8_muladd_AC, gf8_muladd_AD, gf8_muladd_AE, gf8_muladd_AF, -    gf8_muladd_B0, gf8_muladd_B1, gf8_muladd_B2, gf8_muladd_B3, -    gf8_muladd_B4, gf8_muladd_B5, gf8_muladd_B6, gf8_muladd_B7, -    gf8_muladd_B8, gf8_muladd_B9, gf8_muladd_BA, gf8_muladd_BB, -    gf8_muladd_BC, gf8_muladd_BD, gf8_muladd_BE, gf8_muladd_BF, -    
gf8_muladd_C0, gf8_muladd_C1, gf8_muladd_C2, gf8_muladd_C3, -    gf8_muladd_C4, gf8_muladd_C5, gf8_muladd_C6, gf8_muladd_C7, -    gf8_muladd_C8, gf8_muladd_C9, gf8_muladd_CA, gf8_muladd_CB, -    gf8_muladd_CC, gf8_muladd_CD, gf8_muladd_CE, gf8_muladd_CF, -    gf8_muladd_D0, gf8_muladd_D1, gf8_muladd_D2, gf8_muladd_D3, -    gf8_muladd_D4, gf8_muladd_D5, gf8_muladd_D6, gf8_muladd_D7, -    gf8_muladd_D8, gf8_muladd_D9, gf8_muladd_DA, gf8_muladd_DB, -    gf8_muladd_DC, gf8_muladd_DD, gf8_muladd_DE, gf8_muladd_DF, -    gf8_muladd_E0, gf8_muladd_E1, gf8_muladd_E2, gf8_muladd_E3, -    gf8_muladd_E4, gf8_muladd_E5, gf8_muladd_E6, gf8_muladd_E7, -    gf8_muladd_E8, gf8_muladd_E9, gf8_muladd_EA, gf8_muladd_EB, -    gf8_muladd_EC, gf8_muladd_ED, gf8_muladd_EE, gf8_muladd_EF, -    gf8_muladd_F0, gf8_muladd_F1, gf8_muladd_F2, gf8_muladd_F3, -    gf8_muladd_F4, gf8_muladd_F5, gf8_muladd_F6, gf8_muladd_F7, -    gf8_muladd_F8, gf8_muladd_F9, gf8_muladd_FA, gf8_muladd_FB, -    gf8_muladd_FC, gf8_muladd_FD, gf8_muladd_FE, gf8_muladd_FF -}; diff --git a/xlators/cluster/ec/src/ec-gf8.c b/xlators/cluster/ec/src/ec-gf8.c new file mode 100644 index 00000000000..2665632706b --- /dev/null +++ b/xlators/cluster/ec/src/ec-gf8.c @@ -0,0 +1,5959 @@ +/* +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#include "ec-gf8.h" + +static ec_gf_op_t ec_gf8_mul_00_ops[] = { +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_00 = { +    0, +    { 0, }, +    ec_gf8_mul_00_ops +}; + +static ec_gf_op_t ec_gf8_mul_01_ops[] = { +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_01 = { +    8, +    { 0, 1, 2, 3, 4, 5, 6, 7, }, +    ec_gf8_mul_01_ops +}; + +static ec_gf_op_t ec_gf8_mul_02_ops[] = { +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_02 = { +    8, +    { 7, 0, 1, 2, 3, 4, 5, 6, }, +    ec_gf8_mul_02_ops +}; + +static ec_gf_op_t ec_gf8_mul_03_ops[] = { +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_COPY,   8,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_03 = { +    9, +    { 0, 1, 2, 3, 4, 5, 6, 7, 8, }, +    ec_gf8_mul_03_ops +}; + +static ec_gf_op_t ec_gf8_mul_04_ops[] = { +    { EC_GF_OP_XOR3,   8,  6,  7 }, +    { EC_GF_OP_XOR2,   2,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_04 = { +    9, +    { 6, 7, 0, 1, 2, 3, 4, 5, 8, }, +    ec_gf8_mul_04_ops +}; + +static ec_gf_op_t ec_gf8_mul_05_ops[] = { +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { 
EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_05 = { +    8, +    { 0, 1, 2, 6, 7, 3, 4, 5, }, +    ec_gf8_mul_05_ops +}; + +static ec_gf_op_t ec_gf8_mul_06_ops[] = { +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_COPY,   8,  2,  0 }, +    { EC_GF_OP_XOR2,   8,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_06 = { +    9, +    { 7, 0, 1, 2, 8, 3, 4, 5, 6, }, +    ec_gf8_mul_06_ops +}; + +static ec_gf_op_t ec_gf8_mul_07_ops[] = { +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_07 = { +    8, +    { 6, 0, 1, 3, 2, 4, 5, 7, }, +    ec_gf8_mul_07_ops +}; + +static ec_gf_op_t ec_gf8_mul_08_ops[] = { +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR3,   8,  6,  7 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_08 = { +    9, + 
   { 5, 6, 7, 0, 1, 2, 3, 4, 8, }, +    ec_gf8_mul_08_ops +}; + +static ec_gf_op_t ec_gf8_mul_09_ops[] = { +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_09 = { +    8, +    { 0, 1, 2, 3, 5, 6, 7, 4, }, +    ec_gf8_mul_09_ops +}; + +static ec_gf_op_t ec_gf8_mul_0A_ops[] = { +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_0A = { +    8, +    { 5, 0, 1, 2, 6, 7, 3, 4, }, +    ec_gf8_mul_0A_ops +}; + +static ec_gf_op_t ec_gf8_mul_0B_ops[] = { +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_COPY,   9,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_COPY,   8,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR3,   3,  8,  6 }, +    { EC_GF_OP_XOR2,   1,  9,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_0B = { +    10, +    { 7, 
1, 5, 2, 4, 3, 0, 6, 8, 9, }, +    ec_gf8_mul_0B_ops +}; + +static ec_gf_op_t ec_gf8_mul_0C_ops[] = { +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_COPY,   8,  1,  0 }, +    { EC_GF_OP_XOR2,   8,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_0C = { +    9, +    { 5, 7, 0, 1, 8, 2, 3, 4, 6, }, +    ec_gf8_mul_0C_ops +}; + +static ec_gf_op_t ec_gf8_mul_0D_ops[] = { +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR3,   8,  2,  4 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR3,   2,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_0D = { +    9, +    { 5, 6, 7, 3, 1, 0, 2, 4, 8, }, +    ec_gf8_mul_0D_ops +}; + +static ec_gf_op_t ec_gf8_mul_0E_ops[] = { +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   
3,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_0E = { +    8, +    { 7, 0, 6, 1, 3, 2, 4, 5, }, +    ec_gf8_mul_0E_ops +}; + +static ec_gf_op_t ec_gf8_mul_0F_ops[] = { +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_0F = { +    8, +    { 1, 0, 5, 6, 7, 2, 3, 4, }, +    ec_gf8_mul_0F_ops +}; + +static ec_gf_op_t ec_gf8_mul_10_ops[] = { +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_10 = { +    8, +    { 4, 5, 6, 7, 0, 1, 2, 3, }, +    ec_gf8_mul_10_ops +}; + +static ec_gf_op_t ec_gf8_mul_11_ops[] = { +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_11 = { +    8, +    { 4, 1, 2, 6, 0, 
5, 7, 3, }, +    ec_gf8_mul_11_ops +}; + +static ec_gf_op_t ec_gf8_mul_12_ops[] = { +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_12 = { +    8, +    { 7, 0, 1, 2, 3, 5, 6, 4, }, +    ec_gf8_mul_12_ops +}; + +static ec_gf_op_t ec_gf8_mul_13_ops[] = { +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR3,   8,  3,  7 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  8,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_13 = { +    9, +    { 4, 5, 2, 6, 0, 1, 7, 3, 8, }, +    ec_gf8_mul_13_ops +}; + +static ec_gf_op_t ec_gf8_mul_14_ops[] = { +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_14 = { +    8, +    { 6, 7, 0, 1, 2, 4, 5, 3, }, +    ec_gf8_mul_14_ops +}; + +static ec_gf_op_t ec_gf8_mul_15_ops[] = 
{ +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR3,   5,  8,  7 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_15 = { +    9, +    { 0, 1, 2, 4, 7, 6, 5, 3, 8, }, +    ec_gf8_mul_15_ops +}; + +static ec_gf_op_t ec_gf8_mul_16_ops[] = { +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_16 = { +    8, +    { 6, 7, 4, 1, 2, 3, 5, 0, }, +    ec_gf8_mul_16_ops +}; + +static ec_gf_op_t ec_gf8_mul_17_ops[] = { +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { 
EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_17 = { +    8, +    { 5, 7, 0, 1, 3, 2, 4, 6, }, +    ec_gf8_mul_17_ops +}; + +static ec_gf_op_t ec_gf8_mul_18_ops[] = { +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_18 = { +    9, +    { 4, 5, 7, 6, 0, 1, 2, 3, 8, }, +    ec_gf8_mul_18_ops +}; + +static ec_gf_op_t ec_gf8_mul_19_ops[] = { +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_19 = { +    8, +    { 0, 5, 2, 6, 7, 1, 3, 4, }, +    ec_gf8_mul_19_ops +}; + +static ec_gf_op_t ec_gf8_mul_1A_ops[] = { +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { 
EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_1A = { +    8, +    { 7, 0, 4, 5, 3, 1, 2, 6, }, +    ec_gf8_mul_1A_ops +}; + +static ec_gf_op_t ec_gf8_mul_1B_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_1B = { +    8, +    { 7, 4, 5, 6, 3, 1, 2, 0, }, +    ec_gf8_mul_1B_ops +}; + +static ec_gf_op_t ec_gf8_mul_1C_ops[] = { +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_1C = { +    8, +    { 5, 4, 3, 0, 1, 7, 2, 6, }, +    ec_gf8_mul_1C_ops +}; + +static ec_gf_op_t ec_gf8_mul_1D_ops[] = { +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR3,   8,  4,  2 }, +    { 
EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_1D = { +    9, +    { 0, 7, 5, 8, 2, 3, 4, 1, 6, }, +    ec_gf8_mul_1D_ops +}; + +static ec_gf_op_t ec_gf8_mul_1E_ops[] = { +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_1E = { +    8, +    { 4, 7, 5, 1, 6, 0, 2, 3, }, +    ec_gf8_mul_1E_ops +}; + +static ec_gf_op_t ec_gf8_mul_1F_ops[] = { +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR3,   8,  3,  7 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_1F = { +    9, +    { 1, 4, 5, 6, 7, 0, 3, 2, 8, }, +    ec_gf8_mul_1F_ops +}; + +static ec_gf_op_t ec_gf8_mul_20_ops[] = { +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { 
EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_20 = { +    8, +    { 7, 4, 5, 6, 3, 0, 1, 2, }, +    ec_gf8_mul_20_ops +}; + +static ec_gf_op_t ec_gf8_mul_21_ops[] = { +    { EC_GF_OP_COPY,   9,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR3,   8,  7,  5 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_XOR2,   2,  8,  0 }, +    { EC_GF_OP_XOR2,   4,  9,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_21 = { +    10, +    { 0, 1, 2, 7, 5, 4, 3, 6, 8, 9, }, +    ec_gf8_mul_21_ops +}; + +static ec_gf_op_t ec_gf8_mul_22_ops[] = { +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_22 = { +    8, +    { 3, 0, 5, 2, 6, 4, 1, 7, }, +    ec_gf8_mul_22_ops +}; + +static ec_gf_op_t ec_gf8_mul_23_ops[] = { +    { EC_GF_OP_COPY,   8,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { 
EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_23 = { +    9, +    { 0, 4, 3, 2, 5, 6, 1, 8, 7, }, +    ec_gf8_mul_23_ops +}; + +static ec_gf_op_t ec_gf8_mul_24_ops[] = { +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_24 = { +    8, +    { 6, 7, 0, 1, 2, 4, 5, 3, }, +    ec_gf8_mul_24_ops +}; + +static ec_gf_op_t ec_gf8_mul_25_ops[] = { +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_25 = { +    8, +    { 2, 7, 0, 1, 3, 4, 5, 6, }, +    ec_gf8_mul_25_ops +}; + +static ec_gf_op_t ec_gf8_mul_26_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { 
EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_26 = { +    8, +    { 3, 4, 1, 2, 0, 5, 6, 7, }, +    ec_gf8_mul_26_ops +}; + +static ec_gf_op_t ec_gf8_mul_27_ops[] = { +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_27 = { +    8, +    { 3, 0, 1, 2, 6, 7, 4, 5, }, +    ec_gf8_mul_27_ops +}; + +static ec_gf_op_t ec_gf8_mul_28_ops[] = { +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_28 = { +    8, +    { 5, 6, 3, 0, 1, 2, 4, 7, }, +    ec_gf8_mul_28_ops +}; + +static ec_gf_op_t ec_gf8_mul_29_ops[] = { +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { 
EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_29 = { +    8, +    { 4, 6, 3, 5, 7, 0, 1, 2, }, +    ec_gf8_mul_29_ops +}; + +static ec_gf_op_t ec_gf8_mul_2A_ops[] = { +    { EC_GF_OP_COPY,   8,  1,  0 }, +    { EC_GF_OP_XOR2,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR3,   6,  8,  4 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_2A = { +    9, +    { 3, 4, 7, 2, 6, 5, 1, 0, 8, }, +    ec_gf8_mul_2A_ops +}; + +static ec_gf_op_t ec_gf8_mul_2B_ops[] = { +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_2B = { +    8, +    { 3, 4, 7, 5, 6, 0, 1, 2, }, +    ec_gf8_mul_2B_ops +}; + +static ec_gf_op_t ec_gf8_mul_2C_ops[] = { +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { 
EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_2C = { +    8, +    { 5, 6, 7, 0, 2, 3, 4, 1, }, +    ec_gf8_mul_2C_ops +}; + +static ec_gf_op_t ec_gf8_mul_2D_ops[] = { +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR3,   8,  4,  6 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_2D = { +    9, +    { 7, 0, 3, 5, 1, 4, 2, 6, 8, }, +    ec_gf8_mul_2D_ops +}; + +static ec_gf_op_t ec_gf8_mul_2E_ops[] = { +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_COPY,   8,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   8,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  8,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_2E = { +    9, +    { 5, 0, 7, 3, 2, 6, 4, 1, 8, }, +    ec_gf8_mul_2E_ops +}; + +static ec_gf_op_t ec_gf8_mul_2F_ops[] = { +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { 
EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR3,   8,  7,  6 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_XOR2,   2,  8,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_2F = { +    9, +    { 6, 3, 2, 5, 7, 0, 1, 4, 8, }, +    ec_gf8_mul_2F_ops +}; + +static ec_gf_op_t ec_gf8_mul_30_ops[] = { +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   8,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR3,   6,  8,  7 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_30 = { +    9, +    { 3, 4, 7, 5, 0, 6, 1, 2, 8, }, +    ec_gf8_mul_30_ops +}; + +static ec_gf_op_t ec_gf8_mul_31_ops[] = { +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_31 = { +    8, +    { 7, 1, 4, 5, 6, 0, 2, 3, }, +    ec_gf8_mul_31_ops +}; + +static ec_gf_op_t ec_gf8_mul_32_ops[] = { +    { 
EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_32 = { +    8, +    { 3, 4, 6, 7, 5, 0, 1, 2, }, +    ec_gf8_mul_32_ops +}; + +static ec_gf_op_t ec_gf8_mul_33_ops[] = { +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_33 = { +    8, +    { 5, 4, 3, 0, 2, 1, 6, 7, }, +    ec_gf8_mul_33_ops +}; + +static ec_gf_op_t ec_gf8_mul_34_ops[] = { +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t 
ec_gf8_mul_34 = { +    8, +    { 7, 5, 3, 0, 2, 4, 1, 6, }, +    ec_gf8_mul_34_ops +}; + +static ec_gf_op_t ec_gf8_mul_35_ops[] = { +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_35 = { +    8, +    { 6, 7, 5, 4, 2, 0, 1, 3, }, +    ec_gf8_mul_35_ops +}; + +static ec_gf_op_t ec_gf8_mul_36_ops[] = { +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_36 = { +    8, +    { 6, 7, 4, 1, 2, 3, 0, 5, }, +    ec_gf8_mul_36_ops +}; + +static ec_gf_op_t ec_gf8_mul_37_ops[] = { +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR3,   8,  0,  1 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, 
+    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_37 = { +    9, +    { 6, 7, 2, 1, 0, 3, 4, 5, 8, }, +    ec_gf8_mul_37_ops +}; + +static ec_gf_op_t ec_gf8_mul_38_ops[] = { +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR3,   8,  6,  7 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  8,  0 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_38 = { +    9, +    { 4, 5, 6, 3, 0, 1, 7, 2, 8, }, +    ec_gf8_mul_38_ops +}; + +static ec_gf_op_t ec_gf8_mul_39_ops[] = { +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_39 = { +    8, +    { 1, 6, 3, 0, 5, 2, 4, 7, }, +    ec_gf8_mul_39_ops +}; + +static ec_gf_op_t ec_gf8_mul_3A_ops[] = { +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    
{ EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_3A = { +    8, +    { 3, 4, 7, 0, 5, 6, 1, 2, }, +    ec_gf8_mul_3A_ops +}; + +static ec_gf_op_t ec_gf8_mul_3B_ops[] = { +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR3,   8,  7,  3 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_3B = { +    9, +    { 3, 0, 1, 7, 6, 2, 4, 8, 5, }, +    ec_gf8_mul_3B_ops +}; + +static ec_gf_op_t ec_gf8_mul_3C_ops[] = { +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_3C = { +    8, +    { 3, 6, 4, 1, 7, 2, 0, 5, }, +    ec_gf8_mul_3C_ops +}; + +static ec_gf_op_t ec_gf8_mul_3D_ops[] = { +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { 
EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_3D = { +    8, +    { 2, 3, 4, 5, 6, 7, 0, 1, }, +    ec_gf8_mul_3D_ops +}; + +static ec_gf_op_t ec_gf8_mul_3E_ops[] = { +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_3E = { +    8, +    { 6, 1, 2, 7, 0, 3, 5, 4, }, +    ec_gf8_mul_3E_ops +}; + +static ec_gf_op_t ec_gf8_mul_3F_ops[] = { +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_COPY,  10,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_COPY,   9,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR3,   4,  9,  7 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   3, 10,  0 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_3F = { +    11, +    { 1, 7, 6, 2, 4, 3, 5, 0, 8, 9, 10, }, +    ec_gf8_mul_3F_ops +}; + +static ec_gf_op_t ec_gf8_mul_40_ops[] = { +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { 
EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR3,   8,  7,  6 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_40 = { +    9, +    { 5, 7, 4, 6, 2, 3, 0, 1, 8, }, +    ec_gf8_mul_40_ops +}; + +static ec_gf_op_t ec_gf8_mul_41_ops[] = { +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   8,  4,  0 }, +    { EC_GF_OP_XOR2,   8,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_41 = { +    9, +    { 0, 7, 6, 5, 3, 4, 8, 1, 2, }, +    ec_gf8_mul_41_ops +}; + +static ec_gf_op_t ec_gf8_mul_42_ops[] = { +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   8,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_42 = { +    9, +    { 2, 7, 1, 6, 4, 3, 0, 5, 8, }, +    ec_gf8_mul_42_ops +}; + +static ec_gf_op_t ec_gf8_mul_43_ops[] = { +    { 
EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_43 = { +    8, +    { 2, 6, 4, 1, 7, 3, 0, 5, }, +    ec_gf8_mul_43_ops +}; + +static ec_gf_op_t ec_gf8_mul_44_ops[] = { +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_44 = { +    8, +    { 2, 3, 4, 1, 6, 5, 0, 7, }, +    ec_gf8_mul_44_ops +}; + +static ec_gf_op_t ec_gf8_mul_45_ops[] = { +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_45 = { +    8, +    { 2, 3, 0, 1, 7, 4, 5, 6, }, +    ec_gf8_mul_45_ops +}; + +static ec_gf_op_t ec_gf8_mul_46_ops[] = { +    { EC_GF_OP_XOR3,   8,  2,  4 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   8,  0,  0 }, +    { 
EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_46 = { +    9, +    { 2, 0, 1, 3, 4, 5, 6, 7, 8, }, +    ec_gf8_mul_46_ops +}; + +static ec_gf_op_t ec_gf8_mul_47_ops[] = { +    { EC_GF_OP_XOR3,   8,  0,  1 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_47 = { +    9, +    { 2, 3, 4, 5, 6, 7, 0, 1, 8, }, +    ec_gf8_mul_47_ops +}; + +static ec_gf_op_t ec_gf8_mul_48_ops[] = { +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_48 = { +    8, +    { 4, 5, 6, 0, 1, 3, 7, 2, }, +    ec_gf8_mul_48_ops +}; + +static ec_gf_op_t ec_gf8_mul_49_ops[] = { +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR3,   8,  0,  6 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR3,   1,  8,  5 }, +    { 
EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_49 = { +    9, +    { 7, 2, 4, 0, 3, 5, 1, 6, 8, }, +    ec_gf8_mul_49_ops +}; + +static ec_gf_op_t ec_gf8_mul_4A_ops[] = { +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_4A = { +    8, +    { 5, 6, 7, 0, 1, 3, 4, 2, }, +    ec_gf8_mul_4A_ops +}; + +static ec_gf_op_t ec_gf8_mul_4B_ops[] = { +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR3,   8,  3,  7 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_4B = { +    9, +    { 5, 3, 6, 7, 0, 2, 4, 1, 8, }, +    ec_gf8_mul_4B_ops +}; + +static ec_gf_op_t ec_gf8_mul_4C_ops[] = { +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { 
EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_4C = { +    8, +    { 5, 3, 4, 7, 0, 6, 2, 1, }, +    ec_gf8_mul_4C_ops +}; + +static ec_gf_op_t ec_gf8_mul_4D_ops[] = { +    { EC_GF_OP_COPY,   8,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR3,   9,  3,  1 }, +    { EC_GF_OP_XOR2,   5,  9,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR3,   0,  8,  2 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_4D = { +    10, +    { 0, 9, 3, 5, 6, 4, 7, 1, 2, 8, }, +    ec_gf8_mul_4D_ops +}; + +static ec_gf_op_t ec_gf8_mul_4E_ops[] = { +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_4E = { +    8, +    { 2, 3, 0, 1, 5, 6, 7, 4, }, +    ec_gf8_mul_4E_ops +}; + +static ec_gf_op_t ec_gf8_mul_4F_ops[] = { +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { 
EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_4F = { +    8, +    { 0, 3, 5, 6, 1, 2, 7, 4, }, +    ec_gf8_mul_4F_ops +}; + +static ec_gf_op_t ec_gf8_mul_50_ops[] = { +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_50 = { +    8, +    { 4, 5, 7, 3, 0, 1, 2, 6, }, +    ec_gf8_mul_50_ops +}; + +static ec_gf_op_t ec_gf8_mul_51_ops[] = { +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_51 = { +    8, +    { 0, 1, 7, 2, 3, 4, 5, 6, }, +    ec_gf8_mul_51_ops +}; + +static ec_gf_op_t ec_gf8_mul_52_ops[] = { +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_COPY,   9,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR3,   3,  5,  8 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { 
EC_GF_OP_XOR2,   2,  9,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_52 = { +    10, +    { 2, 3, 1, 4, 6, 7, 0, 5, 8, 9, }, +    ec_gf8_mul_52_ops +}; + +static ec_gf_op_t ec_gf8_mul_53_ops[] = { +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_53 = { +    8, +    { 2, 0, 1, 4, 5, 6, 7, 3, }, +    ec_gf8_mul_53_ops +}; + +static ec_gf_op_t ec_gf8_mul_54_ops[] = { +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_54 = { +    8, +    { 7, 3, 0, 4, 2, 6, 5, 1, }, +    ec_gf8_mul_54_ops +}; + +static ec_gf_op_t ec_gf8_mul_55_ops[] = { +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { 
EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_55 = { +    8, +    { 1, 5, 6, 4, 3, 7, 2, 0, }, +    ec_gf8_mul_55_ops +}; + +static ec_gf_op_t ec_gf8_mul_56_ops[] = { +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_56 = { +    8, +    { 2, 3, 0, 4, 5, 6, 7, 1, }, +    ec_gf8_mul_56_ops +}; + +static ec_gf_op_t ec_gf8_mul_57_ops[] = { +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_57 = { +    8, +    { 2, 3, 0, 1, 4, 5, 6, 7, }, +    ec_gf8_mul_57_ops +}; + +static ec_gf_op_t ec_gf8_mul_58_ops[] = { +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { 
EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_58 = { +    8, +    { 4, 3, 2, 7, 0, 1, 5, 6, }, +    ec_gf8_mul_58_ops +}; + +static ec_gf_op_t ec_gf8_mul_59_ops[] = { +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_59 = { +    8, +    { 7, 3, 5, 6, 1, 2, 0, 4, }, +    ec_gf8_mul_59_ops +}; + +static ec_gf_op_t ec_gf8_mul_5A_ops[] = { +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_5A = { +    8, +    { 6, 7, 0, 1, 2, 3, 5, 4, }, +    ec_gf8_mul_5A_ops +}; + +static ec_gf_op_t ec_gf8_mul_5B_ops[] = { +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { 
EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_5B = { +    8, +    { 6, 0, 7, 5, 2, 1, 3, 4, }, +    ec_gf8_mul_5B_ops +}; + +static ec_gf_op_t ec_gf8_mul_5C_ops[] = { +    { EC_GF_OP_COPY,   8,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  8,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_5C = { +    9, +    { 7, 5, 2, 4, 1, 0, 6, 3, 8, }, +    ec_gf8_mul_5C_ops +}; + +static ec_gf_op_t ec_gf8_mul_5D_ops[] = { +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_5D = { +    8, +    { 1, 3, 5, 4, 6, 7, 2, 0, }, +    ec_gf8_mul_5D_ops +}; + +static ec_gf_op_t ec_gf8_mul_5E_ops[] = { +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { 
EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_5E = { +    8, +    { 4, 3, 6, 2, 5, 7, 0, 1, }, +    ec_gf8_mul_5E_ops +}; + +static ec_gf_op_t ec_gf8_mul_5F_ops[] = { +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_5F = { +    8, +    { 6, 1, 3, 4, 5, 7, 2, 0, }, +    ec_gf8_mul_5F_ops +}; + +static ec_gf_op_t ec_gf8_mul_60_ops[] = { +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_60 = { +    8, +    { 2, 3, 4, 7, 5, 6, 0, 1, }, +    ec_gf8_mul_60_ops +}; + +static ec_gf_op_t ec_gf8_mul_61_ops[] = { +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { 
EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_61 = { +    8, +    { 0, 5, 6, 7, 4, 2, 1, 3, }, +    ec_gf8_mul_61_ops +}; + +static ec_gf_op_t ec_gf8_mul_62_ops[] = { +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_62 = { +    8, +    { 2, 0, 3, 4, 5, 6, 7, 1, }, +    ec_gf8_mul_62_ops +}; + +static ec_gf_op_t ec_gf8_mul_63_ops[] = { +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_63 = { +    8, +    { 3, 4, 6, 5, 7, 0, 1, 2, }, +    ec_gf8_mul_63_ops +}; + +static ec_gf_op_t ec_gf8_mul_64_ops[] = { +    { 
EC_GF_OP_COPY,   8,  1,  0 }, +    { EC_GF_OP_XOR2,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   8,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_64 = { +    9, +    { 2, 3, 4, 6, 5, 7, 8, 1, 0, }, +    ec_gf8_mul_64_ops +}; + +static ec_gf_op_t ec_gf8_mul_65_ops[] = { +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_65 = { +    8, +    { 2, 5, 1, 3, 4, 0, 6, 7, }, +    ec_gf8_mul_65_ops +}; + +static ec_gf_op_t ec_gf8_mul_66_ops[] = { +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   
5,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_66 = { +    8, +    { 2, 3, 1, 4, 5, 7, 0, 6, }, +    ec_gf8_mul_66_ops +}; + +static ec_gf_op_t ec_gf8_mul_67_ops[] = { +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_67 = { +    8, +    { 2, 4, 5, 6, 7, 3, 1, 0, }, +    ec_gf8_mul_67_ops +}; + +static ec_gf_op_t ec_gf8_mul_68_ops[] = { +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_68 = { +    8, +    { 5, 7, 2, 3, 0, 6, 4, 1, }, +    ec_gf8_mul_68_ops +}; + +static ec_gf_op_t ec_gf8_mul_69_ops[] = { +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 
}, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_69 = { +    8, +    { 0, 1, 3, 2, 4, 5, 7, 6, }, +    ec_gf8_mul_69_ops +}; + +static ec_gf_op_t ec_gf8_mul_6A_ops[] = { +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_6A = { +    8, +    { 5, 7, 4, 6, 1, 2, 0, 3, }, +    ec_gf8_mul_6A_ops +}; + +static ec_gf_op_t ec_gf8_mul_6B_ops[] = { +    { EC_GF_OP_COPY,   8,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_6B = { +    9, +    { 6, 7, 2, 0, 3, 1, 5, 4, 8, }, +    ec_gf8_mul_6B_ops +}; + +static ec_gf_op_t ec_gf8_mul_6C_ops[] = { +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    
{ EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_6C = { +    8, +    { 5, 6, 7, 0, 1, 2, 3, 4, }, +    ec_gf8_mul_6C_ops +}; + +static ec_gf_op_t ec_gf8_mul_6D_ops[] = { +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR3,   8,  3,  4 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_6D = { +    9, +    { 3, 6, 7, 0, 4, 5, 1, 2, 8, }, +    ec_gf8_mul_6D_ops +}; + +static ec_gf_op_t ec_gf8_mul_6E_ops[] = { +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_6E = { +    8, +    { 5, 6, 3, 1, 7, 2, 0, 4, }, +    ec_gf8_mul_6E_ops +}; + +static ec_gf_op_t ec_gf8_mul_6F_ops[] = { +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { 
EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR3,   0,  8,  7 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_6F = { +    9, +    { 2, 6, 3, 7, 0, 1, 4, 5, 8, }, +    ec_gf8_mul_6F_ops +}; + +static ec_gf_op_t ec_gf8_mul_70_ops[] = { +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_70 = { +    8, +    { 3, 4, 5, 2, 6, 0, 1, 7, }, +    ec_gf8_mul_70_ops +}; + +static ec_gf_op_t ec_gf8_mul_71_ops[] = { +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_71 = { +    8, +    { 4, 7, 5, 3, 6, 0, 2, 1, }, +    ec_gf8_mul_71_ops +}; + +static ec_gf_op_t ec_gf8_mul_72_ops[] = { +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { 
EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_72 = { +    8, +    { 0, 5, 2, 7, 4, 1, 3, 6, }, +    ec_gf8_mul_72_ops +}; + +static ec_gf_op_t ec_gf8_mul_73_ops[] = { +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_73 = { +    8, +    { 6, 0, 1, 7, 4, 5, 2, 3, }, +    ec_gf8_mul_73_ops +}; + +static ec_gf_op_t ec_gf8_mul_74_ops[] = { +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_74 = { +    8, +    { 3, 2, 1, 0, 4, 5, 6, 7, }, +    ec_gf8_mul_74_ops +}; + +static ec_gf_op_t ec_gf8_mul_75_ops[] = { +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { 
EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_75 = { +    8, +    { 4, 5, 6, 7, 0, 1, 2, 3, }, +    ec_gf8_mul_75_ops +}; + +static ec_gf_op_t ec_gf8_mul_76_ops[] = { +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR3,   8,  6,  2 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_76 = { +    9, +    { 2, 3, 0, 6, 5, 1, 7, 8, 4, }, +    ec_gf8_mul_76_ops +}; + +static ec_gf_op_t ec_gf8_mul_77_ops[] = { +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_77 = { +    8, +    { 7, 4, 3, 6, 0, 1, 5, 2, }, +    ec_gf8_mul_77_ops +}; + +static ec_gf_op_t ec_gf8_mul_78_ops[] = { +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { 
EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR3,   8,  0,  2 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_78 = { +    9, +    { 4, 7, 3, 2, 5, 1, 6, 0, 8, }, +    ec_gf8_mul_78_ops +}; + +static ec_gf_op_t ec_gf8_mul_79_ops[] = { +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR3,   8,  4,  7 }, +    { EC_GF_OP_XOR2,   0,  8,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  8,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_79 = { +    9, +    { 4, 5, 7, 3, 1, 6, 2, 0, 8, }, +    ec_gf8_mul_79_ops +}; + +static ec_gf_op_t ec_gf8_mul_7A_ops[] = { +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_7A = { +    8, +    { 1, 2, 3, 4, 5, 6, 7, 0, }, +    ec_gf8_mul_7A_ops +}; + +static ec_gf_op_t ec_gf8_mul_7B_ops[] = { +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR3,   8,  5,  3 }, +    { EC_GF_OP_XOR2,   8,  0,  0 }, +    { 
EC_GF_OP_COPY,   9,  4,  0 }, +    { EC_GF_OP_XOR2,   8,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR3,   4,  1,  9 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_7B = { +    10, +    { 1, 2, 3, 4, 8, 5, 6, 0, 7, 9, }, +    ec_gf8_mul_7B_ops +}; + +static ec_gf_op_t ec_gf8_mul_7C_ops[] = { +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_7C = { +    8, +    { 2, 4, 1, 6, 3, 5, 7, 0, }, +    ec_gf8_mul_7C_ops +}; + +static ec_gf_op_t ec_gf8_mul_7D_ops[] = { +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_7D = { +    8, +    { 1, 0, 3, 5, 6, 7, 2, 4, }, +    ec_gf8_mul_7D_ops +}; + +static 
ec_gf_op_t ec_gf8_mul_7E_ops[] = { +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR3,   6,  2,  7 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_7E = { +    9, +    { 5, 1, 2, 0, 7, 3, 4, 6, 8, }, +    ec_gf8_mul_7E_ops +}; + +static ec_gf_op_t ec_gf8_mul_7F_ops[] = { +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR3,   9,  7,  5 }, +    { EC_GF_OP_XOR2,   2,  9,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  9,  0 }, +    { EC_GF_OP_XOR3,   9,  6,  4 }, +    { EC_GF_OP_XOR2,   7,  9,  0 }, +    { EC_GF_OP_XOR2,   3,  9,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_7F = { +    10, +    { 4, 1, 0, 5, 6, 7, 2, 3, 8, 9, }, +    ec_gf8_mul_7F_ops +}; + +static ec_gf_op_t ec_gf8_mul_80_ops[] = { +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { 
EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_80 = { +    8, +    { 7, 5, 6, 4, 1, 2, 3, 0, }, +    ec_gf8_mul_80_ops +}; + +static ec_gf_op_t ec_gf8_mul_81_ops[] = { +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_81 = { +    8, +    { 2, 7, 4, 1, 5, 6, 3, 0, }, +    ec_gf8_mul_81_ops +}; + +static ec_gf_op_t ec_gf8_mul_82_ops[] = { +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_COPY,   8,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR3,   5,  8,  7 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_82 = { +    9, +    { 6, 2, 7, 5, 1, 3, 4, 0, 8, }, +    ec_gf8_mul_82_ops +}; + +static ec_gf_op_t ec_gf8_mul_83_ops[] = { +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { 
EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_83 = { +    8, +    { 3, 5, 6, 7, 1, 2, 4, 0, }, +    ec_gf8_mul_83_ops +}; + +static ec_gf_op_t ec_gf8_mul_84_ops[] = { +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_84 = { +    8, +    { 7, 6, 0, 4, 1, 5, 3, 2, }, +    ec_gf8_mul_84_ops +}; + +static ec_gf_op_t ec_gf8_mul_85_ops[] = { +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_85 = { +    8, +    { 7, 6, 0, 3, 2, 4, 5, 1, }, +    ec_gf8_mul_85_ops +}; + +static ec_gf_op_t ec_gf8_mul_86_ops[] = { +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { 
EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_86 = { +    8, +    { 1, 2, 6, 4, 5, 7, 3, 0, }, +    ec_gf8_mul_86_ops +}; + +static ec_gf_op_t ec_gf8_mul_87_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_COPY,   8,  1,  0 }, +    { EC_GF_OP_XOR2,   8,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR3,   5,  8,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_87 = { +    9, +    { 1, 2, 3, 4, 5, 7, 6, 0, 8, }, +    ec_gf8_mul_87_ops +}; + +static ec_gf_op_t ec_gf8_mul_88_ops[] = { +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_88 = { +    8, +    { 6, 7, 3, 1, 2, 4, 5, 0, }, +    ec_gf8_mul_88_ops +}; + +static ec_gf_op_t ec_gf8_mul_89_ops[] = { +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { 
EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR3,   8,  5,  2 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_89 = { +    9, +    { 2, 1, 6, 5, 7, 3, 4, 0, 8, }, +    ec_gf8_mul_89_ops +}; + +static ec_gf_op_t ec_gf8_mul_8A_ops[] = { +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_8A = { +    8, +    { 1, 2, 3, 0, 6, 7, 4, 5, }, +    ec_gf8_mul_8A_ops +}; + +static ec_gf_op_t ec_gf8_mul_8B_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_8B = { +    8, +    { 6, 1, 2, 3, 5, 7, 4, 0, }, +    ec_gf8_mul_8B_ops +}; + +static ec_gf_op_t ec_gf8_mul_8C_ops[] = { +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { 
EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_8C = { +    8, +    { 1, 2, 0, 7, 3, 4, 5, 6, }, +    ec_gf8_mul_8C_ops +}; + +static ec_gf_op_t ec_gf8_mul_8D_ops[] = { +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_8D = { +    8, +    { 7, 1, 3, 2, 4, 5, 0, 6, }, +    ec_gf8_mul_8D_ops +}; + +static ec_gf_op_t ec_gf8_mul_8E_ops[] = { +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_8E = { +    8, +    { 1, 2, 3, 4, 5, 6, 7, 0, }, +    ec_gf8_mul_8E_ops +}; + +static ec_gf_op_t ec_gf8_mul_8F_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_8F = { +    8, +    { 1, 2, 3, 4, 5, 6, 7, 0, }, +    ec_gf8_mul_8F_ops +}; + +static 
ec_gf_op_t ec_gf8_mul_90_ops[] = { +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_90 = { +    8, +    { 4, 5, 6, 7, 0, 1, 3, 2, }, +    ec_gf8_mul_90_ops +}; + +static ec_gf_op_t ec_gf8_mul_91_ops[] = { +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_COPY,   9,  1,  0 }, +    { EC_GF_OP_COPY,   8,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  9,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR3,   5,  8,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_91 = { +    10, +    { 2, 3, 1, 4, 0, 6, 7, 5, 8, 9, }, +    ec_gf8_mul_91_ops +}; + +static ec_gf_op_t ec_gf8_mul_92_ops[] = { +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { 
EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_92 = { +    8, +    { 6, 7, 0, 1, 2, 3, 5, 4, }, +    ec_gf8_mul_92_ops +}; + +static ec_gf_op_t ec_gf8_mul_93_ops[] = { +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_93 = { +    8, +    { 6, 4, 5, 1, 7, 2, 3, 0, }, +    ec_gf8_mul_93_ops +}; + +static ec_gf_op_t ec_gf8_mul_94_ops[] = { +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_94 = { +    8, +    { 7, 5, 0, 2, 6, 1, 3, 4, }, +    ec_gf8_mul_94_ops +}; + +static ec_gf_op_t ec_gf8_mul_95_ops[] = { +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { 
EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_95 = { +    8, +    { 7, 6, 1, 3, 0, 4, 5, 2, }, +    ec_gf8_mul_95_ops +}; + +static ec_gf_op_t ec_gf8_mul_96_ops[] = { +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR3,   8,  0,  4 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   8,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_96 = { +    9, +    { 4, 0, 1, 6, 7, 2, 3, 5, 8, }, +    ec_gf8_mul_96_ops +}; + +static ec_gf_op_t ec_gf8_mul_97_ops[] = { +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_COPY,   8,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   8,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_97 = { +    9, +    { 4, 5, 3, 6, 7, 1, 2, 0, 8, }, +    ec_gf8_mul_97_ops +}; + +static ec_gf_op_t ec_gf8_mul_98_ops[] = { +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { 
EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_98 = { +    8, +    { 4, 2, 3, 6, 7, 5, 1, 0, }, +    ec_gf8_mul_98_ops +}; + +static ec_gf_op_t ec_gf8_mul_99_ops[] = { +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_99 = { +    8, +    { 6, 5, 3, 7, 0, 1, 4, 2, }, +    ec_gf8_mul_99_ops +}; + +static ec_gf_op_t ec_gf8_mul_9A_ops[] = { +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR3,   8,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_9A = { +    9, +    { 6, 3, 4, 0, 5, 1, 2, 7, 8, }, +    ec_gf8_mul_9A_ops +}; + +static ec_gf_op_t ec_gf8_mul_9B_ops[] = { +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_COPY,   9,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { 
EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR3,   8,  3,  2 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  9,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_9B = { +    10, +    { 4, 5, 8, 6, 7, 1, 2, 0, 3, 9, }, +    ec_gf8_mul_9B_ops +}; + +static ec_gf_op_t ec_gf8_mul_9C_ops[] = { +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_9C = { +    8, +    { 3, 2, 1, 0, 4, 5, 6, 7, }, +    ec_gf8_mul_9C_ops +}; + +static ec_gf_op_t ec_gf8_mul_9D_ops[] = { +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_9D = { +    8, +    { 0, 1, 2, 3, 7, 4, 5, 6, }, +    ec_gf8_mul_9D_ops +}; + +static ec_gf_op_t ec_gf8_mul_9E_ops[] = { +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_COPY,   8,  7,  0 }, +    { EC_GF_OP_XOR2,   8,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { 
EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_9E = { +    9, +    { 4, 5, 3, 8, 6, 0, 2, 7, 1, }, +    ec_gf8_mul_9E_ops +}; + +static ec_gf_op_t ec_gf8_mul_9F_ops[] = { +    { EC_GF_OP_XOR3,   8,  1,  2 }, +    { EC_GF_OP_XOR2,   8,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  8,  0 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_9F = { +    9, +    { 4, 5, 6, 7, 0, 1, 2, 3, 8, }, +    ec_gf8_mul_9F_ops +}; + +static ec_gf_op_t ec_gf8_mul_A0_ops[] = { +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A0 = { +    8, +    { 3, 1, 6, 7, 5, 2, 4, 0, }, +    ec_gf8_mul_A0_ops +}; + +static ec_gf_op_t ec_gf8_mul_A1_ops[] = { +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { 
EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR3,   8,  0,  6 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A1 = { +    9, +    { 7, 4, 1, 5, 6, 0, 2, 3, 8, }, +    ec_gf8_mul_A1_ops +}; + +static ec_gf_op_t ec_gf8_mul_A2_ops[] = { +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A2 = { +    8, +    { 7, 0, 6, 3, 2, 1, 4, 5, }, +    ec_gf8_mul_A2_ops +}; + +static ec_gf_op_t ec_gf8_mul_A3_ops[] = { +    { EC_GF_OP_COPY,   8,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A3 = { +    9, +    { 3, 7, 2, 6, 1, 4, 0, 5, 8, }, +    ec_gf8_mul_A3_ops +}; + +static ec_gf_op_t ec_gf8_mul_A4_ops[] = { +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { 
EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A4 = { +    8, +    { 5, 6, 7, 2, 4, 3, 0, 1, }, +    ec_gf8_mul_A4_ops +}; + +static ec_gf_op_t ec_gf8_mul_A5_ops[] = { +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR3,   8,  5,  6 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A5 = { +    9, +    { 1, 4, 2, 5, 6, 7, 3, 0, 8, }, +    ec_gf8_mul_A5_ops +}; + +static ec_gf_op_t ec_gf8_mul_A6_ops[] = { +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A6 = { +    8, +    { 1, 2, 0, 3, 4, 5, 6, 7, }, +    ec_gf8_mul_A6_ops +}; + +static ec_gf_op_t ec_gf8_mul_A7_ops[] = { +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { 
EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A7 = { +    8, +    { 0, 1, 2, 5, 6, 7, 3, 4, }, +    ec_gf8_mul_A7_ops +}; + +static ec_gf_op_t ec_gf8_mul_A8_ops[] = { +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   8,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_COPY,   9,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   8,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  9,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A8 = { +    10, +    { 1, 7, 5, 8, 6, 3, 4, 0, 2, 9, }, +    ec_gf8_mul_A8_ops +}; + +static ec_gf_op_t ec_gf8_mul_A9_ops[] = { +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_A9 = { +    8, +    { 3, 7, 6, 1, 2, 0, 4, 5, }, +    ec_gf8_mul_A9_ops +}; + +static ec_gf_op_t ec_gf8_mul_AA_ops[] = { +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { 
EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_AA = { +    8, +    { 0, 4, 5, 3, 6, 7, 1, 2, }, +    ec_gf8_mul_AA_ops +}; + +static ec_gf_op_t ec_gf8_mul_AB_ops[] = { +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_COPY,   9,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   8,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR3,   3,  9,  7 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_AB = { +    10, +    { 2, 3, 8, 0, 5, 6, 1, 4, 7, 9, }, +    ec_gf8_mul_AB_ops +}; + +static ec_gf_op_t ec_gf8_mul_AC_ops[] = { +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_AC = { +    8, +    { 3, 2, 1, 0, 4, 5, 6, 7, }, +    ec_gf8_mul_AC_ops +}; + +static ec_gf_op_t ec_gf8_mul_AD_ops[] = { +    { 
EC_GF_OP_XOR3,   8,  1,  2 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_AD = { +    9, +    { 3, 4, 5, 6, 7, 0, 1, 2, 8, }, +    ec_gf8_mul_AD_ops +}; + +static ec_gf_op_t ec_gf8_mul_AE_ops[] = { +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_COPY,   8,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_AE = { +    9, +    { 7, 0, 5, 6, 3, 4, 1, 2, 8, }, +    ec_gf8_mul_AE_ops +}; + +static ec_gf_op_t ec_gf8_mul_AF_ops[] = { +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_AF = { +    8, +    { 0, 1, 2, 7, 3, 4, 5, 6, }, +    ec_gf8_mul_AF_ops +}; + +static ec_gf_op_t ec_gf8_mul_B0_ops[] = { +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { 
EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B0 = { +    8, +    { 4, 0, 7, 2, 3, 1, 6, 5, }, +    ec_gf8_mul_B0_ops +}; + +static ec_gf_op_t ec_gf8_mul_B1_ops[] = { +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_COPY,   8,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR3,   5,  8,  1 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B1 = { +    9, +    { 2, 6, 4, 7, 0, 1, 3, 5, 8, }, +    ec_gf8_mul_B1_ops +}; + +static ec_gf_op_t ec_gf8_mul_B2_ops[] = { +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR3,   8,  4,  5 }, +    { EC_GF_OP_XOR2,   2,  8,  0 }, +    { EC_GF_OP_XOR2,   8,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B2 = { +    9, +    { 0, 7, 4, 5, 6, 1, 2, 3, 8, }, +    ec_gf8_mul_B2_ops +}; + +static ec_gf_op_t ec_gf8_mul_B3_ops[] = { +    { 
EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_COPY,   9,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR3,   8,  6,  4 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   8,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_XOR2,   0,  8,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR3,   1,  9,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B3 = { +    10, +    { 2, 3, 4, 5, 1, 6, 0, 7, 8, 9, }, +    ec_gf8_mul_B3_ops +}; + +static ec_gf_op_t ec_gf8_mul_B4_ops[] = { +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B4 = { +    8, +    { 5, 6, 7, 0, 1, 2, 3, 4, }, +    ec_gf8_mul_B4_ops +}; + +static ec_gf_op_t ec_gf8_mul_B5_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_COPY,   8,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR3,   4,  8,  3 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_END, 
   0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B5 = { +    9, +    { 3, 4, 0, 7, 1, 5, 6, 2, 8, }, +    ec_gf8_mul_B5_ops +}; + +static ec_gf_op_t ec_gf8_mul_B6_ops[] = { +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B6 = { +    8, +    { 5, 3, 6, 4, 7, 0, 1, 2, }, +    ec_gf8_mul_B6_ops +}; + +static ec_gf_op_t ec_gf8_mul_B7_ops[] = { +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B7 = { +    8, +    { 5, 0, 1, 4, 2, 6, 7, 3, }, +    ec_gf8_mul_B7_ops +}; + +static ec_gf_op_t ec_gf8_mul_B8_ops[] = { +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  
3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B8 = { +    8, +    { 6, 4, 5, 1, 2, 0, 7, 3, }, +    ec_gf8_mul_B8_ops +}; + +static ec_gf_op_t ec_gf8_mul_B9_ops[] = { +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR3,   0,  8,  2 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_B9 = { +    9, +    { 6, 7, 0, 2, 1, 4, 5, 3, 8, }, +    ec_gf8_mul_B9_ops +}; + +static ec_gf_op_t ec_gf8_mul_BA_ops[] = { +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_BA = { +    8, +    { 1, 2, 4, 3, 5, 6, 0, 7, }, +    ec_gf8_mul_BA_ops +}; + +static ec_gf_op_t ec_gf8_mul_BB_ops[] = { +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_COPY,   8,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   8,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   8,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  8,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, 
+    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_BB = { +    9, +    { 7, 2, 1, 8, 3, 5, 6, 4, 0, }, +    ec_gf8_mul_BB_ops +}; + +static ec_gf_op_t ec_gf8_mul_BC_ops[] = { +    { EC_GF_OP_COPY,   8,  1,  0 }, +    { EC_GF_OP_XOR2,   8,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_XOR3,   2,  8,  4 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_BC = { +    9, +    { 2, 6, 3, 4, 5, 1, 7, 0, 8, }, +    ec_gf8_mul_BC_ops +}; + +static ec_gf_op_t ec_gf8_mul_BD_ops[] = { +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_BD = { +    8, +    { 4, 5, 0, 2, 7, 1, 6, 3, }, +    ec_gf8_mul_BD_ops +}; + +static ec_gf_op_t ec_gf8_mul_BE_ops[] = { +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    
{ EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_BE = { +    8, +    { 0, 6, 7, 4, 5, 1, 3, 2, }, +    ec_gf8_mul_BE_ops +}; + +static ec_gf_op_t ec_gf8_mul_BF_ops[] = { +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_BF = { +    8, +    { 5, 6, 1, 7, 3, 0, 2, 4, }, +    ec_gf8_mul_BF_ops +}; + +static ec_gf_op_t ec_gf8_mul_C0_ops[] = { +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C0 = { +    8, +    { 1, 2, 3, 4, 7, 5, 6, 0, }, +    ec_gf8_mul_C0_ops +}; + +static ec_gf_op_t ec_gf8_mul_C1_ops[] = { +    { EC_GF_OP_XOR3,   8,  1,  2 }, +    { EC_GF_OP_XOR2,   8,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { 
EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  8,  0 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C1 = { +    9, +    { 5, 6, 7, 4, 1, 2, 3, 0, 8, }, +    ec_gf8_mul_C1_ops +}; + +static ec_gf_op_t ec_gf8_mul_C2_ops[] = { +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C2 = { +    8, +    { 7, 6, 3, 0, 1, 4, 5, 2, }, +    ec_gf8_mul_C2_ops +}; + +static ec_gf_op_t ec_gf8_mul_C3_ops[] = { +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR3,   0,  2,  6 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR3,   9,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  9,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C3 = { +    10, +    { 5, 6, 4, 7, 1, 2, 3, 0, 8, 9, }, +    ec_gf8_mul_C3_ops +}; + +static ec_gf_op_t ec_gf8_mul_C4_ops[] = { +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { 
EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C4 = { +    8, +    { 0, 2, 1, 3, 4, 5, 6, 7, }, +    ec_gf8_mul_C4_ops +}; + +static ec_gf_op_t ec_gf8_mul_C5_ops[] = { +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C5 = { +    8, +    { 4, 3, 5, 7, 6, 2, 0, 1, }, +    ec_gf8_mul_C5_ops +}; + +static ec_gf_op_t ec_gf8_mul_C6_ops[] = { +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_COPY,   8,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR3,   9,  5,  4 }, +    { EC_GF_OP_XOR2,   6,  9,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  9,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C6 = { +    10, +    { 6, 3, 
0, 4, 5, 7, 2, 1, 8, 9, }, +    ec_gf8_mul_C6_ops +}; + +static ec_gf_op_t ec_gf8_mul_C7_ops[] = { +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C7 = { +    8, +    { 7, 0, 6, 2, 5, 3, 4, 1, }, +    ec_gf8_mul_C7_ops +}; + +static ec_gf_op_t ec_gf8_mul_C8_ops[] = { +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C8 = { +    8, +    { 1, 3, 2, 4, 6, 7, 5, 0, }, +    ec_gf8_mul_C8_ops +}; + +static ec_gf_op_t ec_gf8_mul_C9_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_C9 = { +    8, +    { 2, 3, 4, 5, 6, 7, 0, 1, }, +    ec_gf8_mul_C9_ops +}; + +static ec_gf_op_t ec_gf8_mul_CA_ops[] = { +    { EC_GF_OP_XOR2,   
6,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_CA = { +    8, +    { 1, 2, 5, 7, 3, 4, 0, 6, }, +    ec_gf8_mul_CA_ops +}; + +static ec_gf_op_t ec_gf8_mul_CB_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_CB = { +    8, +    { 2, 3, 4, 5, 7, 6, 0, 1, }, +    ec_gf8_mul_CB_ops +}; + +static ec_gf_op_t ec_gf8_mul_CC_ops[] = { +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_CC = { + 
   8, +    { 2, 7, 1, 0, 5, 6, 3, 4, }, +    ec_gf8_mul_CC_ops +}; + +static ec_gf_op_t ec_gf8_mul_CD_ops[] = { +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_CD = { +    8, +    { 0, 6, 1, 2, 7, 3, 4, 5, }, +    ec_gf8_mul_CD_ops +}; + +static ec_gf_op_t ec_gf8_mul_CE_ops[] = { +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_COPY,   8,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR3,   3,  6,  8 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR3,   8,  2,  3 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_CE = { +    9, +    { 5, 7, 3, 0, 2, 6, 4, 1, 8, }, +    ec_gf8_mul_CE_ops +}; + +static ec_gf_op_t ec_gf8_mul_CF_ops[] = { +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { 
EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_CF = { +    8, +    { 3, 6, 7, 0, 2, 4, 5, 1, }, +    ec_gf8_mul_CF_ops +}; + +static ec_gf_op_t ec_gf8_mul_D0_ops[] = { +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D0 = { +    8, +    { 5, 6, 7, 2, 0, 3, 1, 4, }, +    ec_gf8_mul_D0_ops +}; + +static ec_gf_op_t ec_gf8_mul_D1_ops[] = { +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR3,   8,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D1 = { +    9, +    { 5, 6, 3, 2, 0, 7, 4, 1, 8, }, +    ec_gf8_mul_D1_ops +}; + +static ec_gf_op_t ec_gf8_mul_D2_ops[] = { +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { 
EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D2 = { +    8, +    { 7, 0, 2, 1, 3, 4, 6, 5, }, +    ec_gf8_mul_D2_ops +}; + +static ec_gf_op_t ec_gf8_mul_D3_ops[] = { +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_COPY,   8,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   8,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D3 = { +    9, +    { 0, 3, 2, 8, 4, 6, 7, 1, 5, }, +    ec_gf8_mul_D3_ops +}; + +static ec_gf_op_t ec_gf8_mul_D4_ops[] = { +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_COPY,   8,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR3,   1,  7,  8 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D4 = { +    9, +    { 4, 1, 7, 5, 0, 6, 3, 2, 8, }, +    ec_gf8_mul_D4_ops +}; + +static ec_gf_op_t ec_gf8_mul_D5_ops[] = { +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { 
EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D5 = { +    8, +    { 6, 7, 4, 5, 2, 3, 1, 0, }, +    ec_gf8_mul_D5_ops +}; + +static ec_gf_op_t ec_gf8_mul_D6_ops[] = { +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D6 = { +    9, +    { 0, 6, 2, 7, 1, 3, 4, 5, 8, }, +    ec_gf8_mul_D6_ops +}; + +static ec_gf_op_t ec_gf8_mul_D7_ops[] = { +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR3,   8,  3,  5 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  8,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR3,   6,  7,  8 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D7 = { +    9, +    { 3, 4, 6, 5, 0, 7, 1, 2, 8, }, +    ec_gf8_mul_D7_ops +}; + +static ec_gf_op_t ec_gf8_mul_D8_ops[] = { +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { 
EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D8 = { +    8, +    { 4, 5, 6, 7, 0, 1, 2, 3, }, +    ec_gf8_mul_D8_ops +}; + +static ec_gf_op_t ec_gf8_mul_D9_ops[] = { +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_D9 = { +    8, +    { 1, 2, 6, 7, 4, 5, 0, 3, }, +    ec_gf8_mul_D9_ops +}; + +static ec_gf_op_t ec_gf8_mul_DA_ops[] = { +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR3,   8,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_DA = { +    9, +    { 2, 5, 7, 1, 0, 4, 3, 6, 8, }, +    ec_gf8_mul_DA_ops +}; + +static ec_gf_op_t ec_gf8_mul_DB_ops[] = { +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   8,  4,  0 }, +    { 
EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_DB = { +    9, +    { 7, 5, 6, 2, 3, 4, 1, 0, 8, }, +    ec_gf8_mul_DB_ops +}; + +static ec_gf_op_t ec_gf8_mul_DC_ops[] = { +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_DC = { +    8, +    { 4, 5, 2, 6, 7, 1, 0, 3, }, +    ec_gf8_mul_DC_ops +}; + +static ec_gf_op_t ec_gf8_mul_DD_ops[] = { +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_DD = { +    8, +    { 1, 2, 3, 6, 7, 0, 4, 5, }, +    ec_gf8_mul_DD_ops +}; + +static ec_gf_op_t ec_gf8_mul_DE_ops[] = { +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { 
EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_DE = { +    8, +    { 0, 5, 2, 6, 7, 1, 3, 4, }, +    ec_gf8_mul_DE_ops +}; + +static ec_gf_op_t ec_gf8_mul_DF_ops[] = { +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   8,  3,  0 }, +    { EC_GF_OP_COPY,   9,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   8,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR3,   1,  9,  2 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_DF = { +    10, +    { 7, 2, 8, 4, 3, 1, 0, 6, 5, 9, }, +    ec_gf8_mul_DF_ops +}; + +static ec_gf_op_t ec_gf8_mul_E0_ops[] = { +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E0 = { +    8, +    { 2, 3, 4, 7, 5, 6, 0, 1, }, +    ec_gf8_mul_E0_ops +}; + +static ec_gf_op_t ec_gf8_mul_E1_ops[] = { +    { EC_GF_OP_COPY,   8,  1,  0 }, +    { 
EC_GF_OP_XOR2,   8,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_XOR3,   9,  5,  3 }, +    { EC_GF_OP_XOR2,   0,  9,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  9,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  8,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E1 = { +    10, +    { 0, 7, 1, 3, 4, 5, 6, 2, 8, 9, }, +    ec_gf8_mul_E1_ops +}; + +static ec_gf_op_t ec_gf8_mul_E2_ops[] = { +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E2 = { +    8, +    { 2, 3, 7, 1, 5, 6, 0, 4, }, +    ec_gf8_mul_E2_ops +}; + +static ec_gf_op_t ec_gf8_mul_E3_ops[] = { +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR3,   8,  2,  7 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  8,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR3,   6,  8,  4 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E3 = { +    9, +    { 
5, 4, 7, 2, 1, 3, 6, 0, 8, }, +    ec_gf8_mul_E3_ops +}; + +static ec_gf_op_t ec_gf8_mul_E4_ops[] = { +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E4 = { +    8, +    { 7, 0, 1, 6, 3, 4, 2, 5, }, +    ec_gf8_mul_E4_ops +}; + +static ec_gf_op_t ec_gf8_mul_E5_ops[] = { +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E5 = { +    9, +    { 4, 5, 3, 6, 7, 1, 0, 2, 8, }, +    ec_gf8_mul_E5_ops +}; + +static ec_gf_op_t ec_gf8_mul_E6_ops[] = { +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t 
ec_gf8_mul_E6 = { +    8, +    { 5, 4, 3, 6, 7, 0, 1, 2, }, +    ec_gf8_mul_E6_ops +}; + +static ec_gf_op_t ec_gf8_mul_E7_ops[] = { +    { EC_GF_OP_COPY,   8,  6,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR3,   9,  0,  6 }, +    { EC_GF_OP_XOR2,   4,  9,  0 }, +    { EC_GF_OP_XOR2,   5,  9,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E7 = { +    10, +    { 1, 4, 3, 6, 7, 5, 2, 0, 8, 9, }, +    ec_gf8_mul_E7_ops +}; + +static ec_gf_op_t ec_gf8_mul_E8_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E8 = { +    8, +    { 1, 4, 2, 7, 3, 0, 5, 6, }, +    ec_gf8_mul_E8_ops +}; + +static ec_gf_op_t ec_gf8_mul_E9_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_COPY,   8,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  
0,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR3,   1,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_E9 = { +    9, +    { 6, 2, 0, 3, 4, 1, 5, 7, 8, }, +    ec_gf8_mul_E9_ops +}; + +static ec_gf_op_t ec_gf8_mul_EA_ops[] = { +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_EA = { +    8, +    { 3, 4, 5, 6, 7, 0, 1, 2, }, +    ec_gf8_mul_EA_ops +}; + +static ec_gf_op_t ec_gf8_mul_EB_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_EB = { +    8, +    { 3, 4, 5, 6, 7, 0, 1, 2, }, +    ec_gf8_mul_EB_ops +}; + +static ec_gf_op_t ec_gf8_mul_EC_ops[] = { +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR3,   8,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, 
+    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_EC = { +    9, +    { 7, 4, 3, 0, 2, 5, 1, 6, 8, }, +    ec_gf8_mul_EC_ops +}; + +static ec_gf_op_t ec_gf8_mul_ED_ops[] = { +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_ED = { +    8, +    { 5, 6, 7, 0, 1, 4, 3, 2, }, +    ec_gf8_mul_ED_ops +}; + +static ec_gf_op_t ec_gf8_mul_EE_ops[] = { +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR3,   8,  2,  3 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   8,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_EE = { +    9, +    { 6, 4, 5, 7, 2, 3, 0, 1, 8, }, +    ec_gf8_mul_EE_ops +}; + +static ec_gf_op_t ec_gf8_mul_EF_ops[] = { +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_COPY,   8,  0,  0 }, +    { EC_GF_OP_XOR2,   8,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   6,  8,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    
{ EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_EF = { +    9, +    { 6, 4, 5, 7, 2, 0, 3, 1, 8, }, +    ec_gf8_mul_EF_ops +}; + +static ec_gf_op_t ec_gf8_mul_F0_ops[] = { +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR3,   8,  3,  6 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_XOR2,   8,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   1,  8,  0 }, +    { EC_GF_OP_XOR2,   0,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F0 = { +    9, +    { 3, 4, 6, 1, 2, 0, 5, 7, 8, }, +    ec_gf8_mul_F0_ops +}; + +static ec_gf_op_t ec_gf8_mul_F1_ops[] = { +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_COPY,   8,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_COPY,   9,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   9,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  2,  0 }, +    { EC_GF_OP_XOR2,   7,  9,  0 }, +    { EC_GF_OP_XOR2,   4,  9,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR3,   9,  8,  7 }, +    { EC_GF_OP_XOR2,   1,  9,  0 }, +    { EC_GF_OP_XOR2,   5,  9,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F1 = { +    10, +    { 7, 2, 6, 3, 5, 1, 4, 0, 8, 9, }, +    ec_gf8_mul_F1_ops +}; + +static ec_gf_op_t ec_gf8_mul_F2_ops[] = { +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +   
 { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_XOR3,   8,  6,  4 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  8,  0 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F2 = { +    9, +    { 1, 0, 6, 7, 4, 5, 2, 3, 8, }, +    ec_gf8_mul_F2_ops +}; + +static ec_gf_op_t ec_gf8_mul_F3_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F3 = { +    8, +    { 5, 6, 7, 0, 1, 2, 3, 4, }, +    ec_gf8_mul_F3_ops +}; + +static ec_gf_op_t ec_gf8_mul_F4_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F4 = { +    8, +    { 0, 1, 2, 3, 4, 5, 6, 7, }, +    ec_gf8_mul_F4_ops +}; + +static ec_gf_op_t ec_gf8_mul_F5_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { 
EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F5 = { +    8, +    { 7, 0, 1, 2, 3, 4, 5, 6, }, +    ec_gf8_mul_F5_ops +}; + +static ec_gf_op_t ec_gf8_mul_F6_ops[] = { +    { EC_GF_OP_XOR2,   3,  1,  0 }, +    { EC_GF_OP_COPY,   8,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_COPY,   9,  3,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   9,  4,  0 }, +    { EC_GF_OP_XOR2,   4,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  9,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR3,   7,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F6 = { +    10, +    { 0, 6, 2, 7, 4, 3, 5, 9, 1, 8, }, +    ec_gf8_mul_F6_ops +}; + +static ec_gf_op_t ec_gf8_mul_F7_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F7 = { +    8, +    { 6, 7, 0, 1, 2, 3, 4, 5, }, +    ec_gf8_mul_F7_ops +}; + +static ec_gf_op_t ec_gf8_mul_F8_ops[] = { +    { EC_GF_OP_XOR2,   4,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  5,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { 
EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F8 = { +    8, +    { 6, 2, 0, 1, 4, 5, 3, 7, }, +    ec_gf8_mul_F8_ops +}; + +static ec_gf_op_t ec_gf8_mul_F9_ops[] = { +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  4,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR3,   8,  7,  1 }, +    { EC_GF_OP_XOR2,   1,  3,  0 }, +    { EC_GF_OP_XOR2,   4,  8,  0 }, +    { EC_GF_OP_XOR2,   5,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_F9 = { +    9, +    { 4, 1, 7, 6, 0, 3, 5, 2, 8, }, +    ec_gf8_mul_F9_ops +}; + +static ec_gf_op_t ec_gf8_mul_FA_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  2,  0 }, +    { EC_GF_OP_XOR2,   1,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_FA = { +    8, +    { 0, 1, 2, 4, 5, 6, 7, 3, }, +    ec_gf8_mul_FA_ops +}; + +static ec_gf_op_t ec_gf8_mul_FB_ops[] = { +    { EC_GF_OP_XOR2,   1,  0,  0 }, +    { EC_GF_OP_XOR2,   2,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   3,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  7,  0 }, +    { EC_GF_OP_XOR2,   2,  7,  0 }, +    { 
EC_GF_OP_XOR2,   1,  6,  0 }, +    { EC_GF_OP_XOR2,   7,  6,  0 }, +    { EC_GF_OP_XOR2,   4,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  5,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_FB = { +    8, +    { 4, 5, 6, 7, 0, 1, 2, 3, }, +    ec_gf8_mul_FB_ops +}; + +static ec_gf_op_t ec_gf8_mul_FC_ops[] = { +    { EC_GF_OP_XOR2,   7,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  1,  0 }, +    { EC_GF_OP_COPY,   9,  3,  0 }, +    { EC_GF_OP_XOR3,   8,  5,  7 }, +    { EC_GF_OP_XOR2,   3,  6,  0 }, +    { EC_GF_OP_XOR2,   8,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  8,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   3,  4,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   6,  0,  0 }, +    { EC_GF_OP_XOR3,   0,  9,  2 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_FC = { +    10, +    { 5, 6, 3, 7, 1, 8, 0, 4, 2, 9, }, +    ec_gf8_mul_FC_ops +}; + +static ec_gf_op_t ec_gf8_mul_FD_ops[] = { +    { EC_GF_OP_XOR2,   7,  1,  0 }, +    { EC_GF_OP_COPY,   8,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  0,  0 }, +    { EC_GF_OP_XOR2,   7,  5,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  5,  0 }, +    { EC_GF_OP_XOR2,   1,  2,  0 }, +    { EC_GF_OP_XOR2,   0,  1,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR3,   1,  8,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_FD = { +    9, +    { 5, 3, 7, 6, 1, 2, 4, 0, 8, }, +    ec_gf8_mul_FD_ops +}; + +static ec_gf_op_t ec_gf8_mul_FE_ops[] = { +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_COPY,   8,  2,  0 }, +    { 
EC_GF_OP_XOR2,   2,  4,  0 }, +    { EC_GF_OP_XOR2,   6,  2,  0 }, +    { EC_GF_OP_XOR2,   8,  5,  0 }, +    { EC_GF_OP_XOR2,   5,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  1,  0 }, +    { EC_GF_OP_XOR2,   0,  6,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   7,  8,  0 }, +    { EC_GF_OP_XOR2,   3,  0,  0 }, +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR2,   0,  4,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_FE = { +    9, +    { 3, 4, 8, 2, 5, 0, 6, 1, 7, }, +    ec_gf8_mul_FE_ops +}; + +static ec_gf_op_t ec_gf8_mul_FF_ops[] = { +    { EC_GF_OP_XOR2,   4,  7,  0 }, +    { EC_GF_OP_COPY,   9,  0,  0 }, +    { EC_GF_OP_COPY,   8,  4,  0 }, +    { EC_GF_OP_XOR2,   9,  1,  0 }, +    { EC_GF_OP_XOR2,   4,  2,  0 }, +    { EC_GF_OP_XOR2,   9,  4,  0 }, +    { EC_GF_OP_XOR2,   0,  5,  0 }, +    { EC_GF_OP_XOR2,   2,  0,  0 }, +    { EC_GF_OP_XOR2,   3,  9,  0 }, +    { EC_GF_OP_XOR2,   7,  3,  0 }, +    { EC_GF_OP_XOR2,   2,  6,  0 }, +    { EC_GF_OP_XOR2,   5,  3,  0 }, +    { EC_GF_OP_XOR2,   6,  7,  0 }, +    { EC_GF_OP_XOR2,   1,  7,  0 }, +    { EC_GF_OP_XOR3,   3,  8,  5 }, +    { EC_GF_OP_XOR2,   4,  6,  0 }, +    { EC_GF_OP_END,    0,  0,  0 } +}; + +static ec_gf_mul_t ec_gf8_mul_FF = { +    10, +    { 6, 5, 0, 1, 2, 4, 9, 3, 7, 8, }, +    ec_gf8_mul_FF_ops +}; + +ec_gf_mul_t *ec_gf8_mul[] = { +    &ec_gf8_mul_00, &ec_gf8_mul_01, &ec_gf8_mul_02, &ec_gf8_mul_03, +    &ec_gf8_mul_04, &ec_gf8_mul_05, &ec_gf8_mul_06, &ec_gf8_mul_07, +    &ec_gf8_mul_08, &ec_gf8_mul_09, &ec_gf8_mul_0A, &ec_gf8_mul_0B, +    &ec_gf8_mul_0C, &ec_gf8_mul_0D, &ec_gf8_mul_0E, &ec_gf8_mul_0F, +    &ec_gf8_mul_10, &ec_gf8_mul_11, &ec_gf8_mul_12, &ec_gf8_mul_13, +    &ec_gf8_mul_14, &ec_gf8_mul_15, &ec_gf8_mul_16, &ec_gf8_mul_17, +    &ec_gf8_mul_18, &ec_gf8_mul_19, &ec_gf8_mul_1A, &ec_gf8_mul_1B, +    &ec_gf8_mul_1C, &ec_gf8_mul_1D, &ec_gf8_mul_1E, 
&ec_gf8_mul_1F, +    &ec_gf8_mul_20, &ec_gf8_mul_21, &ec_gf8_mul_22, &ec_gf8_mul_23, +    &ec_gf8_mul_24, &ec_gf8_mul_25, &ec_gf8_mul_26, &ec_gf8_mul_27, +    &ec_gf8_mul_28, &ec_gf8_mul_29, &ec_gf8_mul_2A, &ec_gf8_mul_2B, +    &ec_gf8_mul_2C, &ec_gf8_mul_2D, &ec_gf8_mul_2E, &ec_gf8_mul_2F, +    &ec_gf8_mul_30, &ec_gf8_mul_31, &ec_gf8_mul_32, &ec_gf8_mul_33, +    &ec_gf8_mul_34, &ec_gf8_mul_35, &ec_gf8_mul_36, &ec_gf8_mul_37, +    &ec_gf8_mul_38, &ec_gf8_mul_39, &ec_gf8_mul_3A, &ec_gf8_mul_3B, +    &ec_gf8_mul_3C, &ec_gf8_mul_3D, &ec_gf8_mul_3E, &ec_gf8_mul_3F, +    &ec_gf8_mul_40, &ec_gf8_mul_41, &ec_gf8_mul_42, &ec_gf8_mul_43, +    &ec_gf8_mul_44, &ec_gf8_mul_45, &ec_gf8_mul_46, &ec_gf8_mul_47, +    &ec_gf8_mul_48, &ec_gf8_mul_49, &ec_gf8_mul_4A, &ec_gf8_mul_4B, +    &ec_gf8_mul_4C, &ec_gf8_mul_4D, &ec_gf8_mul_4E, &ec_gf8_mul_4F, +    &ec_gf8_mul_50, &ec_gf8_mul_51, &ec_gf8_mul_52, &ec_gf8_mul_53, +    &ec_gf8_mul_54, &ec_gf8_mul_55, &ec_gf8_mul_56, &ec_gf8_mul_57, +    &ec_gf8_mul_58, &ec_gf8_mul_59, &ec_gf8_mul_5A, &ec_gf8_mul_5B, +    &ec_gf8_mul_5C, &ec_gf8_mul_5D, &ec_gf8_mul_5E, &ec_gf8_mul_5F, +    &ec_gf8_mul_60, &ec_gf8_mul_61, &ec_gf8_mul_62, &ec_gf8_mul_63, +    &ec_gf8_mul_64, &ec_gf8_mul_65, &ec_gf8_mul_66, &ec_gf8_mul_67, +    &ec_gf8_mul_68, &ec_gf8_mul_69, &ec_gf8_mul_6A, &ec_gf8_mul_6B, +    &ec_gf8_mul_6C, &ec_gf8_mul_6D, &ec_gf8_mul_6E, &ec_gf8_mul_6F, +    &ec_gf8_mul_70, &ec_gf8_mul_71, &ec_gf8_mul_72, &ec_gf8_mul_73, +    &ec_gf8_mul_74, &ec_gf8_mul_75, &ec_gf8_mul_76, &ec_gf8_mul_77, +    &ec_gf8_mul_78, &ec_gf8_mul_79, &ec_gf8_mul_7A, &ec_gf8_mul_7B, +    &ec_gf8_mul_7C, &ec_gf8_mul_7D, &ec_gf8_mul_7E, &ec_gf8_mul_7F, +    &ec_gf8_mul_80, &ec_gf8_mul_81, &ec_gf8_mul_82, &ec_gf8_mul_83, +    &ec_gf8_mul_84, &ec_gf8_mul_85, &ec_gf8_mul_86, &ec_gf8_mul_87, +    &ec_gf8_mul_88, &ec_gf8_mul_89, &ec_gf8_mul_8A, &ec_gf8_mul_8B, +    &ec_gf8_mul_8C, &ec_gf8_mul_8D, &ec_gf8_mul_8E, &ec_gf8_mul_8F, +    &ec_gf8_mul_90, &ec_gf8_mul_91, 
&ec_gf8_mul_92, &ec_gf8_mul_93, +    &ec_gf8_mul_94, &ec_gf8_mul_95, &ec_gf8_mul_96, &ec_gf8_mul_97, +    &ec_gf8_mul_98, &ec_gf8_mul_99, &ec_gf8_mul_9A, &ec_gf8_mul_9B, +    &ec_gf8_mul_9C, &ec_gf8_mul_9D, &ec_gf8_mul_9E, &ec_gf8_mul_9F, +    &ec_gf8_mul_A0, &ec_gf8_mul_A1, &ec_gf8_mul_A2, &ec_gf8_mul_A3, +    &ec_gf8_mul_A4, &ec_gf8_mul_A5, &ec_gf8_mul_A6, &ec_gf8_mul_A7, +    &ec_gf8_mul_A8, &ec_gf8_mul_A9, &ec_gf8_mul_AA, &ec_gf8_mul_AB, +    &ec_gf8_mul_AC, &ec_gf8_mul_AD, &ec_gf8_mul_AE, &ec_gf8_mul_AF, +    &ec_gf8_mul_B0, &ec_gf8_mul_B1, &ec_gf8_mul_B2, &ec_gf8_mul_B3, +    &ec_gf8_mul_B4, &ec_gf8_mul_B5, &ec_gf8_mul_B6, &ec_gf8_mul_B7, +    &ec_gf8_mul_B8, &ec_gf8_mul_B9, &ec_gf8_mul_BA, &ec_gf8_mul_BB, +    &ec_gf8_mul_BC, &ec_gf8_mul_BD, &ec_gf8_mul_BE, &ec_gf8_mul_BF, +    &ec_gf8_mul_C0, &ec_gf8_mul_C1, &ec_gf8_mul_C2, &ec_gf8_mul_C3, +    &ec_gf8_mul_C4, &ec_gf8_mul_C5, &ec_gf8_mul_C6, &ec_gf8_mul_C7, +    &ec_gf8_mul_C8, &ec_gf8_mul_C9, &ec_gf8_mul_CA, &ec_gf8_mul_CB, +    &ec_gf8_mul_CC, &ec_gf8_mul_CD, &ec_gf8_mul_CE, &ec_gf8_mul_CF, +    &ec_gf8_mul_D0, &ec_gf8_mul_D1, &ec_gf8_mul_D2, &ec_gf8_mul_D3, +    &ec_gf8_mul_D4, &ec_gf8_mul_D5, &ec_gf8_mul_D6, &ec_gf8_mul_D7, +    &ec_gf8_mul_D8, &ec_gf8_mul_D9, &ec_gf8_mul_DA, &ec_gf8_mul_DB, +    &ec_gf8_mul_DC, &ec_gf8_mul_DD, &ec_gf8_mul_DE, &ec_gf8_mul_DF, +    &ec_gf8_mul_E0, &ec_gf8_mul_E1, &ec_gf8_mul_E2, &ec_gf8_mul_E3, +    &ec_gf8_mul_E4, &ec_gf8_mul_E5, &ec_gf8_mul_E6, &ec_gf8_mul_E7, +    &ec_gf8_mul_E8, &ec_gf8_mul_E9, &ec_gf8_mul_EA, &ec_gf8_mul_EB, +    &ec_gf8_mul_EC, &ec_gf8_mul_ED, &ec_gf8_mul_EE, &ec_gf8_mul_EF, +    &ec_gf8_mul_F0, &ec_gf8_mul_F1, &ec_gf8_mul_F2, &ec_gf8_mul_F3, +    &ec_gf8_mul_F4, &ec_gf8_mul_F5, &ec_gf8_mul_F6, &ec_gf8_mul_F7, +    &ec_gf8_mul_F8, &ec_gf8_mul_F9, &ec_gf8_mul_FA, &ec_gf8_mul_FB, +    &ec_gf8_mul_FC, &ec_gf8_mul_FD, &ec_gf8_mul_FE, &ec_gf8_mul_FF +}; diff --git a/xlators/cluster/ec/src/ec-gf.h b/xlators/cluster/ec/src/ec-gf8.h index 
23bca91e3b5..4aca91127fc 100644 --- a/xlators/cluster/ec/src/ec-gf.h +++ b/xlators/cluster/ec/src/ec-gf8.h @@ -1,5 +1,5 @@  /* -  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>    This file is part of GlusterFS.    This file is licensed to you under your choice of the GNU Lesser @@ -11,13 +11,8 @@  #ifndef __EC_GF8_H__  #define __EC_GF8_H__ -#define EC_GF_BITS 8 -#define EC_GF_MOD 0x11D +#include "ec-galois.h" -#define EC_GF_SIZE (1 << EC_GF_BITS) -#define EC_GF_WORD_SIZE sizeof(uint64_t) - -extern void (* ec_gf_muladd[])(uint8_t * out, uint8_t * in, -                               unsigned int width); +extern ec_gf_mul_t *ec_gf8_mul[];  #endif /* __EC_GF8_H__ */ diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index 7fe1b2c4f8b..14255616830 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -11,21 +11,22 @@  #include "xlator.h"  #include "defaults.h"  #include "compat-errno.h" +#include "byte-order.h" +#include "syncop.h" +#include "syncop-utils.h" +#include "cluster-syncop.h" +#include "ec.h" +#include "ec-mem-types.h" +#include "ec-types.h" +#include "ec-messages.h"  #include "ec-helpers.h"  #include "ec-common.h"  #include "ec-combine.h"  #include "ec-method.h"  #include "ec-fops.h" -#include "ec-mem-types.h" -#include "ec-data.h" -#include "byte-order.h" -#include "ec-messages.h" -#include "syncop.h" -#include "syncop-utils.h" -#include "cluster-syncop.h" - +#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr; })  #define EC_COUNT(array, max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res; })  #define EC_INTERSECT(dst, src1, src2, max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i]; })  #define EC_ADJUST_SOURCE(source, sources, max) ({int __i; if (sources[source] == 0) {source = -1; for (__i = 0; __i < max; 
__i++) if (sources[__i]) source = __i; } }) diff --git a/xlators/cluster/ec/src/ec-heald.h b/xlators/cluster/ec/src/ec-heald.h index 0929044d545..4ae02e2df3c 100644 --- a/xlators/cluster/ec/src/ec-heald.h +++ b/xlators/cluster/ec/src/ec-heald.h @@ -13,32 +13,7 @@  #include "xlator.h" -struct _ec; -typedef struct _ec ec_t; - -struct subvol_healer { -        xlator_t        *this; -        int              subvol; -        gf_boolean_t     local; -        gf_boolean_t     running; -        gf_boolean_t     rerun; -        pthread_mutex_t  mutex; -        pthread_cond_t   cond; -        pthread_t        thread; -}; - -struct _ec_self_heald; -typedef struct _ec_self_heald ec_self_heald_t; - -struct _ec_self_heald { -        gf_boolean_t            iamshd; -        gf_boolean_t            enabled; -        int                     timeout; -        uint32_t                max_threads; -        uint32_t                wait_qlength; -        struct subvol_healer   *index_healers; -        struct subvol_healer   *full_healers; -}; +#include "ec-types.h"  int  ec_xl_op (xlator_t *this, dict_t *input, dict_t *output); @@ -46,4 +21,5 @@ ec_xl_op (xlator_t *this, dict_t *input, dict_t *output);  int  ec_selfheal_daemon_init (xlator_t *this);  void ec_selfheal_childup (ec_t *ec, int child); +  #endif /* __EC_HEALD_H__ */ diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c index 7cf8232353d..2391b2de3ae 100644 --- a/xlators/cluster/ec/src/ec-helpers.c +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -12,10 +12,12 @@  #include "byte-order.h" +#include "ec.h"  #include "ec-mem-types.h" +#include "ec-messages.h"  #include "ec-fops.h" +#include "ec-method.h"  #include "ec-helpers.h" -#include "ec-messages.h"  static const char * ec_fop_list[] =  { @@ -137,6 +139,53 @@ size_t ec_iov_copy_to(void * dst, struct iovec * vector, int32_t count,      return total;  } +int32_t ec_buffer_alloc(xlator_t *xl, size_t size, struct iobref **piobref, +                 
       void **ptr) +{ +    struct iobref *iobref = NULL; +    struct iobuf *iobuf = NULL; +    int32_t ret = -ENOMEM; + +    iobuf = iobuf_get_page_aligned (xl->ctx->iobuf_pool, size, +                                    EC_METHOD_WORD_SIZE); +    if (iobuf == NULL) { +        goto out; +    } + +    iobref = *piobref; +    if (iobref == NULL) { +        iobref = iobref_new(); +        if (iobref == NULL) { +            goto out; +        } +    } + +    ret = iobref_add(iobref, iobuf); +    if (ret != 0) { +        if (iobref != *piobref) { +            iobref_unref(iobref); +        } +        iobref = NULL; + +        goto out; +    } + +    GF_ASSERT(EC_ALIGN_CHECK(iobuf->ptr, EC_METHOD_WORD_SIZE)); + +    *ptr = iobuf->ptr; + +out: +    if (iobuf != NULL) { +        iobuf_unref(iobuf); +    } + +    if (iobref != NULL) { +        *piobref = iobref; +    } + +    return ret; +} +  int32_t ec_dict_set_array(dict_t *dict, char *key, uint64_t value[],                            int32_t size)  { diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h index 93d77726089..dfea6fef537 100644 --- a/xlators/cluster/ec/src/ec-helpers.h +++ b/xlators/cluster/ec/src/ec-helpers.h @@ -11,7 +11,10 @@  #ifndef __EC_HELPERS_H__  #define __EC_HELPERS_H__ -#include "ec-data.h" +#include "ec-types.h" + +#define EC_ALIGN_CHECK(_ptr, _align) \ +    ((((uintptr_t)(_ptr)) & ((_align) - 1)) == 0)  const char * ec_bin(char * str, size_t size, uint64_t value, int32_t digits);  const char * ec_fop_name(int32_t id); @@ -19,7 +22,8 @@ void ec_trace(const char * event, ec_fop_data_t * fop, const char * fmt, ...);  int32_t ec_bits_consume(uint64_t * n);  size_t ec_iov_copy_to(void * dst, struct iovec * vector, int32_t count,                        off_t offset, size_t size); - +int32_t ec_buffer_alloc(xlator_t *xl, size_t size, struct iobref **piobref, +                        void **ptr);  int32_t ec_dict_set_array(dict_t *dict, char *key,                        
    uint64_t *value, int32_t size);  int32_t ec_dict_del_array(dict_t *dict, char *key, diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c index c3d9c879eb7..6752b675273 100644 --- a/xlators/cluster/ec/src/ec-inode-read.c +++ b/xlators/cluster/ec/src/ec-inode-read.c @@ -11,12 +11,13 @@  #include "xlator.h"  #include "defaults.h" +#include "ec.h" +#include "ec-messages.h"  #include "ec-helpers.h"  #include "ec-common.h"  #include "ec-combine.h"  #include "ec-method.h"  #include "ec-fops.h" -#include "ec-messages.h"  /* FOP: access */ @@ -1140,12 +1141,12 @@ out:  int32_t ec_readv_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)  { -    ec_cbk_data_t * ans = NULL; -    struct iobref * iobref = NULL; -    struct iobuf * iobuf = NULL; -    uint8_t * buff = NULL, * ptr; +    struct iovec vector[1]; +    ec_cbk_data_t *ans = NULL; +    struct iobref *iobref = NULL; +    void *ptr;      size_t fsize = 0, size = 0, max = 0; -    int32_t i = 0, err = -ENOMEM; +    int32_t pos, err = -ENOMEM;      if (cbk->op_ret < 0) {          err = -cbk->op_errno; @@ -1157,47 +1158,38 @@ int32_t ec_readv_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)      GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &cbk->iatt[0].ia_size));      if (cbk->op_ret > 0) { -        struct iovec vector[1]; -        uint8_t * blocks[cbk->count]; +        void *blocks[cbk->count];          uint32_t values[cbk->count];          fsize = cbk->op_ret;          size = fsize * ec->fragments; -        buff = GF_MALLOC(size, gf_common_mt_char); -        if (buff == NULL) { -            goto out; -        } -        ptr = buff; -        for (i = 0, ans = cbk; ans != NULL; i++, ans = ans->next) { -            values[i] = ans->idx; -            blocks[i] = ptr; -            ptr += ec_iov_copy_to(ptr, ans->vector, ans->int32, 0, fsize); +        for (ans = cbk; ans != NULL; ans = ans->next) { +            pos = gf_bits_count(cbk->mask & ((1 << ans->idx) 
- 1)); +            values[pos] = ans->idx + 1; +            blocks[pos] = ans->vector[0].iov_base; +            if ((ans->int32 != 1) || +                !EC_ALIGN_CHECK(blocks[pos], EC_METHOD_WORD_SIZE)) { +                if (iobref == NULL) { +                    err = ec_buffer_alloc(ec->xl, size, &iobref, &ptr); +                    if (err != 0) { +                        goto out; +                    } +                } +                ec_iov_copy_to(ptr, ans->vector, ans->int32, 0, fsize); +                blocks[pos] = ptr; +                ptr += fsize; +            }          } -        iobref = iobref_new(); -        if (iobref == NULL) { -            goto out; -        } -        iobuf = iobuf_get2(fop->xl->ctx->iobuf_pool, size); -        if (iobuf == NULL) { -            goto out; -        } -        err = iobref_add(iobref, iobuf); +        err = ec_buffer_alloc(ec->xl, size, &iobref, &ptr);          if (err != 0) {              goto out;          } -        vector[0].iov_base = iobuf->ptr; -        vector[0].iov_len = ec_method_decode(fsize, ec->fragments, values, -                                             blocks, iobuf->ptr); +        ec_method_decode(&ec->matrix, fsize, cbk->mask, values, blocks, ptr); -        iobuf_unref(iobuf); - -        GF_FREE(buff); -        buff = NULL; - -        vector[0].iov_base += fop->head; -        vector[0].iov_len -= fop->head; +        vector[0].iov_base = ptr + fop->head; +        vector[0].iov_len = size - fop->head;          max = fop->offset * ec->fragments + size;          if (max > cbk->iatt[0].ia_size) { @@ -1229,13 +1221,9 @@ int32_t ec_readv_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)      return 0;  out: -    if (iobuf != NULL) { -        iobuf_unref(iobuf); -    }      if (iobref != NULL) {          iobref_unref(iobref);      } -    GF_FREE(buff);      return err;  } diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c index 
6aeda5a2481..88145d98c83 100644 --- a/xlators/cluster/ec/src/ec-inode-write.c +++ b/xlators/cluster/ec/src/ec-inode-write.c @@ -11,12 +11,13 @@  #include "xlator.h"  #include "defaults.h" +#include "ec.h" +#include "ec-messages.h"  #include "ec-helpers.h"  #include "ec-common.h"  #include "ec-combine.h"  #include "ec-method.h"  #include "ec-fops.h" -#include "ec-messages.h"  int  ec_inode_write_cbk (call_frame_t *frame, xlator_t *this, void *cookie, @@ -1285,27 +1286,78 @@ out:      return -1;  } +static int32_t +ec_writev_prepare_buffers(ec_t *ec, ec_fop_data_t *fop) +{ +    struct iobref *iobref = NULL; +    struct iovec *iov; +    void *ptr; +    int32_t err; + +    fop->user_size = iov_length(fop->vector, fop->int32); +    fop->head = ec_adjust_offset(ec, &fop->offset, 0); +    fop->size = ec_adjust_size(ec, fop->user_size + fop->head, 0); + +    if ((fop->int32 != 1) || (fop->head != 0) || +        (fop->size > fop->user_size) || +        !EC_ALIGN_CHECK(fop->vector[0].iov_base, EC_METHOD_WORD_SIZE)) { +        err = ec_buffer_alloc(ec->xl, fop->size, &iobref, &ptr); +        if (err != 0) { +            goto out; +        } + +        ec_iov_copy_to(ptr + fop->head, fop->vector, fop->int32, 0, +                       fop->user_size); + +        fop->vector[0].iov_base = ptr; +        fop->vector[0].iov_len = fop->size; + +        iobref_unref(fop->buffers); +        fop->buffers = iobref; +    } + +    if (fop->int32 != 2) { +        iov = GF_MALLOC(VECTORSIZE(2), gf_common_mt_iovec); +        if (iov == NULL) { +            err = -ENOMEM; + +            goto out; +        } +        iov[0].iov_base = fop->vector[0].iov_base; +        iov[0].iov_len = fop->vector[0].iov_len; + +        GF_FREE(fop->vector); +        fop->vector = iov; +    } + +    fop->vector[1].iov_len = fop->size / ec->fragments; +    err = ec_buffer_alloc(ec->xl, fop->vector[1].iov_len * ec->nodes, +                          &fop->buffers, &fop->vector[1].iov_base); +    if (err != 0) { + 
       goto out; +    } + +    err = 0; + +out: +    return err; +} +  void ec_writev_start(ec_fop_data_t *fop)  {      ec_t *ec = fop->xl->private; -    struct iobref *iobref = NULL; -    struct iobuf *iobuf = NULL; -    void *ptr = NULL;      ec_fd_t *ctx;      fd_t *fd; -    size_t tail; -    uint64_t current; +    dict_t *xdata = NULL; +    uint64_t tail, current;      int32_t err = -ENOMEM; -    dict_t      *xdata = NULL;      /* This shouldn't fail because we have the inode locked. */      GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &current));      fd = fd_anonymous(fop->fd->inode);      if (fd == NULL) { -        ec_fop_set_error(fop, ENOMEM); - -        return; +        goto failed;      }      fop->frame->root->uid = 0; @@ -1318,38 +1370,15 @@ void ec_writev_start(ec_fop_data_t *fop)          }      } -    fop->user_size = iov_length(fop->vector, fop->int32); -    fop->head = ec_adjust_offset(ec, &fop->offset, 0); -    fop->size = ec_adjust_size(ec, fop->user_size + fop->head, 0); - -    iobref = iobref_new(); -    if (iobref == NULL) { -        goto out; -    } -    iobuf = iobuf_get2(fop->xl->ctx->iobuf_pool, fop->size); -    if (iobuf == NULL) { -        goto out; -    } -    err = iobref_add(iobref, iobuf); +    err = ec_writev_prepare_buffers(ec, fop);      if (err != 0) { -        goto out; +        goto failed_fd;      } -    ptr = iobuf->ptr + fop->head; -    ec_iov_copy_to(ptr, fop->vector, fop->int32, 0, fop->user_size); - -    fop->vector[0].iov_base = iobuf->ptr; -    fop->vector[0].iov_len = fop->size; - -    iobuf_unref(iobuf); - -    iobref_unref(fop->buffers); -    fop->buffers = iobref; -      if (fop->head > 0) {          if (ec_make_internal_fop_xdata (&xdata)) {                  err = -ENOMEM; -                goto out; +                goto failed_xdata;          }          ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ec_writev_merge_head,                   NULL, fd, ec->stripe_size, fop->offset, 0, xdata); @@ -1359,7 +1388,7 @@ 
void ec_writev_start(ec_fop_data_t *fop)          if (current > fop->offset + fop->head + fop->user_size) {              if (ec_make_internal_fop_xdata (&xdata)) {                      err = -ENOMEM; -                    goto out; +                    goto failed_xdata;              }              ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,                       ec_writev_merge_tail, NULL, fd, ec->stripe_size, @@ -1369,24 +1398,15 @@ void ec_writev_start(ec_fop_data_t *fop)          }      } -    fd_unref(fd); -    if (xdata) -            dict_unref (xdata); - -    return; +    err = 0; -out: -    if (iobuf != NULL) { -        iobuf_unref(iobuf); +failed_xdata: +    if (xdata) { +        dict_unref(xdata);      } -    if (iobref != NULL) { -        iobref_unref(iobref); -    } - +failed_fd:      fd_unref(fd); -    if (xdata) -            dict_unref (xdata); - +failed:      ec_fop_set_error(fop, -err);  } @@ -1411,55 +1431,32 @@ void ec_wind_writev(ec_t * ec, ec_fop_data_t * fop, int32_t idx)      ec_trace("WIND", fop, "idx=%d", idx);      struct iovec vector[1]; -    struct iobref * iobref = NULL; -    struct iobuf * iobuf = NULL; -    ssize_t size = 0, bufsize = 0; -    int32_t err = -ENOMEM; +    size_t size; -    iobref = iobref_new(); -    if (iobref == NULL) { -        goto out; -    } +    size = fop->vector[1].iov_len; -    size = fop->vector[0].iov_len; -    bufsize = size / ec->fragments; - -    iobuf = iobuf_get2(fop->xl->ctx->iobuf_pool, bufsize); -    if (iobuf == NULL) { -        goto out; -    } -    err = iobref_add(iobref, iobuf); -    if (err != 0) { -        goto out; -    } - -    ec_method_encode(size, ec->fragments, idx, fop->vector[0].iov_base, -                     iobuf->ptr); - -    vector[0].iov_base = iobuf->ptr; -    vector[0].iov_len = bufsize; - -    iobuf_unref(iobuf); +    vector[0].iov_base = fop->vector[1].iov_base + idx * size; +    vector[0].iov_len = size;      STACK_WIND_COOKIE(fop->frame, ec_writev_cbk, (void 
*)(uintptr_t)idx,                        ec->xl_list[idx], ec->xl_list[idx]->fops->writev,                        fop->fd, vector, 1, fop->offset / ec->fragments, -                      fop->uint32, iobref, fop->xdata); - -    iobref_unref(iobref); +                      fop->uint32, fop->buffers, fop->xdata); +} -    return; +static void +ec_writev_encode(ec_fop_data_t *fop) +{ +    ec_t *ec = fop->xl->private; +    void *blocks[ec->nodes]; +    uint32_t i; -out: -    if (iobuf != NULL) { -        iobuf_unref(iobuf); +    blocks[0] = fop->vector[1].iov_base; +    for (i = 1; i < ec->nodes; i++) { +        blocks[i] = blocks[i - 1] + fop->vector[1].iov_len;      } -    if (iobref != NULL) { -        iobref_unref(iobref); -    } - -    ec_writev_cbk(fop->frame, (void *)(uintptr_t)idx, fop->xl, -1, -err, NULL, -                  NULL, NULL); +    ec_method_encode(&ec->matrix, fop->vector[0].iov_len, +                     fop->vector[0].iov_base, blocks);  }  int32_t ec_manager_writev(ec_fop_data_t *fop, int32_t state) @@ -1488,6 +1485,8 @@ int32_t ec_manager_writev(ec_fop_data_t *fop, int32_t state)              fop->frame->root->uid = fop->uid;              fop->frame->root->gid = fop->gid; +            ec_writev_encode(fop); +              ec_dispatch_all(fop);              return EC_STATE_PREPARE_ANSWER; diff --git a/xlators/cluster/ec/src/ec-mem-types.h b/xlators/cluster/ec/src/ec-mem-types.h index df65a031590..9a4b6c58049 100644 --- a/xlators/cluster/ec/src/ec-mem-types.h +++ b/xlators/cluster/ec/src/ec-mem-types.h @@ -21,6 +21,10 @@ enum gf_ec_mem_types_      ec_mt_ec_fd_t,      ec_mt_ec_heal_t,      ec_mt_subvol_healer_t, +    ec_mt_ec_gf_t, +    ec_mt_ec_code_t, +    ec_mt_ec_code_builder_t, +    ec_mt_ec_matrix_t,      ec_mt_end  }; diff --git a/xlators/cluster/ec/src/ec-messages.h b/xlators/cluster/ec/src/ec-messages.h index 76678f8f836..dcdf50b9503 100644 --- a/xlators/cluster/ec/src/ec-messages.h +++ b/xlators/cluster/ec/src/ec-messages.h @@ -45,7 +45,7 
@@   */  #define GLFS_EC_COMP_BASE       GLFS_MSGID_COMP_EC -#define GLFS_NUM_MESSAGES       66 +#define GLFS_NUM_MESSAGES       73  #define GLFS_MSGID_END          (GLFS_EC_COMP_BASE + GLFS_NUM_MESSAGES + 1)  /* Messaged with message IDs */  #define glfs_msg_start_x GLFS_EC_COMP_BASE, "Invalid: Start of messages" @@ -520,6 +520,55 @@   */  #define EC_MSG_CONFIG_XATTR_INVALID         (GLFS_EC_COMP_BASE + 66) +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define EC_MSG_EXTENSION                    (GLFS_EC_COMP_BASE + 67) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define EC_MSG_EXTENSION_NONE               (GLFS_EC_COMP_BASE + 68) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define EC_MSG_EXTENSION_UNKNOWN            (GLFS_EC_COMP_BASE + 69) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define EC_MSG_EXTENSION_UNSUPPORTED        (GLFS_EC_COMP_BASE + 70) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define EC_MSG_EXTENSION_FAILED             (GLFS_EC_COMP_BASE + 71) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define EC_MSG_NO_GF                        (GLFS_EC_COMP_BASE + 72) + +/*! + * @messageid + * @diagnosis + * @recommendedaction + */ +#define EC_MSG_MATRIX_FAILED                (GLFS_EC_COMP_BASE + 73) +  /*------------*/  #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages" diff --git a/xlators/cluster/ec/src/ec-method.c b/xlators/cluster/ec/src/ec-method.c index faab0115cdd..d1b122fb6a4 100644 --- a/xlators/cluster/ec/src/ec-method.c +++ b/xlators/cluster/ec/src/ec-method.c @@ -1,5 +1,5 @@  /* -  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> +  Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es>    This file is part of GlusterFS.    
This file is licensed to you under your choice of the GNU Lesser @@ -11,149 +11,432 @@  #include <string.h>  #include <inttypes.h> -#include "ec-gf.h" +#include "ec-types.h" +#include "ec-mem-types.h" +#include "ec-galois.h" +#include "ec-code.h"  #include "ec-method.h" -static uint32_t GfPow[EC_GF_SIZE << 1]; -static uint32_t GfLog[EC_GF_SIZE << 1]; +static void +ec_method_matrix_normal(ec_gf_t *gf, uint32_t *matrix, uint32_t columns, +                        uint32_t *values, uint32_t count) +{ +    uint32_t i, j, v, tmp; + +    columns--; +    for (i = 0; i < count; i++) { +        v = *values++; +        *matrix++ = tmp = ec_gf_exp(gf, v, columns); +        for (j = 0; j < columns; j++) { +            *matrix++ = tmp = ec_gf_div(gf, tmp, v); +        } +    } +} + +static void +ec_method_matrix_inverse(ec_gf_t *gf, uint32_t *matrix, uint32_t *values, +                         uint32_t count) +{ +    uint32_t a[count]; +    uint32_t i, j, p, last, tmp; + +    last = count - 1; +    for (i = 0; i < last; i++) { +        a[i] = 1; +    } +    a[i] = values[0]; +    for (i = last; i > 0; i--) { +        for (j = i - 1; j < last; j++) { +            a[j] = a[j + 1] ^ ec_gf_mul(gf, values[i], a[j]); +        } +        a[j] = ec_gf_mul(gf, values[i], a[j]); +    } +    for (i = 0; i < count; i++) { +        p = a[0]; +        matrix += count; +        *matrix = tmp = p ^ values[i]; +        for (j = 1; j < last; j++) { +            matrix += count; +            *matrix = tmp = a[j] ^ ec_gf_mul(gf, values[i], tmp); +            p = tmp ^ ec_gf_mul(gf, values[i], p); +        } +        for (j = 0; j < last; j++) { +            *matrix = ec_gf_div(gf, *matrix, p); +            matrix -= count; +        } +        *matrix = ec_gf_div(gf, 1, p); +        matrix++; +    } +} -void ec_method_initialize(void) +static gf_boolean_t +ec_method_matrix_init(ec_matrix_list_t *list, ec_matrix_t *matrix, +                      uintptr_t mask, uint32_t *rows, gf_boolean_t inverse)  
{      uint32_t i; -    GfPow[0] = 1; -    GfLog[0] = EC_GF_SIZE; -    for (i = 1; i < EC_GF_SIZE; i++) -    { -        GfPow[i] = GfPow[i - 1] << 1; -        if (GfPow[i] >= EC_GF_SIZE) -        { -            GfPow[i] ^= EC_GF_MOD; +    matrix->refs = 1; +    matrix->mask = mask; +    matrix->code = list->code; +    matrix->columns = list->columns; +    INIT_LIST_HEAD(&matrix->lru); + +    if (inverse) { +        matrix->rows = list->columns; +        ec_method_matrix_inverse(matrix->code->gf, matrix->values, rows, +                                 matrix->rows); +        for (i = 0; i < matrix->rows; i++) { +            matrix->row_data[i].values = matrix->values + i * matrix->columns; +            matrix->row_data[i].func.interleaved = +                ec_code_build_interleaved(matrix->code, +                                          EC_METHOD_WORD_SIZE, +                                          matrix->row_data[i].values, +                                          matrix->columns); +            if (matrix->row_data[i].func.interleaved == NULL) { +                return _gf_false; +            } +        } +    } else { +        matrix->rows = list->rows; +        ec_method_matrix_normal(matrix->code->gf, matrix->values, +                                matrix->columns, rows, matrix->rows); +        for (i = 0; i < matrix->rows; i++) { +            matrix->row_data[i].values = matrix->values + i * matrix->columns; +            matrix->row_data[i].func.linear = +                ec_code_build_linear(matrix->code, EC_METHOD_WORD_SIZE, +                                     matrix->row_data[i].values, +                                     matrix->columns); +            if (matrix->row_data[i].func.linear == NULL) { +                return _gf_false; +            } +        } +    } + +    return _gf_true; +} + +static void +ec_method_matrix_release(ec_matrix_t *matrix) +{ +    uint32_t i; + +    for (i = 0; i < matrix->rows; i++) { +        if 
(matrix->row_data[i].func.linear != NULL) { +            ec_code_release(matrix->code, &matrix->row_data[i].func); +            matrix->row_data[i].func.linear = NULL;          } -        GfPow[i + EC_GF_SIZE - 1] = GfPow[i]; -        GfLog[GfPow[i] + EC_GF_SIZE - 1] = GfLog[GfPow[i]] = i;      }  } -static uint32_t ec_method_mul(uint32_t a, uint32_t b) +static void +ec_method_matrix_destroy(ec_matrix_list_t *list, ec_matrix_t *matrix) +{ +    list_del_init(&matrix->lru); + +    ec_method_matrix_release(matrix); + +    mem_put(matrix); + +    list->count--; +} + +static void +ec_method_matrix_unref(ec_matrix_list_t *list, ec_matrix_t *matrix)  { -    if (a && b) -    { -        return GfPow[GfLog[a] + GfLog[b]]; +    if (--matrix->refs == 0) { +        list_add_tail(&matrix->lru, &list->lru); +        if (list->count > list->max) { +            matrix = list_first_entry(&list->lru, ec_matrix_t, lru); +            ec_method_matrix_destroy(list, matrix); +        }      } -    return 0;  } -static uint32_t ec_method_div(uint32_t a, uint32_t b) +static ec_matrix_t * +ec_method_matrix_lookup(ec_matrix_list_t *list, uintptr_t mask, uint32_t *pos)  { -    if (b) -    { -        if (a) -        { -            return GfPow[EC_GF_SIZE - 1 + GfLog[a] - GfLog[b]]; +    ec_matrix_t *matrix; +    uint32_t i, j, k; + +    i = 0; +    j = list->count; +    while (i < j) { +        k = (i + j) >> 1; +        matrix = list->objects[k]; +        if (matrix->mask == mask) { +            *pos = k; +            return matrix; +        } +        if (matrix->mask < mask) { +            i = k + 1; +        } else { +            j = k;          } -        return 0;      } -    return EC_GF_SIZE; +    *pos = i; + +    return NULL;  } -size_t ec_method_encode(size_t size, uint32_t columns, uint32_t row, -                        uint8_t * in, uint8_t * out) +static void +ec_method_matrix_remove(ec_matrix_list_t *list, uintptr_t mask)  { -    uint32_t i, j; +    uint32_t pos; -    size /= 
EC_METHOD_CHUNK_SIZE * columns; -    row++; -    for (j = 0; j < size; j++) -    { -        ec_gf_muladd[0](out, in, EC_METHOD_WIDTH); -        in += EC_METHOD_CHUNK_SIZE; -        for (i = 1; i < columns; i++) -        { -            ec_gf_muladd[row](out, in, EC_METHOD_WIDTH); -            in += EC_METHOD_CHUNK_SIZE; +    if (ec_method_matrix_lookup(list, mask, &pos) != NULL) { +        list->count--; +        if (pos < list->count) { +            memmove(list->objects + pos, list->objects + pos + 1, +                    sizeof(ec_matrix_t *) * (list->count - pos));          } -        out += EC_METHOD_CHUNK_SIZE;      } +} + +static void +ec_method_matrix_insert(ec_matrix_list_t *list, ec_matrix_t *matrix) +{ +    uint32_t pos; + +    GF_ASSERT(ec_method_matrix_lookup(list, matrix->mask, &pos) == NULL); -    return size * EC_METHOD_CHUNK_SIZE; +    if (pos < list->count) { +        memmove(list->objects + pos + 1, list->objects + pos, +                sizeof(ec_matrix_t *) * (list->count - pos)); +    } +    list->objects[pos] = matrix; +    list->count++;  } -size_t ec_method_decode(size_t size, uint32_t columns, uint32_t * rows, -                        uint8_t ** in, uint8_t * out) +static ec_matrix_t * +ec_method_matrix_get(ec_matrix_list_t *list, uintptr_t mask, uint32_t *rows)  { -    uint32_t i, j, k, off, last, value; -    uint32_t f; -    uint8_t inv[EC_METHOD_MAX_FRAGMENTS][EC_METHOD_MAX_FRAGMENTS + 1]; -    uint8_t mtx[EC_METHOD_MAX_FRAGMENTS][EC_METHOD_MAX_FRAGMENTS]; -    uint8_t dummy[EC_METHOD_CHUNK_SIZE]; +    ec_matrix_t *matrix; +    uint32_t pos; + +    LOCK(&list->lock); -    size /= EC_METHOD_CHUNK_SIZE; +    matrix = ec_method_matrix_lookup(list, mask, &pos); +    if (matrix != NULL) { +        list_del_init(&matrix->lru); +        matrix->refs++; -    memset(inv, 0, sizeof(inv)); -    memset(mtx, 0, sizeof(mtx)); -    memset(dummy, 0, sizeof(dummy)); -    for (i = 0; i < columns; i++) -    { -        inv[i][i] = 1; -        inv[i][columns] 
= 1; +        goto out;      } -    for (i = 0; i < columns; i++) -    { -        mtx[i][columns - 1] = 1; -        for (j = columns - 1; j > 0; j--) -        { -            mtx[i][j - 1] = ec_method_mul(mtx[i][j], rows[i] + 1); + +    if ((list->count >= list->max) && !list_empty(&list->lru)) { +        matrix = list_first_entry(&list->lru, ec_matrix_t, lru); +        list_del_init(&matrix->lru); + +        ec_method_matrix_remove(list, matrix->mask); + +        ec_method_matrix_release(matrix); +    } else { +        matrix = mem_get0(list->pool); +        if (matrix == NULL) { +            goto out;          } +        matrix->values = (uint32_t *)((uintptr_t)matrix + sizeof(ec_matrix_t) + +                                      sizeof(ec_matrix_row_t) * list->columns);      } -    for (i = 0; i < columns; i++) -    { -        f = mtx[i][i]; -        for (j = 0; j < columns; j++) -        { -            mtx[i][j] = ec_method_div(mtx[i][j], f); -            inv[i][j] = ec_method_div(inv[i][j], f); -        } -        for (j = 0; j < columns; j++) -        { -            if (i != j) -            { -                f = mtx[j][i]; -                for (k = 0; k < columns; k++) -                { -                    mtx[j][k] ^= ec_method_mul(mtx[i][k], f); -                    inv[j][k] ^= ec_method_mul(inv[i][k], f); -                } -            } +    if (!ec_method_matrix_init(list, matrix, mask, rows, _gf_true)) { +        ec_method_matrix_unref(list, matrix); + +        matrix = NULL; + +        goto out; +    } + +    if (list->count < list->max) { +        ec_method_matrix_insert(list, matrix); +    } else { +        matrix->mask = 0; +    } + +out: +    UNLOCK(&list->lock); + +    return matrix; +} + +static void +ec_method_matrix_put(ec_matrix_list_t *list, ec_matrix_t *matrix) +{ +    LOCK(&list->lock); + +    ec_method_matrix_unref(list, matrix); + +    UNLOCK(&list->lock); +} + +static gf_boolean_t +ec_method_setup(xlator_t *xl, ec_matrix_list_t 
*list, const char *gen) +{ +    ec_matrix_t *matrix; +    uint32_t values[list->rows]; +    uint32_t i; + +    matrix = GF_MALLOC(sizeof(ec_matrix_t) + +                       sizeof(ec_matrix_row_t) * list->rows + +                       sizeof(uint32_t) * list->columns * list->rows, +                       ec_mt_ec_matrix_t); +    if (matrix == NULL) { +        goto failed; +    } +    memset(matrix, 0, sizeof(ec_matrix_t)); +    matrix->values = (uint32_t *)((uintptr_t)matrix + sizeof(ec_matrix_t) + +                                  sizeof(ec_matrix_row_t) * list->rows); + +    list->code = ec_code_create(list->gf, ec_code_detect(xl, gen)); +    if (list->code == NULL) { +        goto failed_matrix; +    } +    list->width = list->code->width; + +    for (i = 0; i < list->rows; i++) { +        values[i] = i + 1; +    } +    if (!ec_method_matrix_init(list, matrix, 0, values, _gf_false)) { +        goto failed_code; +    } + +    list->encode = matrix; + +    return _gf_true; + +failed_code: +    ec_code_destroy(list->code); +failed_matrix: +    GF_FREE(matrix); +failed: +    return _gf_false; +} + +gf_boolean_t +ec_method_init(xlator_t *xl, ec_matrix_list_t *list, uint32_t columns, +               uint32_t rows, uint32_t max, const char *gen) +{ +    list->columns = columns; +    list->rows = rows; +    list->max = max; +    list->stripe = EC_METHOD_CHUNK_SIZE * list->columns; +    INIT_LIST_HEAD(&list->lru); + +    list->pool = mem_pool_new_fn(sizeof(ec_matrix_t) + +                                 sizeof(ec_matrix_row_t) * columns + +                                 sizeof(uint32_t) * columns * columns, +                                 128, "ec_matrix_t"); +    if (list->pool == NULL) { +        goto failed; +    } + +    list->objects = GF_MALLOC(sizeof(ec_matrix_t *) * max, ec_mt_ec_matrix_t); +    if (list->objects == NULL) { +        goto failed_pool; +    } + +    list->gf = ec_gf_prepare(EC_GF_BITS, EC_GF_MOD); +    if (list->gf == NULL) { +        
goto failed_objects; +    } + +    if (!ec_method_setup(xl, list, gen)) { +        goto failed_gf; +    } + +    LOCK_INIT(&list->lock); + +    return _gf_true; + +failed_gf: +    ec_gf_destroy(list->gf); +failed_objects: +    GF_FREE(list->objects); +failed_pool: +    mem_pool_destroy(list->pool); +failed: +    list->pool = NULL; +    list->objects = NULL; +    list->gf = NULL; +    return _gf_false; +} + +void +ec_method_fini(ec_matrix_list_t *list) +{ +    ec_matrix_t *matrix; + +    if (list->encode == NULL) { +        return; +    } + +    while (!list_empty(&list->lru)) { +        matrix = list_first_entry(&list->lru, ec_matrix_t, lru); +        ec_method_matrix_destroy(list, matrix); +    } + +    GF_ASSERT(list->count == 0); + +    if (list->pool)/*Init was successful*/ +            LOCK_DESTROY(&list->lock); + +    ec_method_matrix_release(list->encode); +    GF_FREE(list->encode); + +    ec_code_destroy(list->code); +    ec_gf_destroy(list->gf); +    GF_FREE(list->objects); +    mem_pool_destroy(list->pool); +} + +gf_boolean_t +ec_method_update(xlator_t *xl, ec_matrix_list_t *list, const char *gen) +{ +    /* TODO: Allow changing code generator */ + +    return _gf_true; +} + +void +ec_method_encode(ec_matrix_list_t *list, size_t size, void *in, void **out) +{ +    ec_matrix_t *matrix; +    size_t pos; +    uint32_t i; + +    matrix = list->encode; +    for (pos = 0; pos < size; pos += list->stripe) { +        for (i = 0; i < matrix->rows; i++) { +            matrix->row_data[i].func.linear(out[i], in, pos, +                                            matrix->row_data[i].values, +                                            list->columns); +            out[i] += EC_METHOD_CHUNK_SIZE;          }      } -    off = 0; -    for (f = 0; f < size; f++) -    { -        for (i = 0; i < columns; i++) -        { -            last = 0; -            j = 0; -            do -            { -                while (inv[i][j] == 0) -                { -                    
j++; -                } -                if (j < columns) -                { -                    value = ec_method_div(last, inv[i][j]); -                    last = inv[i][j]; -                    ec_gf_muladd[value](out, in[j] + off, EC_METHOD_WIDTH); -                    j++; -                } -            } while (j < columns); -            ec_gf_muladd[last](out, dummy, EC_METHOD_WIDTH); +} + +gf_boolean_t +ec_method_decode(ec_matrix_list_t *list, size_t size, uintptr_t mask, +                 uint32_t *rows, void **in, void *out) +{ +    ec_matrix_t *matrix; +    size_t pos; +    uint32_t i; + +    matrix = ec_method_matrix_get(list, mask, rows); +    if (matrix == NULL) { +        return _gf_false; +    } +    for (pos = 0; pos < size; pos += EC_METHOD_CHUNK_SIZE) { +        for (i = 0; i < matrix->rows; i++) { +            matrix->row_data[i].func.interleaved(out, in, pos, +                                                 matrix->row_data[i].values, +                                                 list->columns);              out += EC_METHOD_CHUNK_SIZE;          } -        off += EC_METHOD_CHUNK_SIZE;      } -    return size * EC_METHOD_CHUNK_SIZE * columns; +    ec_method_matrix_put(list, matrix); + +    return _gf_true;  } diff --git a/xlators/cluster/ec/src/ec-method.h b/xlators/cluster/ec/src/ec-method.h index 29b46e10443..818b54de872 100644 --- a/xlators/cluster/ec/src/ec-method.h +++ b/xlators/cluster/ec/src/ec-method.h @@ -1,5 +1,5 @@  /* -  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> +  Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es>    This file is part of GlusterFS.    
This file is licensed to you under your choice of the GNU Lesser @@ -11,7 +11,15 @@  #ifndef __EC_METHOD_H__  #define __EC_METHOD_H__ -#include "ec-gf.h" +#include "xlator.h" + +#include "ec-types.h" +#include "ec-galois.h" + +#define EC_GF_BITS 8 +#define EC_GF_MOD 0x11D + +#define EC_GF_SIZE (1 << EC_GF_BITS)  /* Determines the maximum size of the matrix used to encode/decode data */  #define EC_METHOD_MAX_FRAGMENTS 16 @@ -21,12 +29,18 @@  #define EC_METHOD_WORD_SIZE 64  #define EC_METHOD_CHUNK_SIZE (EC_METHOD_WORD_SIZE * EC_GF_BITS) -#define EC_METHOD_WIDTH (EC_METHOD_WORD_SIZE / EC_GF_WORD_SIZE) -void ec_method_initialize(void); -size_t ec_method_encode(size_t size, uint32_t columns, uint32_t row, -                        uint8_t * in, uint8_t * out); -size_t ec_method_decode(size_t size, uint32_t columns, uint32_t * rows, -                        uint8_t ** in, uint8_t * out); +gf_boolean_t ec_method_init(xlator_t *xl, ec_matrix_list_t *list, +                            uint32_t columns, uint32_t rows, uint32_t max, +                            const char *gen); +void ec_method_fini(ec_matrix_list_t *list); +gf_boolean_t ec_method_update(xlator_t *xl, ec_matrix_list_t *list, +                              const char *gen); + +void ec_method_encode(ec_matrix_list_t *list, size_t size, void *in, +                      void **out); +gf_boolean_t ec_method_decode(ec_matrix_list_t *list, size_t size, +                              uintptr_t mask, uint32_t *rows, void **in, +                              void *out);  #endif /* __EC_METHOD_H__ */ diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h new file mode 100644 index 00000000000..29f892f01be --- /dev/null +++ b/xlators/cluster/ec/src/ec-types.h @@ -0,0 +1,580 @@ +/* +  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es> +  This file is part of GlusterFS. 
+ +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#ifndef __EC_TYPES_H__ +#define __EC_TYPES_H__ + +#include "xlator.h" +#include "timer.h" +#include "libxlator.h" + +#define EC_GF_MAX_REGS 16 + +#define EC_CODE_SIZE (1024 * 64) + +enum _ec_read_policy; +typedef enum _ec_read_policy ec_read_policy_t; + +struct _ec_config; +typedef struct _ec_config ec_config_t; + +struct _ec_fd; +typedef struct _ec_fd ec_fd_t; + +struct _ec_inode; +typedef struct _ec_inode ec_inode_t; + +union _ec_cbk; +typedef union _ec_cbk ec_cbk_t; + +struct _ec_lock; +typedef struct _ec_lock ec_lock_t; + +struct _ec_lock_link; +typedef struct _ec_lock_link ec_lock_link_t; + +struct _ec_fop_data; +typedef struct _ec_fop_data ec_fop_data_t; + +struct _ec_cbk_data; +typedef struct _ec_cbk_data ec_cbk_data_t; + +enum _ec_gf_opcode; +typedef enum _ec_gf_opcode ec_gf_opcode_t; + +struct _ec_gf_op; +typedef struct _ec_gf_op ec_gf_op_t; + +struct _ec_gf_mul; +typedef struct _ec_gf_mul ec_gf_mul_t; + +struct _ec_gf; +typedef struct _ec_gf ec_gf_t; + +struct _ec_code_gen; +typedef struct _ec_code_gen ec_code_gen_t; + +struct _ec_code; +typedef struct _ec_code ec_code_t; + +struct _ec_code_arg; +typedef struct _ec_code_arg ec_code_arg_t; + +struct _ec_code_op; +typedef struct _ec_code_op ec_code_op_t; + +struct _ec_code_builder; +typedef struct _ec_code_builder ec_code_builder_t; + +struct _ec_code_chunk; +typedef struct _ec_code_chunk ec_code_chunk_t; + +struct _ec_code_space; +typedef struct _ec_code_space ec_code_space_t; + +typedef void (*ec_code_func_linear_t)(void *dst, void *src, uint64_t offset, +                                      uint32_t *values, uint32_t count); + +typedef void (*ec_code_func_interleaved_t)(void *dst, void **src, +                                           
uint64_t offset, uint32_t *values, +                                           uint32_t count); + +union _ec_code_func; +typedef union _ec_code_func ec_code_func_t; + +struct _ec_matrix_row; +typedef struct _ec_matrix_row ec_matrix_row_t; + +struct _ec_matrix; +typedef struct _ec_matrix ec_matrix_t; + +struct _ec_matrix_list; +typedef struct _ec_matrix_list ec_matrix_list_t; + +struct _ec_heal; +typedef struct _ec_heal ec_heal_t; + +struct _ec_self_heald; +typedef struct _ec_self_heald ec_self_heald_t; + +struct _ec; +typedef struct _ec ec_t; + +typedef void (*ec_wind_f)(ec_t *, ec_fop_data_t *, int32_t); +typedef int32_t (*ec_handler_f)(ec_fop_data_t *, int32_t); +typedef void (*ec_resume_f)(ec_fop_data_t *, int32_t); + +enum _ec_read_policy { +        EC_ROUND_ROBIN, +        EC_GFID_HASH, +        EC_READ_POLICY_MAX +}; + +struct _ec_config { +    uint32_t version; +    uint8_t  algorithm; +    uint8_t  gf_word_size; +    uint8_t  bricks; +    uint8_t  redundancy; +    uint32_t chunk_size; +}; + +struct _ec_fd { +    loc_t     loc; +    uintptr_t open; +    int32_t   flags; +}; + +struct _ec_inode { +    ec_lock_t        *inode_lock; +    gf_boolean_t      have_info; +    gf_boolean_t      have_config; +    gf_boolean_t      have_version; +    gf_boolean_t      have_size; +    ec_config_t       config; +    uint64_t          pre_version[2]; +    uint64_t          post_version[2]; +    uint64_t          pre_size; +    uint64_t          post_size; +    uint64_t          dirty[2]; +    struct list_head  heal; +}; + +typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t, +                                  int32_t, uintptr_t, uintptr_t, uintptr_t, +                                  dict_t *); +typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t, +                                   int32_t, uintptr_t, uintptr_t, uintptr_t, +                                   dict_t *); + +union _ec_cbk { +    fop_access_cbk_t       
access; +    fop_create_cbk_t       create; +    fop_discard_cbk_t      discard; +    fop_entrylk_cbk_t      entrylk; +    fop_fentrylk_cbk_t     fentrylk; +    fop_fallocate_cbk_t    fallocate; +    fop_flush_cbk_t        flush; +    fop_fsync_cbk_t        fsync; +    fop_fsyncdir_cbk_t     fsyncdir; +    fop_getxattr_cbk_t     getxattr; +    fop_fgetxattr_cbk_t    fgetxattr; +    fop_heal_cbk_t         heal; +    fop_fheal_cbk_t        fheal; +    fop_inodelk_cbk_t      inodelk; +    fop_finodelk_cbk_t     finodelk; +    fop_link_cbk_t         link; +    fop_lk_cbk_t           lk; +    fop_lookup_cbk_t       lookup; +    fop_mkdir_cbk_t        mkdir; +    fop_mknod_cbk_t        mknod; +    fop_open_cbk_t         open; +    fop_opendir_cbk_t      opendir; +    fop_readdir_cbk_t      readdir; +    fop_readdirp_cbk_t     readdirp; +    fop_readlink_cbk_t     readlink; +    fop_readv_cbk_t        readv; +    fop_removexattr_cbk_t  removexattr; +    fop_fremovexattr_cbk_t fremovexattr; +    fop_rename_cbk_t       rename; +    fop_rmdir_cbk_t        rmdir; +    fop_setattr_cbk_t      setattr; +    fop_fsetattr_cbk_t     fsetattr; +    fop_setxattr_cbk_t     setxattr; +    fop_fsetxattr_cbk_t    fsetxattr; +    fop_stat_cbk_t         stat; +    fop_fstat_cbk_t        fstat; +    fop_statfs_cbk_t       statfs; +    fop_symlink_cbk_t      symlink; +    fop_truncate_cbk_t     truncate; +    fop_ftruncate_cbk_t    ftruncate; +    fop_unlink_cbk_t       unlink; +    fop_writev_cbk_t       writev; +    fop_xattrop_cbk_t      xattrop; +    fop_fxattrop_cbk_t     fxattrop; +    fop_zerofill_cbk_t     zerofill; +    fop_seek_cbk_t         seek; +}; + +struct _ec_lock { +    ec_inode_t        *ctx; +    gf_timer_t        *timer; + +    /* List of owners of this lock. All fops added to this list are running +     * concurrently. */ +    struct list_head   owners; + +    /* List of fops waiting to be an owner of the lock. 
Fops are added to this +     * list when the current owner has an incompatible access (shared vs +     * exclusive) or the lock is not acquired yet. */ +    struct list_head   waiting; + +    /* List of fops that will wait until the next unlock/lock cycle. This +     * happens when the currently acquired lock is decided to be released as +     * soon as possible. In this case, all frozen fops will be continued only +     * after the lock is reacquired. */ +    struct list_head   frozen; + +    int32_t            exclusive; +    uintptr_t          mask; +    uintptr_t          good_mask; +    uintptr_t          healing; +    uint32_t           refs_owners;  /* Refs for fops owning the lock */ +    uint32_t           refs_pending; /* Refs assigned to fops being prepared */ +    gf_boolean_t       acquired; +    gf_boolean_t       getting_size; +    gf_boolean_t       release; +    gf_boolean_t       query; +    fd_t              *fd; +    loc_t              loc; +    union { +        entrylk_type     type; +        struct gf_flock  flock; +    }; +}; + +struct _ec_lock_link { +    ec_lock_t        *lock; +    ec_fop_data_t    *fop; +    struct list_head  owner_list; +    struct list_head  wait_list; +    gf_boolean_t      update[2]; +    loc_t            *base; +    uint64_t          size; +}; + +struct _ec_fop_data { +    int32_t            id; +    int32_t            refs; +    int32_t            state; +    int32_t            minimum; +    int32_t            expected; +    int32_t            winds; +    int32_t            jobs; +    int32_t            error; +    ec_fop_data_t     *parent; +    xlator_t          *xl; +    call_frame_t      *req_frame;    /* frame of the calling xlator */ +    call_frame_t      *frame;        /* frame used by this fop */ +    struct list_head   cbk_list;     /* sorted list of groups of answers */ +    struct list_head   answer_list;  /* list of answers */ +    struct list_head   pending_list; /* member of ec_t.pending_fops */ +    
ec_cbk_data_t     *answer;       /* accepted answer */ +    int32_t            lock_count; +    int32_t            locked; +    ec_lock_link_t     locks[2]; +    int32_t            first_lock; +    gf_lock_t          lock; + +    uint32_t           flags; +    uint32_t           first; +    uintptr_t          mask; +    uintptr_t          healing; /*Dispatch is done but call is successful only +                                  if fop->minimum number of subvolumes succeed +                                  which are not healing*/ +    uintptr_t          remaining; +    uintptr_t          received; /* Mask of responses */ +    uintptr_t          good; + +    uid_t              uid; +    gid_t              gid; + +    ec_wind_f          wind; +    ec_handler_f       handler; +    ec_resume_f        resume; +    ec_cbk_t           cbks; +    void              *data; +    ec_heal_t         *heal; +    struct list_head   healer; + +    uint64_t           user_size; +    uint32_t           head; + +    int32_t            use_fd; + +    dict_t            *xdata; +    dict_t            *dict; +    int32_t            int32; +    uint32_t           uint32; +    uint64_t           size; +    off_t              offset; +    mode_t             mode[2]; +    entrylk_cmd        entrylk_cmd; +    entrylk_type       entrylk_type; +    gf_xattrop_flags_t xattrop_flags; +    dev_t              dev; +    inode_t           *inode; +    fd_t              *fd; +    struct iatt        iatt; +    char              *str[2]; +    loc_t              loc[2]; +    struct gf_flock    flock; +    struct iovec      *vector; +    struct iobref     *buffers; +    gf_seek_what_t     seek; +}; + +struct _ec_cbk_data { +    struct list_head  list;        /* item in the sorted list of groups */ +    struct list_head  answer_list; /* item in the list of answers */ +    ec_fop_data_t    *fop; +    ec_cbk_data_t    *next;        /* next answer in the same group */ +    int32_t           idx; +    int32_t   
        op_ret; +    int32_t           op_errno; +    int32_t           count; +    uintptr_t         mask; +    uint64_t          dirty[2]; + +    dict_t           *xdata; +    dict_t           *dict; +    int32_t           int32; +    uintptr_t         uintptr[3]; +    uint64_t          size; +    uint64_t          version[2]; +    inode_t          *inode; +    fd_t             *fd; +    struct statvfs    statvfs; +    struct iatt       iatt[5]; +    struct gf_flock   flock; +    struct iovec     *vector; +    struct iobref    *buffers; +    char             *str; +    gf_dirent_t       entries; +    off_t             offset; +    gf_seek_what_t    what; +}; + +enum _ec_gf_opcode { +    EC_GF_OP_LOAD, +    EC_GF_OP_STORE, +    EC_GF_OP_COPY, +    EC_GF_OP_XOR2, +    EC_GF_OP_XOR3, +    EC_GF_OP_XORM, +    EC_GF_OP_END +}; + +struct _ec_gf_op { +    ec_gf_opcode_t op; +    uint32_t       arg1; +    uint32_t       arg2; +    uint32_t       arg3; +}; + +struct _ec_gf_mul { +    uint32_t    regs; +    uint32_t    map[EC_GF_MAX_REGS]; +    ec_gf_op_t *ops; +}; + +struct _ec_gf { +    uint32_t      bits; +    uint32_t      size; +    uint32_t      mod; +    uint32_t      min_ops; +    uint32_t      max_ops; +    uint32_t      avg_ops; +    uint32_t     *log; +    uint32_t     *pow; +    ec_gf_mul_t **table; +}; + +struct _ec_code_gen { +    char *name; +    char **flags; +    uint32_t width; + +    void (*prolog)(ec_code_builder_t *builder); +    void (*epilog)(ec_code_builder_t *builder); +    void (*load)(ec_code_builder_t *builder, uint32_t reg, uint32_t offset, +                 uint32_t bit); +    void (*store)(ec_code_builder_t *builder, uint32_t reg, uint32_t bit); +    void (*copy)(ec_code_builder_t *builder, uint32_t dst, uint32_t src); +    void (*xor2)(ec_code_builder_t *builder, uint32_t dst, uint32_t src); +    void (*xor3)(ec_code_builder_t *builder, uint32_t dst, uint32_t src1, +                 uint32_t src2); +    void (*xorm)(ec_code_builder_t 
*builder, uint32_t dst, uint32_t offset, +                 uint32_t bit); +}; + +struct _ec_code { +    gf_lock_t         lock; +    struct list_head  spaces; +    uint32_t          width; +    ec_gf_t          *gf; +    ec_code_gen_t    *gen; +}; + +struct _ec_code_arg { +    uint32_t          value; +}; + +struct _ec_code_op { +    ec_gf_opcode_t    op; +    ec_code_arg_t     arg1; +    ec_code_arg_t     arg2; +    ec_code_arg_t     arg3; +}; + +struct _ec_code_builder { +    ec_code_t        *code; +    uint64_t          address; +    uint8_t          *data; +    uint32_t          size; +    int32_t           error; +    uint32_t          regs; +    uint32_t          bits; +    uint32_t          width; +    uint32_t          count; +    uint32_t          base; +    uint32_t          map[EC_GF_MAX_REGS]; +    gf_boolean_t      linear; +    uint64_t          loop; +    ec_code_op_t      ops[0]; +}; + +struct _ec_code_chunk { +    struct list_head  list; +    size_t            size; +    ec_code_space_t  *space; +}; + +struct _ec_code_space { +    struct list_head  list; +    struct list_head  chunks; +    ec_code_t        *code; +    size_t            size; +}; + + +union _ec_code_func { +    ec_code_func_linear_t      linear; +    ec_code_func_interleaved_t interleaved; +}; + +struct _ec_matrix_row { +    ec_code_func_t  func; +    uint32_t       *values; +}; + +struct _ec_matrix { +    struct list_head lru; +    uint32_t         refs; +    uint32_t         columns; +    uint32_t         rows; +    uintptr_t        mask; +    ec_code_t       *code; +    uint32_t        *values; +    ec_matrix_row_t  row_data[0]; +}; + +struct _ec_matrix_list { +    struct list_head   lru; +    gf_lock_t          lock; +    uint32_t           columns; +    uint32_t           rows; +    uint32_t           max; +    uint32_t           count; +    uint32_t           width; +    uint32_t           stripe; +    struct mem_pool   *pool; +    ec_gf_t           *gf; +    ec_code_t         
*code; +    ec_matrix_t       *encode; +    ec_matrix_t      **objects; +}; + +struct _ec_heal { +    struct list_head  list; +    gf_lock_t         lock; +    xlator_t         *xl; +    ec_fop_data_t    *fop; +    void             *data; +    ec_fop_data_t    *lookup; +    loc_t             loc; +    struct iatt       iatt; +    char             *symlink; +    fd_t             *fd; +    int32_t           partial; +    int32_t           done; +    int32_t           error; +    gf_boolean_t      nameheal; +    uintptr_t         available; +    uintptr_t         good; +    uintptr_t         bad; +    uintptr_t         open; +    uintptr_t         fixed; +    uint64_t          offset; +    uint64_t          size; +    uint64_t          total_size; +    uint64_t          version[2]; +    uint64_t          raw_size; +}; + +struct subvol_healer { +        xlator_t        *this; +        int              subvol; +        gf_boolean_t     local; +        gf_boolean_t     running; +        gf_boolean_t     rerun; +        pthread_mutex_t  mutex; +        pthread_cond_t   cond; +        pthread_t        thread; +}; + +struct _ec_self_heald { +        gf_boolean_t            iamshd; +        gf_boolean_t            enabled; +        int                     timeout; +        uint32_t                max_threads; +        uint32_t                wait_qlength; +        struct subvol_healer   *index_healers; +        struct subvol_healer   *full_healers; +}; + +struct _ec { +    xlator_t          *xl; +    int32_t            healers; +    int32_t            heal_waiters; +    int32_t            nodes; +    int32_t            bits_for_nodes; +    int32_t            fragments; +    int32_t            redundancy; +    uint32_t           fragment_size; +    uint32_t           stripe_size; +    int32_t            up; +    uint32_t           idx; +    uint32_t           xl_up_count; +    uintptr_t          xl_up; +    uint32_t           xl_notify_count; +    uintptr_t          
xl_notify; +    uintptr_t          node_mask; +    xlator_t         **xl_list; +    gf_lock_t          lock; +    gf_timer_t        *timer; +    gf_boolean_t       shutdown; +    gf_boolean_t       eager_lock; +    uint32_t           background_heals; +    uint32_t           heal_wait_qlen; +    struct list_head   pending_fops; +    struct list_head   heal_waiting; +    struct list_head   healing; +    struct mem_pool   *fop_pool; +    struct mem_pool   *cbk_pool; +    struct mem_pool   *lock_pool; +    ec_self_heald_t    shd; +    char               vol_uuid[UUID_SIZE + 1]; +    dict_t            *leaf_to_subvolid; +    ec_read_policy_t   read_policy; +    ec_matrix_list_t   matrix; +}; + +#endif /* __EC_TYPES_H__ */ diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index 659e3fd8108..2aff4374b82 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -1,5 +1,5 @@  /* -  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es> +  Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es>    This file is part of GlusterFS.    
This file is licensed to you under your choice of the GNU Lesser @@ -12,13 +12,15 @@  #include "statedump.h"  #include "compat-errno.h" +#include "ec.h" +#include "ec-messages.h"  #include "ec-mem-types.h" +#include "ec-types.h"  #include "ec-helpers.h"  #include "ec-common.h"  #include "ec-fops.h"  #include "ec-method.h" -#include "ec.h" -#include "ec-messages.h" +#include "ec-code.h"  #include "ec-heald.h"  #include "events.h" @@ -27,6 +29,7 @@ static char *ec_read_policies[EC_READ_POLICY_MAX + 1] = {          [EC_GFID_HASH] = "gfid-hash",          [EC_READ_POLICY_MAX] = NULL  }; +  #define EC_MAX_FRAGMENTS EC_METHOD_MAX_FRAGMENTS  /* The maximum number of nodes is derived from the maximum allowed fragments   * using the rule that redundancy cannot be equal or greater than the number @@ -207,6 +210,9 @@ void __ec_destroy_private(xlator_t * this)          if (ec->leaf_to_subvolid)                  dict_unref (ec->leaf_to_subvolid); + +        ec_method_fini(&ec->matrix); +          GF_FREE(ec);      }  } @@ -255,8 +261,12 @@ reconfigure (xlator_t *this, dict_t *options)  {          ec_t     *ec              = this->private;          char     *read_policy     = NULL; +        char     *extensions      = NULL;          uint32_t heal_wait_qlen   = 0;          uint32_t background_heals = 0; +        int32_t  ret              = -1; + +        GF_OPTION_RECONF ("cpu-extensions", extensions, options, str, failed);          GF_OPTION_RECONF ("self-heal-daemon", ec->shd.enabled, options, bool,                            failed); @@ -272,17 +282,24 @@ reconfigure (xlator_t *this, dict_t *options)                            int32, failed);          ec_configure_background_heal_opts (ec, background_heals,                                             heal_wait_qlen); -        GF_OPTION_RECONF ("read-policy", read_policy, options, str, failed); -        if (ec_assign_read_policy (ec, read_policy)) -                goto failed;          GF_OPTION_RECONF ("shd-max-threads", 
ec->shd.max_threads,                            options, uint32, failed);          GF_OPTION_RECONF ("shd-wait-qlength", ec->shd.wait_qlength,                            options, uint32, failed); -        return 0; +        GF_OPTION_RECONF ("read-policy", read_policy, options, str, failed); + +        ret = 0; +        if (ec_assign_read_policy (ec, read_policy)) { +                ret = -1; +        } + +        if (!ec_method_update(this, &ec->matrix, extensions)) { +                ret = -1; +        } +  failed: -        return -1; +        return ret;  }  glusterfs_event_t @@ -554,6 +571,7 @@ init (xlator_t *this)  {      ec_t *ec          = NULL;      char *read_policy = NULL; +    char *extensions  = NULL;      if (this->parents == NULL)      { @@ -608,7 +626,16 @@ init (xlator_t *this)          goto failed;      } -    ec_method_initialize(); +    GF_OPTION_INIT("cpu-extensions", extensions, str, failed); + +    if (!ec_method_init(this, &ec->matrix, ec->fragments, ec->nodes, +                        ec->nodes * 2, extensions)) { +        gf_msg (this->name, GF_LOG_ERROR, 0, EC_MSG_MATRIX_FAILED, +                "Failed to initialize matrix management"); + +        goto failed; +    } +      GF_OPTION_INIT ("self-heal-daemon", ec->shd.enabled, bool, failed);      GF_OPTION_INIT ("iam-self-heal-daemon", ec->shd.iamshd, bool, failed);      GF_OPTION_INIT ("eager-lock", ec->eager_lock, bool, failed); @@ -1402,5 +1429,13 @@ struct volume_options options[] =        .description = "This option can be used to control number of heals"                       " that can wait in SHD per subvolume"      }, +    { +        .key = { "cpu-extensions" }, +        .type = GF_OPTION_TYPE_STR, +        .value = { "none", "auto", "x64", "sse", "avx" }, +        .default_value = "auto", +        .description = "force the cpu extensions to be used to accelerate the " +                       "galois field computations." 
+    },      { }  }; diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h index 49af5c2daf2..648d444f595 100644 --- a/xlators/cluster/ec/src/ec.h +++ b/xlators/cluster/ec/src/ec.h @@ -11,11 +11,6 @@  #ifndef __EC_H__  #define __EC_H__ -#include "xlator.h" -#include "timer.h" -#include "ec-heald.h" -#include "libxlator.h" -  #define EC_XATTR_PREFIX  "trusted.ec."  #define EC_XATTR_CONFIG  EC_XATTR_PREFIX"config"  #define EC_XATTR_SIZE    EC_XATTR_PREFIX"size" @@ -26,49 +21,4 @@  #define EC_VERSION_SIZE 2  #define EC_SHD_INODE_LRU_LIMIT          10 -typedef enum { -        EC_ROUND_ROBIN, -        EC_GFID_HASH, -        EC_READ_POLICY_MAX -} ec_read_policy_t; - -struct _ec -{ -    xlator_t *        xl; -    int32_t           healers; -    int32_t           heal_waiters; -    int32_t           nodes; -    int32_t           bits_for_nodes; -    int32_t           fragments; -    int32_t           redundancy; -    uint32_t          fragment_size; -    uint32_t          stripe_size; -    int32_t           up; -    uint32_t          idx; -    uint32_t          xl_up_count; -    uintptr_t         xl_up; -    uint32_t          xl_notify_count; -    uintptr_t         xl_notify; -    uintptr_t         node_mask; -    xlator_t **       xl_list; -    gf_lock_t         lock; -    gf_timer_t *      timer; -    gf_boolean_t      shutdown; -    gf_boolean_t      eager_lock; -    uint32_t          background_heals; -    uint32_t          heal_wait_qlen; -    struct list_head  pending_fops; -    struct list_head  heal_waiting; -    struct list_head  healing; -    struct mem_pool * fop_pool; -    struct mem_pool * cbk_pool; -    struct mem_pool * lock_pool; -    ec_self_heald_t   shd; -    char              vol_uuid[UUID_SIZE + 1]; -    dict_t           *leaf_to_subvolid; -    ec_read_policy_t  read_policy; -}; - -void ec_pending_fops_completed(ec_t *ec); -  #endif /* __EC_H__ */ diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c 
b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index d87082e9e89..53e8a441a2f 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -3064,6 +3064,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .op_version = GD_OP_VERSION_3_9_0,            .flags      = OPT_FLAG_CLIENT_OPT          }, +        { .key         = "disperse.cpu-extensions", +          .voltype     = "cluster/disperse", +          .op_version  = GD_OP_VERSION_3_9_0, +          .flags       = OPT_FLAG_CLIENT_OPT +        },          { .key        = "cluster.use-compound-fops",            .voltype    = "cluster/replicate",            .value      = "off",  | 
