#include <stdint.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <immintrin.h>
using namespace std;
// Distance, in pixels, by which sad_x4 advances its first block pointer
// (pix1) after each row.
#define FENC_STRIDE 64
// NOTE(review): <stdint.h> (included above) normally already declares uint8_t;
// this re-declaration only compiles where both agree on unsigned char --
// confirm whether it is still needed.
typedef unsigned char uint8_t;
// Sample type of the blocks that sad_x4 compares.
typedef uint8_t pixel;
// /* Original */
// template <int lx, int ly>
// void sad_x4(const pixel *pix1, const pixel *pix2, const pixel *pix3, const pixel *pix4, const pixel *pix5, intptr_t frefstride, int32_t *res)
// {
// res[0] = 0;
// res[1] = 0;
// res[2] = 0;
// res[3] = 0;
// for (int y = 0; y < ly; y++)
// {
// for (int x = 0; x < lx; x++)
// {
// res[0] += abs(pix1[x] - pix2[x]);
// res[1] += abs(pix1[x] - pix3[x]);
// res[2] += abs(pix1[x] - pix4[x]);
// res[3] += abs(pix1[x] - pix5[x]);
// }
// pix1 += FENC_STRIDE;
// pix2 += frefstride;
// pix3 += frefstride;
// pix4 += frefstride;
// pix5 += frefstride;
// }
// }
/* Portable SAD x4 (single fused pass, no intrinsics) */

// Absolute difference of two pixel values that have been widened to int.
static inline int32_t sad_abs_diff(int a, int b)
{
    return a > b ? a - b : b - a;
}

/**
 * Computes four sums of absolute differences (SAD) in one pass.
 *
 * The lx-by-ly block starting at pix1 is compared against the four blocks
 * starting at pix2, pix3, pix4 and pix5; the totals are written to
 * res[0], res[1], res[2] and res[3] respectively.
 *
 * @tparam lx        Number of pixels compared per row.
 * @tparam ly        Number of rows compared.
 * @param pix1       Block compared against all others; advanced by
 *                   FENC_STRIDE pixels after each row.
 * @param pix2       First candidate block  (result in res[0]).
 * @param pix3       Second candidate block (result in res[1]).
 * @param pix4       Third candidate block  (result in res[2]).
 * @param pix5       Fourth candidate block (result in res[3]).
 * @param frefstride Pixels by which pix2..pix5 advance after each row.
 * @param res        Output array of at least four elements; res[0..3] are
 *                   overwritten.
 *
 * Only lx pixels per row are read from each pointer. The earlier draft issued
 * an unconditional 32-byte vector load from pix1 and printed the loaded lanes
 * to stdout; that over-read the buffer for narrow blocks, indexed past the
 * vector when lx > 8, and produced output from a compute kernel. Both have
 * been removed.
 */
template <int lx, int ly>
void sad_x4(const pixel *pix1, const pixel *pix2, const pixel *pix3, const pixel *pix4, const pixel *pix5, intptr_t frefstride, int32_t *res)
{
    // Accumulate in locals so the totals are not stored through res on every
    // pixel; res is written exactly once at the end.
    int32_t sum0 = 0;
    int32_t sum1 = 0;
    int32_t sum2 = 0;
    int32_t sum3 = 0;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            // Read the shared pixel once and reuse it for all four candidates.
            const int cur = pix1[x];
            sum0 += sad_abs_diff(cur, pix2[x]);
            sum1 += sad_abs_diff(cur, pix3[x]);
            sum2 += sad_abs_diff(cur, pix4[x]);
            sum3 += sad_abs_diff(cur, pix5[x]);
        }
        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
        pix5 += frefstride;
    }

    res[0] = sum0;
    res[1] = sum1;
    res[2] = sum2;
    res[3] = sum3;
}
int main()
{
int32_t res[4];
pixel a[] = {1, 2, 3, 4, 5, 6, 7, 8};
clock_t start, end;
start = clock();
sad_x4<8, 8>(a, a, a, a, a, 0, res);
end = clock();
double time_taken = double(end - start) / double(CLOCKS_PER_SEC);
cout << "\nTime taken: " << time_taken << "s\n";
}