HPC101 Lab 2.5

1. code

environment: wsl2

source

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include <smmintrin.h>
#include <emmintrin.h>
#include <immintrin.h>

#define MAXN 100000000

float a[MAXN];
float b[MAXN];
float c[MAXN];
float d[MAXN];

int main()
{
    for (int i = 0; i < MAXN; ++i)
    {
        a[i] = 1.0 / (rand() + 1);
        b[i] = 1.0 / (rand() + 1);
    }

    clock_t start_std, end_std;
    start_std = clock();
    for (int n = 0; n < 20; ++n)
    {
        for (int i = 0; i < MAXN; ++i)
        {
            d[i] += a[i] * b[i];
        }
    }
    end_std = clock();

    clock_t start, end;
    start = clock();
    for (int n = 0; n < 20; ++n)
    {
        int top=0,mxn=MAXN>>3;
        for (int i=0;i<mxn;++i) {
            __m256 a256=_mm256_load_ps(a+top);
            __m256 b256=_mm256_load_ps(b+top);
            __m256 c256=_mm256_load_ps(c+top);
            c256=_mm256_fmadd_ps(a256,b256,c256);
            _mm256_store_ps(c+top,c256);
            top+=8;
        }
    }

    end = clock();
    float t=(double)(end - start) / CLOCKS_PER_SEC;
    float t_std=(double)(end_std-start_std)/CLOCKS_PER_SEC;
    printf("time=%f\n", t);
    printf("std_time=%f\n", t_std);
    printf("speed up: %fx\n",t_std/t);
    

    for (int i = 0; i < MAXN; ++i)
    {
        if (fabs(c[i] - d[i]) / d[i] > 0.0001)
        {
            printf("Check Failed at %d\n", i);
            return 0;
        }
    }
    printf("Check Passed");
    return 0;
}

compile
1
gcc add.cpp -o add -mfma -mavx2

2. result

time=2.037191
std_time=4.855074
speed up: 2.383220x
Check Passed

I carried out several tests and got the following result:

id	acceleration ratio	accuracy
1	2.383x	Passed
2	2.311x	Passed
3	2.314x	Passed
avg	2.336x	100%

3. idea

use the data type __m256 to process 8 single-precision floating number at a time.
(__m256 has 32 byte to contain eight 4-byte float)

top : the position of the foremost of unprocessed data

Firstly, load the data from arrays a b c, using the function _mm256_load_ps(const float *__P)

1
2
3

__m256 a256=_mm256_load_ps(a+top);
__m256 b256=_mm256_load_ps(b+top);
__m256 c256=_mm256_load_ps(c+top);

Then add a256*b256 to c256, using the function _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
1
c256=_mm256_fmadd_ps(a256,b256,c256);
In the end, update c, using the function _mm256_store_ps(float *__P, __m256 __A)
1
_mm256_store_ps(c+top,c256);
update top
1
top+=8 // 8 = 32/4

4. other tests

use __m128 to replace __m256

for (int n = 0; n < 20; ++n)
{
    int top=0,mxn=MAXN>>2;
    for (int i=0;i<mxn;++i) {
        __m128 a128=_mm_load_ps(a+top);
        __m128 b128=_mm_load_ps(b+top);
        __m128 c128=_mm_load_ps(c+top);
        c128=_mm_fmadd_ps(a128,b128,c128);
        _mm_store_ps(c+top,c128);
        top+=4;  // 4=128/32
    }
}

And got the result as follows:

id	acceleration ratio	accuracy
1	1.369x	Passed
2	1.384x	Passed
3	1.313x	Passed
avg	1.355x	100%

From the result we find that the acceleration ratio of __m256 is 1.72 times as that of __m128, that’s because CPU can handle 2 times of data in single instruction with __m256.

use an array to reduce the times of type conversions
P.S This way we need add a global array, but it is against the rule. However it didn’t save a[i]*b[i] so it will keep the total calculation amount unchanged.

// ...

__m256 c256[MAXN/8];

int main()
{
    // ...
    
    for (int n = 0; n < 20; ++n)
    {
        int mxn=MAXN>>3;
        for (int i=0;i<mxn;++i) {
            __m256 a256=_mm256_load_ps(a+(i<<3));
            __m256 b256=_mm256_load_ps(b+(i<<3));
            if (!n) c256[i]=_mm256_mul_ps(a256,b256);
            else c256[i]=_mm256_fmadd_ps(a256,b256,c256[i]);
        }
        if (n==19) {
            for (int i=0;i<mxn;++i) {
                _mm256_store_ps(c+(i<<3),c256[i]);
            }
        }
    }
	// ...
}

Got the result as follows:

id	acceleration ratio	accuracy
1	2.873x	Passed
2	2.877x	Passed
3	2.888x	Passed
avg	2.879x	100%

The acceleration ratio increases, but it also use more memory. When I want to set a256[MAXN/8] b256[MAXN/8]to further reduce the times of type conversions, an error occurs.

Maybe it is caused by memory excess.

5. assembler

normal

movl    -336(%rbp), %eax
        cltq
        leaq    0(,%rax,4), %rdx
        leaq    d(%rip), %rax
        vmovss  (%rdx,%rax), %xmm1
        movl    -336(%rbp), %eax
        cltq
        leaq    0(,%rax,4), %rdx
        leaq    a(%rip), %rax
        vmovss  (%rdx,%rax), %xmm2
        movl    -336(%rbp), %eax
        cltq
        leaq    0(,%rax,4), %rdx
        leaq    b(%rip), %rax
        vmovss  (%rdx,%rax), %xmm0
        vmulss  %xmm0, %xmm2, %xmm0
        vaddss  %xmm0, %xmm1, %xmm0
        movl    -336(%rbp), %eax
        cltq
        leaq    0(,%rax,4), %rdx
        leaq    d(%rip), %rax
        vmovss  %xmm0, (%rdx,%rax)
        addl    $1, -336(%rbp)

SIMD

movl    -328(%rbp), %eax
        cltq
        leaq    0(,%rax,4), %rdx
        leaq    a(%rip), %rax
        addq    %rdx, %rax
        movq    %rax, -248(%rbp)
        movq    -248(%rbp), %rax
        vmovaps (%rax), %ymm0
        vmovaps %ymm0, -240(%rbp)

movl    -328(%rbp), %eax
        cltq
        leaq    0(,%rax,4), %rdx
        leaq    b(%rip), %rax
        addq    %rdx, %rax
        movq    %rax, -256(%rbp)
        movq    -256(%rbp), %rax
        vmovaps (%rax), %ymm0
        vmovaps %ymm0, -208(%rbp)

movl    -328(%rbp), %eax
        cltq
        leaq    0(,%rax,4), %rdx
        leaq    c(%rip), %rax
        addq    %rdx, %rax
        movq    %rax, -264(%rbp)
        movq    -264(%rbp), %rax

vmovaps (%rax), %ymm0
        vmovaps %ymm0, -176(%rbp)
        vmovaps -240(%rbp), %ymm0
        vmovaps %ymm0, -112(%rbp)
        vmovaps -208(%rbp), %ymm0
        vmovaps %ymm0, -80(%rbp)
        vmovaps -176(%rbp), %ymm0
        vmovaps %ymm0, -48(%rbp)
        vmovaps -80(%rbp), %ymm1
        vmovaps -48(%rbp), %ymm0
        vfmadd231ps     -112(%rbp), %ymm1, %ymm0
        nop
        vmovaps %ymm0, -176(%rbp)
        movl    -328(%rbp), %eax
        cltq
        leaq    0(,%rax,4), %rdx
        leaq    c(%rip), %rax
        addq    %rdx, %rax
        movq    %rax, -272(%rbp)
        vmovaps -176(%rbp), %ymm0
        vmovaps %ymm0, -144(%rbp)
        movq    -272(%rbp), %rax
        vmovaps -144(%rbp), %ymm0
        vmovaps %ymm0, (%rax)
        nop
        addl    $8, -328(%rbp)
        addl    $1, -324(%rbp)

But I can’t understand the assembly code…

6. note

data type

name	element	size
`__m128`	4*`float`	4*32 bit
`__m128d`	2*`double`	2*64 bit
`__m128i`	various type of integer	128 bit
`__m256`	8*`float`	8*32 bit
`__m256d`	4*`double`	4*64 bit
`__m256i`	various type of integer	256 bit

int,short,char,long long and other integer-like type can be loaded into __m128i and __m256i.

functions

_mm<bit_width>_<name>_<data_type>

<bit_width> is the size of value this function returns. (like 256 and 512)
if it is empty, it means the size of value is 128 bit
<name> indicates the function’s function.
- <abs>, <add>, <and>, <andnot>,<avg>, <sub>, <mul>, <or>, <max>, <min>, <xor> : obviously indicates its function
- <fmadd>, <fmsub> : take 3 parameters a, b, c, and return a*b+c or a*b-c
- <fnmadd>, <fnmsub> : take 3 parameters a, b, c, and return -a*b+c or -a*b-c
- <broadcast> : take 1 parameters a, broadcast the data from a to all elements of returned value.
- <cmpeq>: Compare packed data in a and b for equality, and return the result. (all 1=equal, all 0=unequal)
- <cmpgt> : Compare packed data in a and b, and return the greater one.
- …
<data_type> tells the function what type of input data it should be treated as.
- pd : double
- ps : float
- ph : half-precision floating-point
- epi<x> : signed x-bit integer
- epu<x> : unsigned x-bit integer
- si<x> : a pack of x bits integer number
- …

reference: