Parallel Computing

OpenCL

Ejemplo utilizando un Kernel de OpenCL


#include <chrono>
#include <stdio.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>
#include "mykernel.cl.h"

#define NUM_VALUES 2048000

static int validate(cl_float* input, cl_float* output)
{
    for (int i = 0; i < NUM_VALUES; i++)
    {
        if ( output[i] != (input[i] * input[i]) )
        {
            fprintf(stdout, "Error: Element %d did not match expected output.\n", i);
            
            fprintf(stdout, "       Got %1.4f, EXPECTED %1.4f\n", output[i], input[i] * input[i]);

            fflush(stdout);
            return 0;
        }
    }
    return 1;
}

int main (int argc, const char * argv[])
{
    char name[128];
    dispatch_queue_t queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_GPU, NULL);

    if (queue == NULL)
    {
        queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_CPU, NULL);
    }

    cl_device_id gpu = gcl_get_device_id_with_dispatch_queue(queue);
    clGetDeviceInfo(gpu, CL_DEVICE_NAME, 128, name, NULL);
    fprintf(stdout, "Created a dispatch queue using the %s\n", name);
 
    float* test_in = (float*) malloc(sizeof(cl_float) * NUM_VALUES);

    for (int i = 0; i < NUM_VALUES; i++)
    {
        test_in[i] = (cl_float) i;
    }

    float* test_out = (float*)malloc(sizeof(cl_float) * NUM_VALUES);
    
    // Kernel space
    void* mem_in  = gcl_malloc(sizeof(cl_float) * NUM_VALUES, test_in, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
    void* mem_out = gcl_malloc(sizeof(cl_float) * NUM_VALUES, NULL, CL_MEM_WRITE_ONLY);

    dispatch_sync(queue, ^{
        size_t wgs;
        gcl_get_kernel_block_workgroup_info(square_kernel, CL_KERNEL_WORK_GROUP_SIZE, sizeof(wgs), &wgs, NULL);

        cl_ndrange range = {
            1,                     // The number of dimensions to use.
            {0, 0, 0},             // The offset in each dimension.
            {NUM_VALUES, 0, 0},    // The global range —how many items in each dimension
            {wgs, 0, 0}            // The local size of each workgroup.
        };

        auto t1 = std::chrono::high_resolution_clock::now();
        square_kernel(&range,(cl_float*)mem_in, (cl_float*)mem_out);
        auto t2 = std::chrono::high_resolution_clock::now();

        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( t2 - t1 ).count();
        printf("%lld milliseconds\n", duration);
        
        gcl_memcpy(test_out, mem_out, sizeof(cl_float) * NUM_VALUES);
    });
   
    // Check to see if the kernel did what it was supposed to:
    if ( validate(test_in, test_out))
    {
        fprintf(stdout, "All values were properly squared.\n");
    }
    
    // Don't forget to free up the CL device's memory when you're done.
    gcl_free(mem_in);
    gcl_free(mem_out);

    // And the same goes for system memory, as usual.
    free(test_in);
    free(test_out);
 
    // Finally, release your queue just as you would any GCD queue.
    dispatch_release(queue);
}
// Kernel block
kernel void square(global float* input, global float* output)
{
    size_t i = get_global_id(0);
    output[i] = input[i] * input[i];
}

Matrices, descomposición en matrices LU

El siguiente programa descompone la matriz original en las matrices LU utilizando instrucciones vectorizadas, validando los resultados.

#include <chrono>
#include <stdio.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>
#include "mykernel.cl.h"

#define WIDTH 2
#define NUM_VALUES 4
#define INDX_POS(i,j) ((WIDTH * i) + (j))

void print(char* n, float* m)
{
    fprintf(stdout, "\n%s\n", n);

    for(int i = 0; i < NUM_VALUES; i++)
    {
        fprintf(stdout, "%.3f ", m[i]);
        if (i % (WIDTH) == WIDTH - 1) { fprintf(stdout, "\n"); }
    }
}
 
int validate(float* input, float* output)
{
    for (int i = 0; i < WIDTH; i++) {
        if ( input[i] != output[i] ) {
            fprintf(stdout, "Error: Element %d did not match expected output.\n", i);
            fflush(stdout);
            return 0;
        }
    }
    return 1;
}
 
void createL(float* output, float* l)
{
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            if(i == j)
            {
                l[INDX_POS(i,j)] = 1;
            }
            else if (i < j)
            {
                l[INDX_POS(i,j)] = 0;
            }
            else if(i > j)
            {
                l[INDX_POS(i,j)] = output[INDX_POS(i,j)];
            }
        }
    }
}
 
void createU(float* output, float* u)
{
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            if(j >= i)
            {
                u[INDX_POS(i,j)] = output[INDX_POS(i,j)];
            }
        }
    }
}

int main (int argc, const char * argv[])
{
    char name[128];
 
    dispatch_queue_t queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_GPU, NULL);

    if (queue == NULL)
    {
        queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_CPU, NULL);
    }

    cl_device_id gpu = gcl_get_device_id_with_dispatch_queue(queue);
    clGetDeviceInfo(gpu, CL_DEVICE_NAME, 128, name, NULL);
    fprintf(stdout, "Created a dispatch queue using the %s\n", name);
 
    float* l = (float*) malloc(sizeof(float) * NUM_VALUES);
    float* u = (float*) malloc(sizeof(float) * NUM_VALUES);
    float* test_in = (float*) malloc(sizeof(cl_float) * NUM_VALUES);
    float* test_out = (float*) malloc(sizeof(cl_float) * NUM_VALUES);
    
    for (int i = 0; i < NUM_VALUES; i++)
    {
        l[i] = 0;
        u[i] = 0;
        test_out[i] = 0;
        test_in[i] = i;
    }
    
    // i: [index / NUM_VALUES/2] -- j: [index % NUM_VALUES/2]
    //   A         L       U
    // 4   3    1     0   4   3
    // 6   3    1.5   1   0   -1.5
    
    //   A         L       U
    // 3   1    1     0   3   1
    // 4   2    1.3   1   0   0.6
    
    test_in[0] = 4; // 0 0
    test_in[1] = 3; // 0 1
    test_in[2] = 6; // 1 0
    test_in[3] = 3; // 1 1
    
    //     A            L            U
    // 1   2   3    1   0   0    1   2   3
    // 4   5   6    4   1   0    0  -3  -6
    // 7   8   9    7   2   1    0   0   0
    
    /*
    test_in[0] = 80.428;
    test_in[1] = -12.818;
    test_in[2] = -81.284;
    test_in[3] = -95.437;
    test_in[4] = -79.099;
    test_in[5] = 14.754;
    test_in[6] = 80.749;
    test_in[7] = 46.408;
    test_in[8] = -28.549;
    test_in[9] = 76.515;
    test_in[10] = -14.687;
    test_in[11] = -75.622;
    test_in[12] = -88.490;
    test_in[13] = -82.585;
    test_in[14] = 85.373;
    test_in[15] = -61.574;
    */
    
    print("INPUT", test_in);
 
    // Kernel space
    void* mem_in  = gcl_malloc(sizeof(cl_float) * NUM_VALUES, test_in, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
    void* mem_out = gcl_malloc(sizeof(cl_float) * NUM_VALUES, NULL, CL_MEM_WRITE_ONLY);
 
    dispatch_sync(queue, ^{
        size_t wgs;
        gcl_get_kernel_block_workgroup_info(ludecomposition_kernel, CL_KERNEL_WORK_GROUP_SIZE, sizeof(wgs), &wgs, NULL);
    
        cl_ndrange range = {
            1,                     // The number of dimensions to use.
            {0, 0, 0},             // The offset in each dimension.
            {NUM_VALUES, 0, 0},    // The global range —how many items in each dimension
            {wgs, 0, 0}            // The local size of each workgroup.
        };
        
        auto t1 = std::chrono::high_resolution_clock::now();

        ludecomposition_kernel(&range,(cl_float*)mem_in, (cl_float*)mem_out);

        auto t2 = std::chrono::high_resolution_clock::now();
           
        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( t2 - t1 ).count();

        printf("%lld milliseconds\n", duration);
           
        gcl_memcpy(test_out, mem_out, sizeof(cl_float) * NUM_VALUES);
    });
   
    createL(test_in, l);
    createU(test_in, u);
    
    print("L", l);
    print("U", u);
    
    double sum = 0;
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            for(int k = 0; k < WIDTH; k++)
            {
                //test_out[i][j] += l[i][k] * u[k][j];
                sum = sum + l[INDX_POS(i,k)] * u[INDX_POS(k,j)];
            }
            test_out[INDX_POS(i,j)] = sum;
            sum = 0;
        }
    }
    
    print("OUTPUT", test_out);
    
    if ( validate(test_in, test_out))
    {
        fprintf(stdout, "\nAll values were properly calculated.\n");
    }
    
    // Don't forget to free up the CL device's memory when you're done.
    gcl_free(mem_in);
 
    // And the same goes for system memory, as usual.
    free(test_in);
    free(l);
    free(u);
    free(test_out);
 
    // Finally, release your queue just as you would any GCD queue.
    dispatch_release(queue);
    
    return 0;
}
/*
 for(int i = 0; i < NUM_VALUES - 1; i++)
 {
     int row = 0;
     
     for(row = i + 1; row < NUM_VALUES; row++)
     {
         float factor = input[INDX_POS(row,i)] / input[INDX_POS(i,i)];
         
         for(int col = i + 1; col < NUM_VALUES; col++)
         {
             input[INDX_POS(row,col)] = input[INDX_POS(row,col)] - factor * input[INDX_POS(i,col)];
         }
         
         input[INDX_POS(row,i)] = factor;
     }
 }
 */
 
#define WIDTH 2
#define NUM_VALUES 4
#define INDX_POS(i,j) ((WIDTH * i) + (j))
 
kernel void ludecomposition(global float* input, global float* output)
{
    size_t i = get_global_id(0);
    size_t j = get_global_id(1);
}

OpenMP

Ejemplo de OpenMP

#include <chrono>
#include <stdio.h>
#include <stdlib.h>
#include "omp.h"
 
#define NUM_VALUES 2048000
static int validate(float* input, float* output)
{
    for (int i = 0; i < NUM_VALUES; i++)
    {
        if ( output[i] != (input[i] * input[i]) )
        {
            fprintf(stdout, "Error: Element %d did not match expected output.\n", i);
            fprintf(stdout, "       Got %1.4f, EXPECTED %1.4f\n", output[i], input[i] * input[i]);
            fflush(stdout);

            return 0;
        }
    }
    return 1;
}
 
void square(float* input, float* output)
{
    for (int i = 0; i < NUM_VALUES; i++)
    {
        output[i] = input[i] * input[i];
    }
}
 
int main(int argc, const char * argv[])
{
    float* test_in = (float*) malloc(sizeof(float) * NUM_VALUES);
    float* test_out = (float*) malloc(sizeof(float) * NUM_VALUES);
    
    for (int i = 0; i < NUM_VALUES; i++)
    {
        test_in[i] = i;
    }
    auto t1 = std::chrono::high_resolution_clock::now();

    #pragma omp parallel
    {
        square(test_in, test_out);
    }

    auto t2 = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( t2 - t1 ).count();

    std::cout << duration << " milliseconds\n";
    //std::cout << "Greetings from thread " << omp_get_thread_num() << std::endl;
    
    if ( validate(test_in, test_out)) {
        fprintf(stdout, "All values were properly squared.\n");
    }
    
    return 0;
}

Descomposición de una Matriz en matrices LU utilizando instrucciones vectorizadas, validando los resultados.

Se medirá el tiempo de ejecución en cada caso.

Matrices LU – Simple

#include <chrono>
#include <stdio.h>
#include <stdlib.h>
 
#define WIDTH 4
#define NUM_VALUES 16
#define INDX_POS(i,j) ((WIDTH * i) + (j))
 
void print(char* n, float* m)
{
    fprintf(stdout, "\n%s\n", n);
    
    for(int i = 0; i < NUM_VALUES; i++)
    {
        fprintf(stdout, "%.3f ", m[i]);

        if (i % (WIDTH) == WIDTH - 1) { fprintf(stdout, "\n"); }
    }
}
 
int validate(float* input, float* output)
{
    for (int i = 0; i < WIDTH; i++)
    {
        if ( input[i] != output[i] )
        {
            fprintf(stdout, "Error: Element %d did not match expected output.\n", i);
            
            fflush(stdout);
            return 0;
        }
    }
    return 1;
}
 
void createL(float* output, float* l)
{
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            if(i == j)
            {
                l[INDX_POS(i,j)] = 1;
            }
            else if (i < j)
            {
                l[INDX_POS(i,j)] = 0;
            }
            else if(i > j)
            {
                l[INDX_POS(i,j)] = output[INDX_POS(i,j)];
            }
        }
    }
}
 
void createU(float* output, float* u)
{
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            if(j >= i)
            {
                u[INDX_POS(i,j)] = output[INDX_POS(i,j)];
            }
        }
    }
}
 
void LUdecomposition(float* input, float* l, float* u)
{
    for(int i = 0; i < NUM_VALUES - 1; i++)
    {
        int row = 0;
        
        for(row = i + 1; row < NUM_VALUES; row++)
        {
            float factor = input[INDX_POS(row,i)] / input[INDX_POS(i,i)];
            
            for(int col = i + 1; col < NUM_VALUES; col++)
            {
                input[INDX_POS(row,col)] = input[INDX_POS(row,col)] - factor * input[INDX_POS(i,col)];
            }            

            input[INDX_POS(row,i)] = factor;
        }
    }
    
    createL(input, l);
    createU(input, u);
}
 
int main(int argc, const char * argv[])
{
    float* test_in = (float*) malloc(sizeof(float) * NUM_VALUES);
    float* test_out = (float*) malloc(sizeof(float) * NUM_VALUES);
    float* l = (float*) malloc(sizeof(float) * NUM_VALUES);
    float* u = (float*) malloc(sizeof(float) * NUM_VALUES);
    
    for (int i = 0; i < NUM_VALUES; i++)
    {
        l[i] = 0;
        u[i] = 0;
        test_out[i] = 0;
        test_in[i] = i + 1;
    }
    
    // i: [index / NUM_VALUES/2] -- j: [index % NUM_VALUES/2]
    //   A         L       U
    // 4   3    1     0   4   3
    // 6   3    1.5   1   0   -1.5
    
    //   A         L       U
    // 3   1    1     0   3   1
    // 4   2    1.3   1   0   0.6
    
    //test_in[0] = 4; // 0 0
    //test_in[1] = 3; // 0 1
    //test_in[2] = 6; // 1 0
    //test_in[3] = 3; // 1 1
    
    //     A            L            U
    // 1   2   3    1   0   0    1   2   3
    // 4   5   6    4   1   0    0  -3  -6
    // 7   8   9    7   2   1    0   0   0
    
    test_in[0] = 80.428;
    test_in[1] = -12.818;
    test_in[2] = -81.284;
    test_in[3] = -95.437;
    test_in[4] = -79.099;
    test_in[5] = 14.754;
    test_in[6] = 80.749;
    test_in[7] = 46.408;
    test_in[8] = -28.549;
    test_in[9] = 76.515;
    test_in[10] = -14.687;
    test_in[11] = -75.622;
    test_in[12] = -88.490;
    test_in[13] = -82.585;
    test_in[14] = 85.373;
    test_in[15] = -61.574;
    
    print("INPUT", test_in);
    
    auto t1 = std::chrono::high_resolution_clock::now();

    LUdecomposition(test_in, l, u);

    auto t2 = std::chrono::high_resolution_clock::now();

    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( t2 - t1 ).count();
    
    print("L", l);
    print("U", u);
    
    double sum = 0;
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            for(int k = 0; k < WIDTH; k++)
            {
                //test_out[i][j] += l[i][k] * u[k][j];
                sum = sum + l[INDX_POS(i,k)] * u[INDX_POS(k,j)];
            }

            test_out[INDX_POS(i,j)] = sum;
            sum = 0;
        }
    }
    
    print("OUTPUT", test_out);
    
    if ( validate(test_in, test_out))
    {
        fprintf(stdout, "\nAll values were properly calculated.\n");
    }

    fprintf(stdout, "\n%llu milliseconds\n", duration);

    return 0;
}

Matrices LU – Intrinsics

#include <chrono>
#include <stdio.h>
#include <stdlib.h>
#include <xmmintrin.h>
#include "omp.h"
 
#define WIDTH 4
#define NUM_VALUES 16
#define INDX_POS(i,j) ((WIDTH * i) + (j))
 
void print(char* n, float* m)
{
    fprintf(stdout, "\n%s\n", n);

    for(int i = 0; i < NUM_VALUES; i++)
    {
        fprintf(stdout, "%.3f ", m[i]);
        if (i % (WIDTH) == WIDTH - 1) { fprintf(stdout, "\n"); }
    }
}
 
int validate(float* input, float* output)
{
    for (int i = 0; i < WIDTH; i++)
    {
        if ( input[i] != output[i] )
        {
            fprintf(stdout, "Error: Element %d did not match expected output.\n", i);
            fflush(stdout);
            return 0;
        }
    }
    return 1;
}
 
void createL(float* output, float* l)
{
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            if(i == j)
            {
                //l[INDX_POS(i,j)] = 1;
                __m128 result = _mm_set1_ps(1.0);
                _mm_storeu_ps(&l[INDX_POS(i,j)], result);
            }
            else if (i < j)
            {
                //l[INDX_POS(i,j)] = 0;
                __m128 result = _mm_setzero_ps();
                _mm_storeu_ps(&l[INDX_POS(i,j)], result);
            }
            else if(i > j)
            {
                //l[INDX_POS(i,j)] = output[INDX_POS(i,j)];
                __m128 result = _mm_loadu_ps(&output[INDX_POS(i,j)]);
                _mm_storeu_ps(&l[INDX_POS(i,j)], result);
            }
        }
    }
}
 
void createU(float* output, float* u)
{
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            if(j >= i)
            {
                //u[INDX_POS(i,j)] = output[INDX_POS(i,j)];
                __m128 result = _mm_loadu_ps(&output[INDX_POS(i,j)]);
                _mm_storeu_ps(&u[INDX_POS(i,j)], result);
            }
        }
    }
}
 
void LUdecomposition(float* input, float* l, float* u)
{
    for(int i = 0; i < NUM_VALUES - 1; i++)
    {
        int row = 0;
        
//        #pragma omp parallel for private(row) shared(input)
        //{
            for(row = i + 1; row < NUM_VALUES; row++)
            {
                __m128 in_rowi_value = _mm_loadu_ps(&input[INDX_POS(row,i)]);
                __m128 in_ii_value = _mm_loadu_ps(&input[INDX_POS(i,i)]);
                __m128 factor = _mm_div_ps(in_rowi_value, in_ii_value);
                                
                //float factor = input[INDX_POS(row,i)] / input[INDX_POS(i,i)];
                
                for(int col = i + 1; col < NUM_VALUES; col++)
                {
                    __m128 in_rowcol_value = _mm_loadu_ps(&input[INDX_POS(row,col)]);
                    __m128 in_icol_value = _mm_loadu_ps(&input[INDX_POS(i,col)]);
                    __m128 mult_value = _mm_mul_ps(factor, in_icol_value);
                    __m128 result = _mm_sub_ps(in_rowcol_value, mult_value);

                    _mm_storeu_ps(&input[INDX_POS(row,col)], result);
                    //input[INDX_POS(row,col)] = input[INDX_POS(row,col)] - factor * input[INDX_POS(i,col)];
                }
                
                //input[INDX_POS(row,i)] = factor;
                _mm_storeu_ps(&input[INDX_POS(row,i)], factor);
            }
        //}
    }
    
    //#pragma omp parallel
    //{
        createL(input, l);
        createU(input, u);
    //}
}
 
int main(int argc, const char * argv[])
{
    float* test_in = (float*) malloc(sizeof(float) * NUM_VALUES);
    float* test_out = (float*) malloc(sizeof(float) * NUM_VALUES);
    float* l = (float*) malloc(sizeof(float) * NUM_VALUES);

    float* u = (float*) malloc(sizeof(float) * NUM_VALUES);
    
    for (int i = 0; i < NUM_VALUES; i++)
    {
        l[i] = 0;
        u[i] = 0;
        test_out[i] = 0;
        test_in[i] = i + 1;
    }
    
    // i: [index / NUM_VALUES/2] -- j: [index % NUM_VALUES/2]
    //   A         L       U
    // 4   3    1     0   4   3
    // 6   3    1.5   1   0   -1.5
    
    //   A         L       U
    // 3   1    1     0   3   1
    // 4   2    1.3   1   0   0.6
    
    //test_in[0] = 4; // 0 0
    //test_in[1] = 3; // 0 1
    //test_in[2] = 6; // 1 0
    //test_in[3] = 3; // 1 1
    
    //     A            L            U
    // 1   2   3    1   0   0    1   2   3
    // 4   5   6    4   1   0    0  -3  -6
    // 7   8   9    7   2   1    0   0   0
    
    test_in[0] = 80.428;
    test_in[1] = -12.818;
    test_in[2] = -81.284;
    test_in[3] = -95.437;
    test_in[4] = -79.099;
    test_in[5] = 14.754;
    test_in[6] = 80.749;
    test_in[7] = 46.408;
    test_in[8] = -28.549;
    test_in[9] = 76.515;
    test_in[10] = -14.687;
    test_in[11] = -75.622;
    test_in[12] = -88.490;
    test_in[13] = -82.585;
    test_in[14] = 85.373;
    test_in[15] = -61.574;
    
    print("INPUT", test_in);
    
    auto t1 = std::chrono::high_resolution_clock::now();

    LUdecomposition(test_in, l, u);

    auto t2 = std::chrono::high_resolution_clock::now();

    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( t2 - t1 ).count();
    
    print("L", l);
    print("U", u);
    
    double sum = 0;
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            for(int k = 0; k < WIDTH; k++)
            {
                //test_out[i][j] += l[i][k] * u[k][j];
                sum = sum + l[INDX_POS(i,k)] * u[INDX_POS(k,j)];
            }
            test_out[INDX_POS(i,j)] = sum;
            sum = 0;
        }
    }
    
    print("OUTPUT", test_out);
    
    if ( validate(test_in, test_out))
    {
        fprintf(stdout, "\nAll values were properly calculated.\n");
    }

    fprintf(stdout, "\n%llu milliseconds\n", duration);
    
    return 0;
}

Matrices LU – Intrinsics Complete

#include <chrono>
#include <stdio.h>
#include <stdlib.h>
#include <xmmintrin.h>
#include "omp.h"
 
#define WIDTH 4
#define NUM_VALUES 16
#define INDX_POS(i,j) ((WIDTH * i) + (j))
 
void print(char* n, float* m)
{
    fprintf(stdout, "\n%s\n", n);

    for(int i = 0; i < NUM_VALUES; i++)
    {
        fprintf(stdout, "%.3f ", m[i]);

        if (i % (WIDTH) == WIDTH - 1) { fprintf(stdout, "\n"); }
    }
}
 
int validate(float* input, float* output)
{
    for (int i = 0; i < WIDTH; i++)
    {
        if ( input[i] != output[i] )
        {
            fprintf(stdout, "Error: Element %d did not match expected output.\n", i);

            fflush(stdout);
            return 0;
        }
    }
    return 1;
}
 
void createL(float* output, float* l)
{
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            if(i == j)
            {
                //l[INDX_POS(i,j)] = 1;
                _mm_storeu_ps(&l[INDX_POS(i,j)], _mm_set1_ps(1.0));
            }
            else if (i < j)
            {
                //l[INDX_POS(i,j)] = 0;
                _mm_storeu_ps(&l[INDX_POS(i,j)], _mm_setzero_ps());
            }
            else if(i > j)
            {
                //l[INDX_POS(i,j)] = output[INDX_POS(i,j)];
                _mm_storeu_ps(&l[INDX_POS(i,j)], _mm_loadu_ps(&output[INDX_POS(i,j)]));
            }
        }
    }
}
 
void createU(float* output, float* u)
{
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            if(j >= i)
            {
                //u[INDX_POS(i,j)] = output[INDX_POS(i,j)];
                _mm_storeu_ps(&u[INDX_POS(i,j)], _mm_loadu_ps(&output[INDX_POS(i,j)]));
            }
        }
    }
}
 
void LUdecomposition(float* input, float* l, float* u)
{
    for(int i = 0; i < NUM_VALUES - 1; i++)
    {
        int row = 0;
        
        //#pragma omp parallel for private(row) shared(input)
        //{
            for(row = i + 1; row < NUM_VALUES; row++)
            {
                //float factor = input[INDX_POS(row,i)] / input[INDX_POS(i,i)];
                __m128 factor = _mm_div_ps(_mm_loadu_ps(&input[INDX_POS(row,i)]), _mm_loadu_ps(&input[INDX_POS(i,i)]));
                
                for(int col = i + 1; col < NUM_VALUES; col++)
                {
                    __m128 mult_value = _mm_mul_ps(factor, _mm_loadu_ps(&input[INDX_POS(i,col)]));
                    __m128 result = _mm_sub_ps(_mm_loadu_ps(&input[INDX_POS(row,col)]), mult_value);
                    _mm_storeu_ps(&input[INDX_POS(row,col)], result);

                    //input[INDX_POS(row,col)] = input[INDX_POS(row,col)] - factor * input[INDX_POS(i,col)];
                }
                
                //input[INDX_POS(row,i)] = factor;
                _mm_storeu_ps(&input[INDX_POS(row,i)], factor);
            }
        //}
    }
    
    //#pragma omp parallel
    //{
        createL(input, l);
        createU(input, u);
    //}
}
 
int main(int argc, const char * argv[])
{
    float* test_in = (float*) malloc(sizeof(float) * NUM_VALUES);
    float* test_out = (float*) malloc(sizeof(float) * NUM_VALUES);
    float* l = (float*) malloc(sizeof(float) * NUM_VALUES);
    float* u = (float*) malloc(sizeof(float) * NUM_VALUES);
    
    for (int i = 0; i < NUM_VALUES; i++)
    {
        l[i] = 0;
        u[i] = 0;
        test_out[i] = 0;
        test_in[i] = i + 1;
    }
    
    // i: [index / NUM_VALUES/2] -- j: [index % NUM_VALUES/2]
    //   A         L       U
    // 4   3    1     0   4   3
    // 6   3    1.5   1   0   -1.5
    
    //   A         L       U
    // 3   1    1     0   3   1
    // 4   2    1.3   1   0   0.6
    
    //test_in[0] = 4; // 0 0
    //test_in[1] = 3; // 0 1
    //test_in[2] = 6; // 1 0
    //test_in[3] = 3; // 1 
   
    //     A            L            U
    // 1   2   3    1   0   0    1   2   3
    // 4   5   6    4   1   0    0  -3  -6
    // 7   8   9    7   2   1    0   0   0
    
    test_in[0] = 80.428;
    test_in[1] = -12.818;
    test_in[2] = -81.284;
    test_in[3] = -95.437;
    test_in[4] = -79.099;
    test_in[5] = 14.754;
    test_in[6] = 80.749;
    test_in[7] = 46.408;
    test_in[8] = -28.549;
    test_in[9] = 76.515;
    test_in[10] = -14.687;
    test_in[11] = -75.622;
    test_in[12] = -88.490;
    test_in[13] = -82.585;
    test_in[14] = 85.373;
    test_in[15] = -61.574;
    
    print("INPUT", test_in);
    
    auto t1 = std::chrono::high_resolution_clock::now();

    LUdecomposition(test_in, l, u);

    auto t2 = std::chrono::high_resolution_clock::now();

    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( t2 - t1 ).count();
    
    print("L", l);
    print("U", u);
    
    __m128 sum = _mm_setzero_ps();
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            for(int k = 0; k < WIDTH; k++)
            {
                //test_out[i][j] += l[i][k] * u[k][j];
                sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(&l[INDX_POS(i,k)]), _mm_loadu_ps(&u[INDX_POS(k,j)])));
            }
            _mm_storeu_ps(&test_out[INDX_POS(i,j)], sum);
            sum = _mm_setzero_ps();
        }
    }
    
    print("OUTPUT", test_out);
    
    if ( validate(test_in, test_out))
    {
        fprintf(stdout, "\nAll values were properly calculated.\n");
    }
    fprintf(stdout, "\n%llu milliseconds\n", duration);
    
    return 0;
}

Matrices LU – OpenMP

#include <chrono>
#include <stdio.h>
#include <stdlib.h>
#include "omp.h"
 
#define WIDTH 4
#define NUM_VALUES 16
#define INDX_POS(i,j) ((WIDTH * i) + (j))
 
void print(char* n, float* m)
{
    fprintf(stdout, "\n%s\n", n);

    for(int i = 0; i < NUM_VALUES; i++)
    {
        fprintf(stdout, "%.3f ", m[i]);
        if (i % (WIDTH) == WIDTH - 1) { fprintf(stdout, "\n"); }
    }
}
 
int validate(float* input, float* output)
{
    for (int i = 0; i < WIDTH; i++)
    {
        if ( input[i] != output[i] )
        {
            fprintf(stdout, "Error: Element %d did not match expected output.\n", i);
            fflush(stdout);

            return 0;
        }
    }
    return 1;
}
 
void createL(float* output, float* l)
{
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            if(i == j)
            {
                l[INDX_POS(i,j)] = 1;
            }
            else if (i < j)
            {
                l[INDX_POS(i,j)] = 0;
            }
            else if(i > j)
            {
                l[INDX_POS(i,j)] = output[INDX_POS(i,j)];
            }
        }
    }
}
 
void createU(float* output, float* u)
{
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            if(j >= i)
            {
                u[INDX_POS(i,j)] = output[INDX_POS(i,j)];
            }
        }
    }
}
 
void LUdecomposition(float* input, float* l, float* u)
{
    for(int i = 0; i < NUM_VALUES - 1; i++)
    {
        int row = 0;
        
        #pragma omp parallel for private(row) shared(input)
        {
            for(row = i + 1; row < NUM_VALUES; row++)
            {
                float factor = input[INDX_POS(row,i)] / input[INDX_POS(i,i)];
                
                for(int col = i + 1; col < NUM_VALUES; col++)
                {
                    input[INDX_POS(row,col)] = input[INDX_POS(row,col)] - factor * input[INDX_POS(i,col)];
                }
                
                input[INDX_POS(row,i)] = factor;
            }
        }
    }
 
    #pragma omp parallel
    {
        createL(input, l);
        createU(input, u);
    }
}
 
int main(int argc, const char * argv[])
{
    float* test_in = (float*) malloc(sizeof(float) * NUM_VALUES);
    float* test_out = (float*) malloc(sizeof(float) * NUM_VALUES);
    float* l = (float*) malloc(sizeof(float) * NUM_VALUES);
    float* u = (float*) malloc(sizeof(float) * NUM_VALUES);
    
    for (int i = 0; i < NUM_VALUES; i++)
    {
        l[i] = 0;
        u[i] = 0;
        test_out[i] = 0;
        test_in[i] = i + 1;
    }
    
    // i: [index / NUM_VALUES/2] -- j: [index % NUM_VALUES/2]
    //   A         L       U
    // 4   3    1     0   4   3
    // 6   3    1.5   1   0   -1.5
   
    //   A         L       U
    // 3   1    1     0   3   1
    // 4   2    1.3   1   0   0.6
    
    //test_in[0] = 4; // 0 0
    //test_in[1] = 3; // 0 1
    //test_in[2] = 6; // 1 0
    //test_in[3] = 3; // 1 1
    
    //     A            L            U
    // 1   2   3    1   0   0    1   2   3
    // 4   5   6    4   1   0    0  -3  -6
    // 7   8   9    7   2   1    0   0   0
    
    test_in[0] = 80.428;
    test_in[1] = -12.818;
    test_in[2] = -81.284;
    test_in[3] = -95.437;
    test_in[4] = -79.099;
    test_in[5] = 14.754;
    test_in[6] = 80.749;
    test_in[7] = 46.408;
    test_in[8] = -28.549;
    test_in[9] = 76.515;
    test_in[10] = -14.687;
    test_in[11] = -75.622;
    test_in[12] = -88.490;
    test_in[13] = -82.585;
    test_in[14] = 85.373;
    test_in[15] = -61.574;
    
    print("INPUT", test_in);
    
    auto t1 = std::chrono::high_resolution_clock::now();

    LUdecomposition(test_in, l, u);

    auto t2 = std::chrono::high_resolution_clock::now();

    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( t2 - t1 ).count();
    
    print("L", l);
    print("U", u);
    
    double sum = 0;
    for(int i = 0; i < WIDTH; i++)
    {
        for(int j = 0; j < WIDTH; j++)
        {
            for(int k = 0; k < WIDTH; k++)
            {
                //test_out[i][j] += l[i][k] * u[k][j];
                sum = sum + l[INDX_POS(i,k)] * u[INDX_POS(k,j)];
            }
            
            test_out[INDX_POS(i,j)] = sum;
            sum = 0;
        }
    }
    
    print("OUTPUT", test_out);
    
    if ( validate(test_in, test_out))
    {
        fprintf(stdout, "\nAll values were properly calculated.\n");
    }

    fprintf(stdout, "\n%llu milliseconds\n", duration);
   
    return 0;
}