리눅스데브코스 [9주차 - 3(2)]<ARM 프로세서 코어와 리눅스 커널(2) 실습>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <elf.h>

/*
 * Print the fields of a 32-bit ELF file header to stdout, one per line.
 *
 * elf_header: header to dump; must point to a fully read Elf32_Ehdr.
 *
 * In Elf32_Ehdr the entry point and the two table offsets are unsigned
 * 32-bit values. They are widened to unsigned long and printed with %lu so
 * that values >= 2^31 are not shown as negative numbers. The 8- and 16-bit
 * fields are promoted to int by the variadic call, so %d is correct there.
 */
void elf32 (Elf32_Ehdr *elf_header) {
    printf("ELF file information:\n");
    printf(" Class: %d\n", elf_header->e_ident[EI_CLASS]);
    printf(" Data: %d\n", elf_header->e_ident[EI_DATA]);
    printf(" Version: %d\n", elf_header->e_ident[EI_VERSION]);
    printf(" OS/ABI: %d\n", elf_header->e_ident[EI_OSABI]);
    printf(" Type: %d\n", elf_header->e_type);
    printf(" Machine: %d\n", elf_header->e_machine);
    printf(" Entry point address: %lu\n", (unsigned long)elf_header->e_entry);
    printf(" Section header offset: %lu\n", (unsigned long)elf_header->e_shoff);
    printf(" Number of section headers: %d\n", elf_header->e_shnum);
    printf(" Size of section headers: %d\n", elf_header->e_shentsize);
    printf(" Program header offset: %lu\n", (unsigned long)elf_header->e_phoff);
    printf(" Number of program headers: %d\n", elf_header->e_phnum);
    printf(" Size of program headers: %d\n", elf_header->e_phentsize);
}

/*
 * Print the fields of a 64-bit ELF file header to stdout, one per line.
 *
 * elf_header: header to dump; must point to a fully read Elf64_Ehdr.
 *
 * In Elf64_Ehdr the entry point and the two table offsets are unsigned
 * 64-bit values. Passing them to %d is undefined behaviour (wrong argument
 * size) and truncates the value, so they are cast to unsigned long long and
 * printed with %llu. The 8- and 16-bit fields are promoted to int by the
 * variadic call, so %d is correct there.
 */
void elf64 (Elf64_Ehdr *elf_header) {
    printf("ELF file information:\n");
    printf(" Class: %d\n", elf_header->e_ident[EI_CLASS]);
    printf(" Data: %d\n", elf_header->e_ident[EI_DATA]);
    printf(" Version: %d\n", elf_header->e_ident[EI_VERSION]);
    printf(" OS/ABI: %d\n", elf_header->e_ident[EI_OSABI]);
    printf(" Type: %d\n", elf_header->e_type);
    printf(" Machine: %d\n", elf_header->e_machine);
    printf(" Entry point address: %llu\n", (unsigned long long)elf_header->e_entry);
    printf(" Section header offset: %llu\n", (unsigned long long)elf_header->e_shoff);
    printf(" Number of section headers: %d\n", elf_header->e_shnum);
    printf(" Size of section headers: %d\n", elf_header->e_shentsize);
    printf(" Program header offset: %llu\n", (unsigned long long)elf_header->e_phoff);
    printf(" Number of program headers: %d\n", elf_header->e_phnum);
    printf(" Size of program headers: %d\n", elf_header->e_phentsize);
}

/*
 * Dump the ELF header of the file named by argv[1].
 *
 * The file is first read as an Elf32_Ehdr to inspect e_ident (the magic
 * number and the class byte). For ELFCLASS64 the header is re-read from
 * offset 0 as an Elf64_Ehdr before printing.
 *
 * Returns 0 when a header was printed or the class byte is unknown, and 1
 * when the argument is missing, the file cannot be opened or read, or the
 * file does not start with the ELF magic.
 */
int main(int argc, char *argv[]) {
    Elf32_Ehdr elf32_header;
    Elf64_Ehdr elf64_header;
    FILE *fp;
    int status = 1;

    /* argv[1] is NULL when no path is given; do not hand that to fopen(). */
    if (argc < 2) {
        printf("Usage: %s <elf-file>\n", argc > 0 ? argv[0] : "elfinfo");
        return 1;
    }

    /* ELF is a binary format, so open it in binary mode. */
    fp = fopen(argv[1], "rb");
    if (fp == NULL) {
        printf("Failed to open file\n");
        return 1;
    }

    /* A short read leaves the struct partly uninitialised: reject it. */
    if (fread(&elf32_header, sizeof(Elf32_Ehdr), 1, fp) != 1
        || memcmp(elf32_header.e_ident, ELFMAG, SELFMAG) != 0) {
        printf("Not an ELF file\n");
        goto out;
    }

    if (elf32_header.e_ident[EI_CLASS] == ELFCLASS32) {
        printf("ELF32 file detected\n");
        elf32(&elf32_header);
        status = 0;
    } else if (elf32_header.e_ident[EI_CLASS] == ELFCLASS64) {
        /* The 64-bit header is larger, so read it again from the start. */
        if (fseek(fp, 0, SEEK_SET) != 0
            || fread(&elf64_header, sizeof(Elf64_Ehdr), 1, fp) != 1) {
            printf("Failed to read ELF64 header\n");
            goto out;
        }

        printf("ELF64 file detected\n");
        elf64(&elf64_header);
        status = 0;
    } else {
        printf("Unknown ELF file class\n");
        status = 0; /* kept as a successful exit, as before */
    }

out:
    fclose(fp);
    return status;
}

2. 32bit/64bit ELF 파일 생성 및 테스트

왜 안 될까?

내가 사용 중인 ARM 칩용 리눅스의 문제

해결

2. SIMD 프로그램

실습 1

1. 코드

#include <stdio.h>
#include<stdlib.h>
#include <time.h>
#include <arm_neon.h>

/*
 * Scalar reference implementation: element-wise product of two float
 * arrays, dst[k] = src1[k] * src2[k] for k in [0, count).
 *
 * Nothing is read or written when count <= 0.
 */
void mat_mul_c (float* dst, float* src1, const float* src2, int count) {
    int remaining = count;

    /* Walk the three arrays in lock-step, one element per iteration. */
    while (remaining > 0) {
        *dst++ = *src1++ * *src2++;
        remaining--;
    }
}

/*
 * Element-wise product of two float arrays using NEON intrinsics:
 * dst[k] = src1[k] * src2[k] for k in [0, count).
 *
 * Full groups of four floats are multiplied with one 128-bit vector
 * operation each. The remaining 0-3 elements are handled by a scalar loop,
 * so count does not have to be a multiple of 4 and no memory beyond count
 * elements is touched. Nothing is done when count <= 0.
 */
void mat_mul_neon_c (float* dst, float* src1, const float* src2, int count) {
    /* Vector part: stop as soon as fewer than 4 elements remain. */
    for (; count >= 4; count -= 4, src1 += 4, src2 += 4, dst += 4) {
        float32x4_t in1 = vld1q_f32(src1);
        float32x4_t in2 = vld1q_f32(src2);
        vst1q_f32(dst, vmulq_f32(in1, in2));
    }

    /* Scalar tail for the last count % 4 elements. */
    for (; count > 0; count--) {
        *dst++ = *src1++ * *src2++;
    }
}

/*
 * Element-wise product of two float arrays using AArch64 inline assembly:
 * dst[k] = src1[k] * src2[k] for k in [0, count).
 *
 * The assembly loop processes four floats per iteration and is only entered
 * for the largest multiple of 4 that fits in count. The remaining 0-3
 * elements are handled by a scalar loop. Nothing is done when count <= 0.
 *
 * Operand notes:
 *  - dst, src1, src2 and the counter are all modified by the asm
 *    (post-indexed addressing and subs), so every one of them is a
 *    read-write ("+r") operand. Declaring a modified register as input-only
 *    lets the compiler assume it is unchanged.
 *  - The counter is a 32-bit int, so it is referenced as %w[count]; the
 *    upper half of the 64-bit register is not guaranteed to be meaningful.
 *  - subs updates the condition flags, hence the "cc" clobber.
 */
void mat_mul_neon_asm (float* dst, float* src1, const float* src2, int count) {
    int vec_count = count > 0 ? (count & ~3) : 0;   /* multiple of 4 */
    int tail = count > 0 ? count - vec_count : 0;   /* 0..3 leftovers */

    if (vec_count > 0) {
        asm  volatile("1: \n"
            " ld1 {v0.4s}, [%[src1]], #16 \n"
            " ld1 {v1.4s}, [%[src2]], #16 \n"
            " fmul v0.4s, v0.4s, v1.4s \n"
            " subs %w[count], %w[count], #4 \n"
            " st1 {v0.4s}, [%[dst]], #16 \n"
            " bgt 1b \n"
            : [dst] "+r"(dst), [src1] "+r"(src1), [src2] "+r"(src2),
              [count] "+r"(vec_count)
            :
            : "memory", "cc", "v0", "v1"
        );
    }

    /* The pointers were advanced by the asm; finish the leftovers in C. */
    for (; tail > 0; tail--) {
        *dst++ = *src1++ * *src2++;
    }
}

/*
 * Benchmark driver: multiplies two random float arrays of the size given in
 * argv[1] with the scalar, NEON-intrinsic and NEON-assembly implementations
 * and prints the wall-clock time of each run in seconds.
 *
 * The arrays are heap-allocated: the size comes from the command line, and
 * a variable-length array of that size could overflow the stack (and is
 * undefined for sizes <= 0).
 *
 * Exits with EXIT_FAILURE on a missing or invalid argument or when the
 * arrays cannot be allocated; returns 0 otherwise.
 */
int main(int argc, char *argv[]) {

    if (argc != 2) {
        /* perror() is for errno-reporting calls; this is a usage error. */
        fprintf(stderr, "usage: %s <array_size>\n", argc > 0 ? argv[0] : "mat_mul");
        exit(EXIT_FAILURE);
    }

    /* strtol() reports garbage input, unlike atoi(). */
    char *end = NULL;
    long requested = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || requested <= 0
        || requested != (int)requested) {   /* must fit the int count parameter */
        fprintf(stderr, "array_size must be a positive integer: %s\n", argv[1]);
        exit(EXIT_FAILURE);
    }

    int array_size = (int)requested;
    struct timespec  begin, end_time;
    double mat_mul_c_time, mat_mul_neon_c_time, mat_mul_neon_asm_time;
    float *a = malloc((size_t)array_size * sizeof *a);
    float *b = malloc((size_t)array_size * sizeof *b);
    float *c = malloc((size_t)array_size * sizeof *c);

    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "failed to allocate %d floats per array\n", array_size);
        free(a);
        free(b);
        free(c);
        exit(EXIT_FAILURE);
    }

    /* Fixed seed so every run multiplies the same data. */
    srand(0);
    for (int i=0; i<array_size; i++) {
        a[i] = (float)rand();
        b[i] = (float)rand();
    }

    clock_gettime(CLOCK_MONOTONIC, &begin);
    mat_mul_c(c, a, b, array_size);
    clock_gettime(CLOCK_MONOTONIC, &end_time);
    mat_mul_c_time = (end_time.tv_sec - begin.tv_sec) + (end_time.tv_nsec - begin.tv_nsec) / 1e9;

    clock_gettime(CLOCK_MONOTONIC, &begin);
    mat_mul_neon_c(c, a, b, array_size);
    clock_gettime(CLOCK_MONOTONIC, &end_time);
    mat_mul_neon_c_time = (end_time.tv_sec - begin.tv_sec) + (end_time.tv_nsec - begin.tv_nsec) / 1e9;

    clock_gettime(CLOCK_MONOTONIC, &begin);
    mat_mul_neon_asm(c, a, b, array_size);
    clock_gettime(CLOCK_MONOTONIC, &end_time);
    mat_mul_neon_asm_time = (end_time.tv_sec - begin.tv_sec) + (end_time.tv_nsec - begin.tv_nsec) / 1e9;

    printf("array_size = %d\n", array_size);
    printf("mat_mul_c_time :        %lf\n", mat_mul_c_time);
    printf("mat_mul_neon_c_time :   %lf\n", mat_mul_neon_c_time);
    printf("mat_mul_neon_asm_time : %lf\n", mat_mul_neon_asm_time);

    free(a);
    free(b);
    free(c);

    return 0;
}

2. 결과

실습 2

1. 코드

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <arm_neon.h>

/*
 * Plain C baseline for the benchmark: writes the element-wise product of
 * src1 and src2 into dst for the first count elements.
 *
 * A count of zero or less leaves dst untouched.
 */
void mat_mul_c (float* dst, float* src1, const float* src2, int count) {
    if (count <= 0) {
        return;
    }

    for (int idx = 0; idx < count; ++idx) {
        const float lhs = src1[idx];
        const float rhs = src2[idx];
        dst[idx] = lhs * rhs;
    }
}

/*
 * Element-wise product of two float arrays using NEON intrinsics:
 * dst[k] = src1[k] * src2[k] for k in [0, count).
 *
 * Full groups of four floats are multiplied directly on the caller's
 * buffers. A trailing group of 1-3 elements is staged through 4-element
 * temporaries so the vector load and store never touch memory beyond count
 * elements. Nothing is done when count <= 0.
 */
void mat_mul_neon_c (float* dst, float* src1, const float* src2, int count) {
    float32x4_t in1, in2, out;
    for (; count>=4; count-=4, src1+=4, src2+=4, dst += 4) {
        in1 = vld1q_f32(src1);
        in2 = vld1q_f32(src2);
        out = vmulq_f32(in1, in2);
        vst1q_f32(dst, out);
    }
    if (count > 0) {
        /*
         * Zero-fill the staging buffers: only count lanes are copied in,
         * but the vector load reads all four, and the unused lanes must not
         * hold indeterminate values.
         */
        float32_t tmp_src1[4] = {0}, tmp_src2[4] = {0}, tmp_dst[4];
        memcpy(tmp_src1, src1, (size_t)count * sizeof(float));
        memcpy(tmp_src2, src2, (size_t)count * sizeof(float));

        in1 = vld1q_f32(tmp_src1);
        in2 = vld1q_f32(tmp_src2);
        out = vmulq_f32(in1, in2);

        /* Store all four lanes locally, then copy back only the valid ones. */
        vst1q_f32(tmp_dst, out);
        memcpy(dst, tmp_dst, (size_t)count * sizeof(float));
    }
}

/*
 * Benchmark driver: multiplies two random float arrays of the size given in
 * argv[1] with the scalar and the NEON-intrinsic implementation and prints
 * the wall-clock time of each run in seconds.
 *
 * Exits with EXIT_FAILURE on a missing or invalid argument or when the
 * arrays cannot be allocated; returns 0 otherwise.
 */
int main(int argc, char *argv[]) {

    if (argc != 2) {
        /* perror() is for errno-reporting calls; this is a usage error. */
        fprintf(stderr, "usage: %s <array_size>\n", argc > 0 ? argv[0] : "mat_mul");
        exit(EXIT_FAILURE);
    }

    /*
     * strtol() reports garbage input, unlike atoi(). A non-positive size is
     * rejected: a negative value would turn into a huge malloc() request.
     */
    char *end = NULL;
    long requested = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || requested <= 0
        || requested != (int)requested) {   /* must fit the int count parameter */
        fprintf(stderr, "array_size must be a positive integer: %s\n", argv[1]);
        exit(EXIT_FAILURE);
    }

    int array_size = (int)requested;
    struct timespec  begin, end_time;
    double mat_mul_c_time, mat_mul_neon_c_time;
    float *a = malloc((size_t)array_size * sizeof *a);
    float *b = malloc((size_t)array_size * sizeof *b);
    float *c = malloc((size_t)array_size * sizeof *c);

    /* The fill loop below would dereference NULL on allocation failure. */
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "failed to allocate %d floats per array\n", array_size);
        free(a);
        free(b);
        free(c);
        exit(EXIT_FAILURE);
    }

    /* Fixed seed so every run multiplies the same data. */
    srand(0);
    for (int i=0; i<array_size; i++) {
        a[i] = (float)rand();
        b[i] = (float)rand();
    }

    clock_gettime(CLOCK_MONOTONIC, &begin);
    mat_mul_c(c, a, b, array_size);
    clock_gettime(CLOCK_MONOTONIC, &end_time);
    mat_mul_c_time = (end_time.tv_sec - begin.tv_sec) + (end_time.tv_nsec - begin.tv_nsec) / 1e9;

    clock_gettime(CLOCK_MONOTONIC, &begin);
    mat_mul_neon_c(c, a, b, array_size);
    clock_gettime(CLOCK_MONOTONIC, &end_time);
    mat_mul_neon_c_time = (end_time.tv_sec - begin.tv_sec) + (end_time.tv_nsec - begin.tv_nsec) / 1e9;

    printf("array_size = %d\n", array_size);
    printf("mat_mul_c_time :        %lf\n", mat_mul_c_time);
    printf("mat_mul_neon_c_time :   %lf\n", mat_mul_neon_c_time);

    free(a);
    free(b);
    free(c);

    return 0;
}