Cxxdgc's Site

Back

要干什么?#

  1. 你需要在handout目录下新建一个csim.c文件,并使用C语言写一个程序模拟高速缓存。
  2. 修改trans.c文件,优化矩阵转置函数,减少缓存不命中的数量。(对三种大小32*32、64*64、60*68分别进行优化)

测试指令#

partA

make
./test-csim
bash

partB

# 下面单个指令是对三种矩阵大小的测试指令
make
./test-trans -M 32 -N 32
./test-trans -M 64 -N 64
./test-trans -M 60 -N 68
bash

partA#

这部分较为简单,主要困难的是输入输出、参数解析等我们并不熟知的C语言函数用法。这里给出除了核心cache部分以外均已实现的代码。

代码框架#

请注意,这里的代码框架仅为了帮助你更好的关注于高速缓存本身,请不要直接抄袭!

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <string.h>
#include "cachelab.h"

// 缓存的每一行
struct cacheline
{
    // TODO:缓存每一行的架构
};
// 缓存的每一组
struct cacheset
{
    // TODO:缓存每一组的架构
};

// 缓存
struct cache
{
    // TODO:缓存架构
};

// 命中数量计算
int hit_count = 0;
int miss_count = 0;
int eviction_count = 0;

// 打印帮助
void print_help()
{
    printf("Usage: ./csim [-hv] -s <s> -E <E> -b <b> -t <tracefile>\n");
    printf("Options:\n");
    printf("  -h         Print this help message.\n");
    printf("  -v         Optional verbose flag.\n");
    printf("  -s <s>     Number of set index bits.\n");
    printf("  -E <E>     Associativity (number of lines per set).\n");
    printf("  -b <b>     Number of block bits.\n");
    printf("  -t <tracefile>  Name of the valgrind trace to replay.\n");
    return;
}

// 输入统一解析函数
void parse_args(int argc, char *argv[], int *s, int *E, int *b, char **tracefile, int *verbose)
{
    int opt;
    while ((opt = getopt(argc, argv, "hvs:E:b:t:")) != -1)
    {
        switch (opt)
        {
        case 'h':
            print_help();
            exit(0);
        case 'v':
            *verbose = 1;
            break;
        case 's':
            *s = atoi(optarg);
            break;
        case 'E':
            *E = atoi(optarg);
            break;
        case 'b':
            *b = atoi(optarg);
            break;
        case 't':
            *tracefile = optarg;
            break;
        default:
            print_help();
            exit(1);
        }
    }
}

// 初始化缓存
struct cache *init_cache(int s, int E, int b)
{
    // TODO:建立缓存,为缓存分配内存空间
    // 返回缓存的指针
}

// 清理缓存
void delete_cache(struct cache *cache)
{
    // TODO:删除缓存,回收所有内存空间
}

// 模拟缓存访问
void l_cache(struct cache *cc, unsigned long address, int verbose)
{
    // TODO:访问缓存的具体模拟操作
    // 其中需要包括命中、不命中(驱逐+存入新的),并在相应情况下增加命中(hit_count)、不命中(miss_count)、驱逐(eviction_count)的计数
}

// 模拟缓存运行主线
void do_cache_function(char *file, struct cache *cc, int verbose)
{
    FILE *fp = fopen(file, "r");
    if (!fp)
    {
        fprintf(stderr, "文件读取错误!\n");
        exit(1);
    }
    char op[256];
    // 读入文件指令
    while (fgets(op, sizeof(op), fp))
    {
        if (op[0] == 'I')
            continue;
        char operation;
        unsigned long address;
        int size;
        if (sscanf(op, " %c %lx,%d", &operation, &address, &size) == 3)
        {
            if (verbose)
                printf("%c %lx,%d ", operation, address, size);

            switch (operation)
            {
            // 加载
            case 'L':
                l_cache(cc, address, verbose);
                if (verbose)
                    printf("\n");
                break;
            // 存储
            case 'S':
                l_cache(cc, address, verbose);
                if (verbose)
                    printf("\n");
                break;
            // 修改(加载+储存)
            case 'M':
                l_cache(cc, address, verbose);
                l_cache(cc, address, verbose);
                if (verbose)
                    printf("\n");
                break;
            }
        }
    }

    fclose(fp);
    return;
}

int main(int argc, char *argv[])
{
    int s, E, b, verbose = 0;
    char *tracefile = NULL;
    // 解析输入
    parse_args(argc, argv, &s, &E, &b, &tracefile, &verbose);
    // 初始化缓存
    struct cache *cc = init_cache(s, E, b);
    // 模拟缓存运行
    do_cache_function(tracefile, cc, verbose);

    // 输出结果
    printSummary(hit_count, miss_count, eviction_count);

    // 清理内存
    delete_cache(cc);
    return 0;
}
plaintext

partB#

32*32#

这里使用上课时讲解的矩阵分块技术,将矩阵切分成8*8的块依次进行转置即可通过

64*64#

这里首先尝试沿用之前的8*8分块,会发现miss极高,几乎没有优化。这是因为64*64时若再按8*8分块处理会有冲突,因为缓存总容量为256int,a[0][0]和a[0][4]会映射到同一缓存行,导致冲突不命中

于是很自然的尝试4*4分块处理

for (int i = 0; i < N; i += 4)
{
    for (int j = 0; j < M; j += 4)
    {
        for (int k = i; k < i + 4; k++)
        {
            int a0 = A[k][j], a1 = A[k][j + 1], a2 = A[k][j + 2], a3 = A[k][j + 3];
            B[j][k] = a0;
            B[j + 1][k] = a1;
            B[j + 2][k] = a2;
            B[j + 3][k] = a3;
        }
    }
}
plaintext

此时测得miss为1700。这里由于分块小导致在分块间切换时会造成的miss,从而造成总损耗较大。

于是自然的,为了减小分块带来的损耗,可以将其先分为8*8块,后再在8*8块内进行优化避免内部的冲突不命中。具体方法如下:

  1. 将8*8的块中的前四行先转置,其中左上的4*4转置完仍然在左上4*4位置,右上的4*4转置完到B的右上4*4位置
  2. 将8*8的块中的后四行转置
    1. A左下转置完直接放放在B右上,B右上原有内容扔到左下
    2. 读A右下的4*4块,转到B的右下位置

60*68#

首先自然的尝试使用4*4分块进行处理,发现miss为1685,离满分先1600仅差85。于是基于上面两种方法做一点点优化即可。

完整代码#

!!!注意:ICSlab存在查重,请勿直接抄袭此处的完整代码,否则后果自负!

csim.c#

// 使用c语言模拟高速缓存。使用cacheline、cacheset、cache三种结构体表示缓存的行、组和整体。
// 利用lru进行缓存替换。每次操作会给该组内所有行lru加1,命中则归零,则lru最大者即为最少使用者
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <string.h>
#include "cachelab.h"

// 缓存的每一行
struct cacheline
{
    int valid;
    unsigned long tag;
    int lru;
};
// 缓存的每一组
struct cacheset
{
    struct cacheline **lines;
};

// 缓存
struct cache
{
    int s;
    int E;
    int b;
    int t;
    struct cacheset **sets;
};

// 命中数量计算
int hit_count = 0;
int miss_count = 0;
int eviction_count = 0;

// 打印帮助
void print_help()
{
    printf("Usage: ./csim [-hv] -s <s> -E <E> -b <b> -t <tracefile>\n");
    printf("Options:\n");
    printf("  -h         Print this help message.\n");
    printf("  -v         Optional verbose flag.\n");
    printf("  -s <s>     Number of set index bits.\n");
    printf("  -E <E>     Associativity (number of lines per set).\n");
    printf("  -b <b>     Number of block bits.\n");
    printf("  -t <tracefile>  Name of the valgrind trace to replay.\n");
    return;
}

// 输入统一解析函数
void parse_args(int argc, char *argv[], int *s, int *E, int *b, char **tracefile, int *verbose)
{
    int opt;
    while ((opt = getopt(argc, argv, "hvs:E:b:t:")) != -1)
    {
        switch (opt)
        {
        case 'h':
            print_help();
            exit(0);
        case 'v':
            *verbose = 1;
            break;
        case 's':
            *s = atoi(optarg);
            break;
        case 'E':
            *E = atoi(optarg);
            break;
        case 'b':
            *b = atoi(optarg);
            break;
        case 't':
            *tracefile = optarg;
            break;
        default:
            print_help();
            exit(1);
        }
    }
}
// 初始化缓存
struct cache *init_cache(int s, int E, int b)
{
    struct cache *cc = malloc(sizeof(struct cache));
    cc->s = s;
    cc->E = E;
    cc->b = b;
    cc->t = 64 - s - b;
    int S = 1 << s;
    cc->sets = malloc(S * sizeof(struct cacheset *));
    for (int i = 0; i < S; i++)
    {
        // 初始化每一组
        cc->sets[i] = malloc(sizeof(struct cacheset));
        cc->sets[i]->lines = malloc(E * sizeof(struct cacheline *));
        // 初始化组中的每一行
        for (int j = 0; j < E; j++)
        {
            cc->sets[i]->lines[j] = malloc(sizeof(struct cacheline));
            cc->sets[i]->lines[j]->valid = 0;
            cc->sets[i]->lines[j]->tag = 0;
            cc->sets[i]->lines[j]->lru = 0;
        }
    }
    return cc;
}

// 清理缓存
void delete_cache(struct cache *cache)
{
    // 计算组数
    int S = 1 << cache->s;
    // 逐行
    for (int i = 0; i < S; i++)
    {
        for (int j = 0; j < cache->E; j++)
        {
            free(cache->sets[i]->lines[j]);
        }
        free(cache->sets[i]->lines);
        free(cache->sets[i]);
    }
    free(cache->sets);
    free(cache);
}

// 模拟缓存访问
void l_cache(struct cache *cc, unsigned long address, int verbose)
{
    // 切分出标记、组索引
    unsigned long mask = (1UL << (cc->s)) - 1;
    unsigned long tag = (address >> (cc->b + cc->s));
    int index = (address >> (cc->b)) & mask;

    // 找到对应组
    struct cacheset *set = cc->sets[index];

    // 查找是否命中
    int hit = 0, empty_line = -1, lru_line = 0, max_lru = -1;
    int E = cc->E;
    for (int i = 0; i < E; i++)
    {
        // 时间增加
        set->lines[i]->lru++;

        // 检查是否命中(有效+标记相同)
        if (set->lines[i]->valid && (set->lines[i]->tag == tag))
        {
            // 命中
            hit = 1;
            // 命中的缓存行lru归零
            set->lines[i]->lru = 0;
            if (verbose)
                printf("hit ");
        }
        // 检查记录空行
        if (!set->lines[i]->valid)
        {
            empty_line = i;
        }
        // 检查记录LRU最大的行
        if (set->lines[i]->lru > max_lru)
        {
            max_lru = set->lines[i]->lru;
            lru_line = i;
        }
    }
    if (hit)
    {
        hit_count++;
        return;
    }
    // 未命中
    miss_count++;
    if (verbose)
    {
        printf("miss ");
    }
    if (empty_line != -1)
    {
        // 缓存未满,直接存
        set->lines[empty_line]->valid = 1;
        set->lines[empty_line]->tag = tag;
        set->lines[empty_line]->lru = 0;
    }
    else
    {
        // 缓存已满,驱逐lru最大的,即最久未访问的
        eviction_count++;
        if (verbose)
            printf("eviction ");

        set->lines[lru_line]->tag = tag;
        set->lines[lru_line]->lru = 0;
    }
    return;
}

// 模拟缓存运行主线
void do_cache_function(char *file, struct cache *cc, int verbose)
{
    FILE *fp = fopen(file, "r");
    if (!fp)
    {
        fprintf(stderr, "文件读取错误!\n");
        exit(1);
    }
    char op[256];
    // 读入文件指令
    while (fgets(op, sizeof(op), fp))
    {
        if (op[0] == 'I')
            continue;
        char operation;
        unsigned long address;
        int size;
        if (sscanf(op, " %c %lx,%d", &operation, &address, &size) == 3)
        {
            if (verbose)
                printf("%c %lx,%d ", operation, address, size);

            switch (operation)
            {
            // 加载
            case 'L':
                l_cache(cc, address, verbose);
                if (verbose)
                    printf("\n");
                break;
            // 存储
            case 'S':
                l_cache(cc, address, verbose);
                if (verbose)
                    printf("\n");
                break;
            // 修改(加载+储存)
            case 'M':
                l_cache(cc, address, verbose);
                l_cache(cc, address, verbose);
                if (verbose)
                    printf("\n");
                break;
            }
        }
    }

    fclose(fp);
    return;
}

int main(int argc, char *argv[])
{
    int s, E, b, verbose = 0;
    char *tracefile = NULL;
    // 解析输入
    parse_args(argc, argv, &s, &E, &b, &tracefile, &verbose);
    // 初始化缓存
    struct cache *cc = init_cache(s, E, b);
    // 模拟缓存运行
    do_cache_function(tracefile, cc, verbose);

    // 输出结果
    printSummary(hit_count, miss_count, eviction_count);

    // 清理内存
    delete_cache(cc);
    return 0;
}
plaintext

tarns.c#

#include <stdio.h>
#include "cachelab.h"
#include "contracts.h"

int is_transpose(int M, int N, int A[N][M], int B[M][N]);


char transpose_submit_desc[] = "Transpose submission";
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{

    REQUIRES(M > 0);
    REQUIRES(N > 0);
    if (M == 32 && N == 32)
    {
        // 32*32的处理
        // 使用矩阵分块的,分成8*8的块进行处理。
        for (int i = 0; i < N; i += 8)
        {
            for (int j = 0; j < M; j += 8)
            {
                for (int k = i; k < i + 8; k++)
                {
                    int a0 = A[k][j], a1 = A[k][j + 1], a2 = A[k][j + 2], a3 = A[k][j + 3];
                    int a4 = A[k][j + 4], a5 = A[k][j + 5], a6 = A[k][j + 6], a7 = A[k][j + 7];
                    B[j][k] = a0;
                    B[j + 1][k] = a1;
                    B[j + 2][k] = a2;
                    B[j + 3][k] = a3;
                    B[j + 4][k] = a4;
                    B[j + 5][k] = a5;
                    B[j + 6][k] = a6;
                    B[j + 7][k] = a7;
                }
            }
        }
    }
    else if (M == 64 && N == 64)
    {
        // miss为1180
        if (M == 64)
        {
            for (int i = 0; i < N; i += 8)
            {
                for (int j = 0; j < M; j += 8)
                {
                    // 将8*8的块中的前四行先转置
                    // 其中左上的4*4转置完仍然在左上4*4位置,右上的4*4转置完到B的右上4*4位置
                    // 矩阵在c++中行优先,此处索引与坐标系相反
                    for (int k = 0; k < 4; k++)
                    {
                        int row = i + k;

                        int a0 = A[row][j];
                        int a1 = A[row][j + 1];
                        int a2 = A[row][j + 2];
                        int a3 = A[row][j + 3];
                        int a4 = A[row][j + 4];
                        int a5 = A[row][j + 5];
                        int a6 = A[row][j + 6];
                        int a7 = A[row][j + 7];

                        B[j][row] = a0;
                        B[j + 1][row] = a1;
                        B[j + 2][row] = a2;
                        B[j + 3][row] = a3;
                        // B[j + 4][row] = a4;
                        // B[j + 5][row] = a5;
                        // B[j + 6][row] = a6;
                        // B[j + 7][row] = a7;
                        B[j][row + 4] = a4;
                        B[j + 1][row + 4] = a5;
                        B[j + 2][row + 4] = a6;
                        B[j + 3][row + 4] = a7;
                    }
                    // 将8*8的块中的后四行转置
                    // A左下转置完直接放放在B右上,B右上原有内容扔到左下
                    for (int k = 0; k < 4; k++)
                    {
                        // 读A左上内容
                        int col = j + k;
                        int a0 = A[i + 4][col];
                        int a1 = A[i + 5][col];
                        int a2 = A[i + 6][col];
                        int a3 = A[i + 7][col];
                        // 读B右上内容
                        int a4 = B[col][i + 4];
                        int a5 = B[col][i + 5];
                        int a6 = B[col][i + 6];
                        int a7 = B[col][i + 7];
                        // B[col + 4][i] = a4;
                        // B[col + 4][i + 1] = a5;
                        // B[col + 4][i + 2] = a6;
                        // B[col + 4][i + 3] = a7;

                        // A左下扔B右上
                        B[col][i + 4] = a0;
                        B[col][i + 5] = a1;
                        B[col][i + 6] = a2;
                        B[col][i + 7] = a3;
                        // B右上扔B左下
                        B[col + 4][i] = a4;
                        B[col + 4][i + 1] = a5;
                        B[col + 4][i + 2] = a6;
                        B[col + 4][i + 3] = a7;
                    }

                    // 读A右下的4*4块,转到B的右下位置
                    for (int k = 4; k < 8; k++)
                    {
                        int row = i + k;

                        int a0 = A[row][j + 4];
                        int a1 = A[row][j + 5];
                        int a2 = A[row][j + 6];
                        int a3 = A[row][j + 7];

                        B[j + 4][row] = a0;
                        B[j + 5][row] = a1;
                        B[j + 6][row] = a2;
                        B[j + 7][row] = a3;
                    }
                }
            }
        }
    }
    else if (M == 60 && N == 68)
    {
      for (int i = 0; i < 64; i += 8)
        {
            for (int j = 0; j < 56; j += 8)
            {
                for (int k = i; k < i + 8; k++)
                {
                    int a0 = A[k][j], a1 = A[k][j + 1], a2 = A[k][j + 2], a3 = A[k][j + 3];
                    int a4 = A[k][j + 4], a5 = A[k][j + 5], a6 = A[k][j + 6], a7 = A[k][j + 7];
                    B[j + k - i][i] = a0;
                    B[j + k - i][i + 1] = a1;
                    B[j + k - i][i + 2] = a2;
                    B[j + k - i][i + 3] = a3;
                    B[j + k - i][i + 4] = a4;
                    B[j + k - i][i + 5] = a5;
                    B[j + k - i][i + 6] = a6;
                    B[j + k - i][i + 7] = a7;
                }
                for (int k = 0; k < 8; k++)
                {
                    for (int t = 0; t < k; t++)
                    {
                        int a0 = B[j + k][i + t];
                        B[j + k][i + t] = B[j + t][i + k];
                        B[j + t][i + k] = a0;
                    }
                }
            }
        }
        for (int i = 0; i < N; i += 4)
        {
            for (int j = 56; j < M; j += 4)
            {
                for (int k = i; k < i + 4; k++)
                {
                    int a0 = A[k][j], a1 = A[k][j + 1], a2 = A[k][j + 2], a3 = A[k][j + 3];
                    B[j][k] = a0;
                    B[j + 1][k] = a1;
                    B[j + 2][k] = a2;
                    B[j + 3][k] = a3;
                }
            }
        }
        for (int i = 64; i < N; i += 4)
        {
            for (int j = 0; j < 56; j += 4)
            {
                for (int k = i; k < i + 4; k++)
                {
                    int a0 = A[k][j], a1 = A[k][j + 1], a2 = A[k][j + 2], a3 = A[k][j + 3];
                    B[j][k] = a0;
                    B[j + 1][k] = a1;
                    B[j + 2][k] = a2;
                    B[j + 3][k] = a3;
                }
            }
        }
    }
    else
    {
        int i, j, tmp;
        for (i = 0; i < N; i++)
        {
            for (j = 0; j < M; j++)
            {
                tmp = A[i][j];
                B[j][i] = tmp;
            }
        }
    }
    ENSURES(is_transpose(M, N, A, B));
}

char trans_desc[] = "Simple row-wise scan transpose";
void trans(int M, int N, int A[N][M], int B[M][N])
{
    int i, j, tmp;

    REQUIRES(M > 0);
    REQUIRES(N > 0);

    for (i = 0; i < N; i++)
    {
        for (j = 0; j < M; j++)
        {
            tmp = A[i][j];
            B[j][i] = tmp;
        }
    }

    ENSURES(is_transpose(M, N, A, B));
}

void registerFunctions()
{
   
    registerTransFunction(transpose_submit, transpose_submit_desc);

    registerTransFunction(trans, trans_desc);
}


int is_transpose(int M, int N, int A[N][M], int B[M][N])
{
    int i, j;

    for (i = 0; i < N; i++)
    {
        for (j = 0; j < M; ++j)
        {
            if (A[i][j] != B[j][i])
            {
                return 0;
            }
        }
    }
    return 1;
}
plaintext
2025秋ICS CacheLab
https://www.cxxdgc.cn/blog/courseblog/2025autumn/ics/cachelab
Author Cxxdgc
Published at 2025年11月17日
Comment seems to stuck. Try to refresh?✨