要干什么?#
- 你需要在handout目录下新建一个csim.c文件,并使用C语言写一个程序模拟高速缓存。
- 修改trans.c文件,优化矩阵转置函数,减少缓存不命中的数量。(对三种大小32*32、64*64、60*68分别进行优化)
测试指令#
partA
make
./test-csimbashpartB
# 下面单个指令是对三种矩阵大小的测试指令
make
./test-trans -M 32 -N 32
./test-trans -M 64 -N 64
./test-trans -M 60 -N 68bashpartA#
这部分较为简单,主要困难的是输入输出、参数解析等我们并不熟知的C语言函数用法。这里给出除了核心cache部分以外均已实现的代码。
代码框架#
请注意,这里的代码框架仅为了帮助你更好的关注于高速缓存本身,请不要直接抄袭!
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <string.h>
#include "cachelab.h"
// 缓存的每一行
struct cacheline
{
// TODO:缓存每一行的架构
};
// 缓存的每一组
struct cacheset
{
// TODO:缓存每一组的架构
};
// 缓存
struct cache
{
// TODO:缓存架构
};
// 命中数量计算
int hit_count = 0;
int miss_count = 0;
int eviction_count = 0;
// 打印帮助
void print_help()
{
printf("Usage: ./csim [-hv] -s <s> -E <E> -b <b> -t <tracefile>\n");
printf("Options:\n");
printf(" -h Print this help message.\n");
printf(" -v Optional verbose flag.\n");
printf(" -s <s> Number of set index bits.\n");
printf(" -E <E> Associativity (number of lines per set).\n");
printf(" -b <b> Number of block bits.\n");
printf(" -t <tracefile> Name of the valgrind trace to replay.\n");
return;
}
// 输入统一解析函数
void parse_args(int argc, char *argv[], int *s, int *E, int *b, char **tracefile, int *verbose)
{
int opt;
while ((opt = getopt(argc, argv, "hvs:E:b:t:")) != -1)
{
switch (opt)
{
case 'h':
print_help();
exit(0);
case 'v':
*verbose = 1;
break;
case 's':
*s = atoi(optarg);
break;
case 'E':
*E = atoi(optarg);
break;
case 'b':
*b = atoi(optarg);
break;
case 't':
*tracefile = optarg;
break;
default:
print_help();
exit(1);
}
}
}
// 初始化缓存
struct cache *init_cache(int s, int E, int b)
{
// TODO:建立缓存,为缓存分配内存空间
// 返回缓存的指针
}
// 清理缓存
void delete_cache(struct cache *cache)
{
// TODO:删除缓存,回收所有内存空间
}
// 模拟缓存访问
void l_cache(struct cache *cc, unsigned long address, int verbose)
{
// TODO:访问缓存的具体模拟操作
// 其中需要包括命中、不命中(驱逐+存入新的),并在相应情况下增加命中(hit_count)、不命中(miss_count)、驱逐(eviction_count)的计数
}
// 模拟缓存运行主线
void do_cache_function(char *file, struct cache *cc, int verbose)
{
FILE *fp = fopen(file, "r");
if (!fp)
{
fprintf(stderr, "文件读取错误!\n");
exit(1);
}
char op[256];
// 读入文件指令
while (fgets(op, sizeof(op), fp))
{
if (op[0] == 'I')
continue;
char operation;
unsigned long address;
int size;
if (sscanf(op, " %c %lx,%d", &operation, &address, &size) == 3)
{
if (verbose)
printf("%c %lx,%d ", operation, address, size);
switch (operation)
{
// 加载
case 'L':
l_cache(cc, address, verbose);
if (verbose)
printf("\n");
break;
// 存储
case 'S':
l_cache(cc, address, verbose);
if (verbose)
printf("\n");
break;
// 修改(加载+储存)
case 'M':
l_cache(cc, address, verbose);
l_cache(cc, address, verbose);
if (verbose)
printf("\n");
break;
}
}
}
fclose(fp);
return;
}
int main(int argc, char *argv[])
{
int s, E, b, verbose = 0;
char *tracefile = NULL;
// 解析输入
parse_args(argc, argv, &s, &E, &b, &tracefile, &verbose);
// 初始化缓存
struct cache *cc = init_cache(s, E, b);
// 模拟缓存运行
do_cache_function(tracefile, cc, verbose);
// 输出结果
printSummary(hit_count, miss_count, eviction_count);
// 清理内存
delete_cache(cc);
return 0;
}plaintextpartB#
32*32#
这里使用上课时讲解的矩阵分块技术,将矩阵切分成8*8的块依次进行转置即可通过
64*64#
这里首先尝试沿用之前的8*8分块,会发现miss极高,几乎没有优化。这是因为64*64时若再按8*8分块处理会有冲突,因为缓存总容量为256int,a[0][0]和a[0][4]会映射到同一缓存行,导致冲突不命中
于是很自然的尝试4*4分块处理
for (int i = 0; i < N; i += 4)
{
for (int j = 0; j < M; j += 4)
{
for (int k = i; k < i + 4; k++)
{
int a0 = A[k][j], a1 = A[k][j + 1], a2 = A[k][j + 2], a3 = A[k][j + 3];
B[j][k] = a0;
B[j + 1][k] = a1;
B[j + 2][k] = a2;
B[j + 3][k] = a3;
}
}
}plaintext此时测得miss为1700。这里由于分块小导致在分块间切换时会造成的miss,从而造成总损耗较大。
于是自然的,为了减小分块带来的损耗,可以将其先分为8*8块,后再在8*8块内进行优化避免内部的冲突不命中。具体方法如下:
- 将8*8的块中的前四行先转置,其中左上的4*4转置完仍然在左上4*4位置,右上的4*4转置完到B的右上4*4位置
- 将8*8的块中的后四行转置
- A左下转置完直接放放在B右上,B右上原有内容扔到左下
- 读A右下的4*4块,转到B的右下位置
60*68#
首先自然的尝试使用4*4分块进行处理,发现miss为1685,离满分先1600仅差85。于是基于上面两种方法做一点点优化即可。
完整代码#
!!!注意:ICSlab存在查重,请勿直接抄袭此处的完整代码,否则后果自负!
csim.c#
// 使用c语言模拟高速缓存。使用cacheline、cacheset、cache三种结构体表示缓存的行、组和整体。
// 利用lru进行缓存替换。每次操作会给该组内所有行lru加1,命中则归零,则lru最大者即为最少使用者
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <string.h>
#include "cachelab.h"
// 缓存的每一行
struct cacheline
{
int valid;
unsigned long tag;
int lru;
};
// 缓存的每一组
struct cacheset
{
struct cacheline **lines;
};
// 缓存
struct cache
{
int s;
int E;
int b;
int t;
struct cacheset **sets;
};
// 命中数量计算
int hit_count = 0;
int miss_count = 0;
int eviction_count = 0;
// 打印帮助
void print_help()
{
printf("Usage: ./csim [-hv] -s <s> -E <E> -b <b> -t <tracefile>\n");
printf("Options:\n");
printf(" -h Print this help message.\n");
printf(" -v Optional verbose flag.\n");
printf(" -s <s> Number of set index bits.\n");
printf(" -E <E> Associativity (number of lines per set).\n");
printf(" -b <b> Number of block bits.\n");
printf(" -t <tracefile> Name of the valgrind trace to replay.\n");
return;
}
// 输入统一解析函数
void parse_args(int argc, char *argv[], int *s, int *E, int *b, char **tracefile, int *verbose)
{
int opt;
while ((opt = getopt(argc, argv, "hvs:E:b:t:")) != -1)
{
switch (opt)
{
case 'h':
print_help();
exit(0);
case 'v':
*verbose = 1;
break;
case 's':
*s = atoi(optarg);
break;
case 'E':
*E = atoi(optarg);
break;
case 'b':
*b = atoi(optarg);
break;
case 't':
*tracefile = optarg;
break;
default:
print_help();
exit(1);
}
}
}
// 初始化缓存
struct cache *init_cache(int s, int E, int b)
{
struct cache *cc = malloc(sizeof(struct cache));
cc->s = s;
cc->E = E;
cc->b = b;
cc->t = 64 - s - b;
int S = 1 << s;
cc->sets = malloc(S * sizeof(struct cacheset *));
for (int i = 0; i < S; i++)
{
// 初始化每一组
cc->sets[i] = malloc(sizeof(struct cacheset));
cc->sets[i]->lines = malloc(E * sizeof(struct cacheline *));
// 初始化组中的每一行
for (int j = 0; j < E; j++)
{
cc->sets[i]->lines[j] = malloc(sizeof(struct cacheline));
cc->sets[i]->lines[j]->valid = 0;
cc->sets[i]->lines[j]->tag = 0;
cc->sets[i]->lines[j]->lru = 0;
}
}
return cc;
}
// 清理缓存
void delete_cache(struct cache *cache)
{
// 计算组数
int S = 1 << cache->s;
// 逐行
for (int i = 0; i < S; i++)
{
for (int j = 0; j < cache->E; j++)
{
free(cache->sets[i]->lines[j]);
}
free(cache->sets[i]->lines);
free(cache->sets[i]);
}
free(cache->sets);
free(cache);
}
// 模拟缓存访问
void l_cache(struct cache *cc, unsigned long address, int verbose)
{
// 切分出标记、组索引
unsigned long mask = (1UL << (cc->s)) - 1;
unsigned long tag = (address >> (cc->b + cc->s));
int index = (address >> (cc->b)) & mask;
// 找到对应组
struct cacheset *set = cc->sets[index];
// 查找是否命中
int hit = 0, empty_line = -1, lru_line = 0, max_lru = -1;
int E = cc->E;
for (int i = 0; i < E; i++)
{
// 时间增加
set->lines[i]->lru++;
// 检查是否命中(有效+标记相同)
if (set->lines[i]->valid && (set->lines[i]->tag == tag))
{
// 命中
hit = 1;
// 命中的缓存行lru归零
set->lines[i]->lru = 0;
if (verbose)
printf("hit ");
}
// 检查记录空行
if (!set->lines[i]->valid)
{
empty_line = i;
}
// 检查记录LRU最大的行
if (set->lines[i]->lru > max_lru)
{
max_lru = set->lines[i]->lru;
lru_line = i;
}
}
if (hit)
{
hit_count++;
return;
}
// 未命中
miss_count++;
if (verbose)
{
printf("miss ");
}
if (empty_line != -1)
{
// 缓存未满,直接存
set->lines[empty_line]->valid = 1;
set->lines[empty_line]->tag = tag;
set->lines[empty_line]->lru = 0;
}
else
{
// 缓存已满,驱逐lru最大的,即最久未访问的
eviction_count++;
if (verbose)
printf("eviction ");
set->lines[lru_line]->tag = tag;
set->lines[lru_line]->lru = 0;
}
return;
}
// 模拟缓存运行主线
void do_cache_function(char *file, struct cache *cc, int verbose)
{
FILE *fp = fopen(file, "r");
if (!fp)
{
fprintf(stderr, "文件读取错误!\n");
exit(1);
}
char op[256];
// 读入文件指令
while (fgets(op, sizeof(op), fp))
{
if (op[0] == 'I')
continue;
char operation;
unsigned long address;
int size;
if (sscanf(op, " %c %lx,%d", &operation, &address, &size) == 3)
{
if (verbose)
printf("%c %lx,%d ", operation, address, size);
switch (operation)
{
// 加载
case 'L':
l_cache(cc, address, verbose);
if (verbose)
printf("\n");
break;
// 存储
case 'S':
l_cache(cc, address, verbose);
if (verbose)
printf("\n");
break;
// 修改(加载+储存)
case 'M':
l_cache(cc, address, verbose);
l_cache(cc, address, verbose);
if (verbose)
printf("\n");
break;
}
}
}
fclose(fp);
return;
}
int main(int argc, char *argv[])
{
int s, E, b, verbose = 0;
char *tracefile = NULL;
// 解析输入
parse_args(argc, argv, &s, &E, &b, &tracefile, &verbose);
// 初始化缓存
struct cache *cc = init_cache(s, E, b);
// 模拟缓存运行
do_cache_function(tracefile, cc, verbose);
// 输出结果
printSummary(hit_count, miss_count, eviction_count);
// 清理内存
delete_cache(cc);
return 0;
}plaintexttarns.c#
#include <stdio.h>
#include "cachelab.h"
#include "contracts.h"
int is_transpose(int M, int N, int A[N][M], int B[M][N]);
char transpose_submit_desc[] = "Transpose submission";
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
REQUIRES(M > 0);
REQUIRES(N > 0);
if (M == 32 && N == 32)
{
// 32*32的处理
// 使用矩阵分块的,分成8*8的块进行处理。
for (int i = 0; i < N; i += 8)
{
for (int j = 0; j < M; j += 8)
{
for (int k = i; k < i + 8; k++)
{
int a0 = A[k][j], a1 = A[k][j + 1], a2 = A[k][j + 2], a3 = A[k][j + 3];
int a4 = A[k][j + 4], a5 = A[k][j + 5], a6 = A[k][j + 6], a7 = A[k][j + 7];
B[j][k] = a0;
B[j + 1][k] = a1;
B[j + 2][k] = a2;
B[j + 3][k] = a3;
B[j + 4][k] = a4;
B[j + 5][k] = a5;
B[j + 6][k] = a6;
B[j + 7][k] = a7;
}
}
}
}
else if (M == 64 && N == 64)
{
// miss为1180
if (M == 64)
{
for (int i = 0; i < N; i += 8)
{
for (int j = 0; j < M; j += 8)
{
// 将8*8的块中的前四行先转置
// 其中左上的4*4转置完仍然在左上4*4位置,右上的4*4转置完到B的右上4*4位置
// 矩阵在c++中行优先,此处索引与坐标系相反
for (int k = 0; k < 4; k++)
{
int row = i + k;
int a0 = A[row][j];
int a1 = A[row][j + 1];
int a2 = A[row][j + 2];
int a3 = A[row][j + 3];
int a4 = A[row][j + 4];
int a5 = A[row][j + 5];
int a6 = A[row][j + 6];
int a7 = A[row][j + 7];
B[j][row] = a0;
B[j + 1][row] = a1;
B[j + 2][row] = a2;
B[j + 3][row] = a3;
// B[j + 4][row] = a4;
// B[j + 5][row] = a5;
// B[j + 6][row] = a6;
// B[j + 7][row] = a7;
B[j][row + 4] = a4;
B[j + 1][row + 4] = a5;
B[j + 2][row + 4] = a6;
B[j + 3][row + 4] = a7;
}
// 将8*8的块中的后四行转置
// A左下转置完直接放放在B右上,B右上原有内容扔到左下
for (int k = 0; k < 4; k++)
{
// 读A左上内容
int col = j + k;
int a0 = A[i + 4][col];
int a1 = A[i + 5][col];
int a2 = A[i + 6][col];
int a3 = A[i + 7][col];
// 读B右上内容
int a4 = B[col][i + 4];
int a5 = B[col][i + 5];
int a6 = B[col][i + 6];
int a7 = B[col][i + 7];
// B[col + 4][i] = a4;
// B[col + 4][i + 1] = a5;
// B[col + 4][i + 2] = a6;
// B[col + 4][i + 3] = a7;
// A左下扔B右上
B[col][i + 4] = a0;
B[col][i + 5] = a1;
B[col][i + 6] = a2;
B[col][i + 7] = a3;
// B右上扔B左下
B[col + 4][i] = a4;
B[col + 4][i + 1] = a5;
B[col + 4][i + 2] = a6;
B[col + 4][i + 3] = a7;
}
// 读A右下的4*4块,转到B的右下位置
for (int k = 4; k < 8; k++)
{
int row = i + k;
int a0 = A[row][j + 4];
int a1 = A[row][j + 5];
int a2 = A[row][j + 6];
int a3 = A[row][j + 7];
B[j + 4][row] = a0;
B[j + 5][row] = a1;
B[j + 6][row] = a2;
B[j + 7][row] = a3;
}
}
}
}
}
else if (M == 60 && N == 68)
{
for (int i = 0; i < 64; i += 8)
{
for (int j = 0; j < 56; j += 8)
{
for (int k = i; k < i + 8; k++)
{
int a0 = A[k][j], a1 = A[k][j + 1], a2 = A[k][j + 2], a3 = A[k][j + 3];
int a4 = A[k][j + 4], a5 = A[k][j + 5], a6 = A[k][j + 6], a7 = A[k][j + 7];
B[j + k - i][i] = a0;
B[j + k - i][i + 1] = a1;
B[j + k - i][i + 2] = a2;
B[j + k - i][i + 3] = a3;
B[j + k - i][i + 4] = a4;
B[j + k - i][i + 5] = a5;
B[j + k - i][i + 6] = a6;
B[j + k - i][i + 7] = a7;
}
for (int k = 0; k < 8; k++)
{
for (int t = 0; t < k; t++)
{
int a0 = B[j + k][i + t];
B[j + k][i + t] = B[j + t][i + k];
B[j + t][i + k] = a0;
}
}
}
}
for (int i = 0; i < N; i += 4)
{
for (int j = 56; j < M; j += 4)
{
for (int k = i; k < i + 4; k++)
{
int a0 = A[k][j], a1 = A[k][j + 1], a2 = A[k][j + 2], a3 = A[k][j + 3];
B[j][k] = a0;
B[j + 1][k] = a1;
B[j + 2][k] = a2;
B[j + 3][k] = a3;
}
}
}
for (int i = 64; i < N; i += 4)
{
for (int j = 0; j < 56; j += 4)
{
for (int k = i; k < i + 4; k++)
{
int a0 = A[k][j], a1 = A[k][j + 1], a2 = A[k][j + 2], a3 = A[k][j + 3];
B[j][k] = a0;
B[j + 1][k] = a1;
B[j + 2][k] = a2;
B[j + 3][k] = a3;
}
}
}
}
else
{
int i, j, tmp;
for (i = 0; i < N; i++)
{
for (j = 0; j < M; j++)
{
tmp = A[i][j];
B[j][i] = tmp;
}
}
}
ENSURES(is_transpose(M, N, A, B));
}
char trans_desc[] = "Simple row-wise scan transpose";
void trans(int M, int N, int A[N][M], int B[M][N])
{
int i, j, tmp;
REQUIRES(M > 0);
REQUIRES(N > 0);
for (i = 0; i < N; i++)
{
for (j = 0; j < M; j++)
{
tmp = A[i][j];
B[j][i] = tmp;
}
}
ENSURES(is_transpose(M, N, A, B));
}
void registerFunctions()
{
registerTransFunction(transpose_submit, transpose_submit_desc);
registerTransFunction(trans, trans_desc);
}
int is_transpose(int M, int N, int A[N][M], int B[M][N])
{
int i, j;
for (i = 0; i < N; i++)
{
for (j = 0; j < M; ++j)
{
if (A[i][j] != B[j][i])
{
return 0;
}
}
}
return 1;
}
plaintext