// means roughly we can monitor the time duration with up to 200 cycles * SAMPLE_SIZE
#define SAMPLE_SIZE 0x40000

/*
 * Sample buffer of SAMPLE_SIZE 16-bit slots. The timing loop in main() both
 * reads from it and stores its 16-bit cycle deltas into it.
 */
unsigned short *a;

#define HTT 0x10000000

/*
 * Prints two CPUID leaves (0x80000000 and 2) and exits.
 *
 * The remainder of the function (a cache timing loop and a dump of the
 * collected samples) is currently unreachable because of the early
 * `return 0;` after the CPUID printout; see the NOTE(review) there.
 *
 * Returns 0 on success, EXIT_FAILURE if the sample buffer cannot be allocated.
 */
int main(void)
{
    unsigned int sample_size_in_byte;
    unsigned int cpu_eax, cpu_ebx, cpu_ecx, cpu_edx;

    sample_size_in_byte = SAMPLE_SIZE * sizeof *a;
    a = malloc(sample_size_in_byte);
    if (a == NULL) {
        fprintf(stderr, "failed to allocate %u bytes for the sample buffer\n",
                sample_size_in_byte);
        return EXIT_FAILURE;
    }

    /* Zero every slot: the timing loop adds buffer contents into the low
     * half of its base pointer, so the buffer must start out as all zeros. */
    for (unsigned int i = 0; i < SAMPLE_SIZE; i++) {
        a[i] = 0;
    }

    /* CPUID leaf 0x80000000. */
    __asm__ volatile (
        "cpuid"
        : "=a"(cpu_eax), "=b"(cpu_ebx), "=c"(cpu_ecx), "=d"(cpu_edx)
        : "a"(0x80000000)
    );
    printf("\nGot CPU info: EAX %x, EBX %x, ECX %x, EDX %x\n",
           cpu_eax, cpu_ebx, cpu_ecx, cpu_edx);

    /* CPUID leaf 2. */
    __asm__ volatile (
        "cpuid"
        : "=a"(cpu_eax), "=b"(cpu_ebx), "=c"(cpu_ecx), "=d"(cpu_edx)
        : "a"(2)
    );
    printf("\nGot cache info: EAX %x, EBX %x, ECX %x, EDX %x\n",
           cpu_eax, cpu_ebx, cpu_ecx, cpu_edx);

    /*
     * NOTE(review): this unconditional return makes everything below dead
     * code. It looks like a debugging leftover, but it is kept so that the
     * program's behaviour is unchanged. The timing loop below addresses the
     * buffer through 32-bit registers, so it presumably only works in a
     * 32-bit x86 build -- confirm before removing this return.
     */
    return 0;

    /*
    if (fork()==0)
    {
        execl("/usr/bin/openssl", "rsautl", "-in", "zzz_test", "-out","zzz_out_z",
              "-inkey", "priv_10k.key", "-sign", (char *)0 );
        exit(0);
    }
    */

    /*
     * Timing loop. Register roles:
     *   ecx - base address of the sample buffer
     *   ebx - remaining byte budget (buffer size minus 0x4000)
     *   esi - low 32 bits of the previous time-stamp counter reading
     *   edi - byte offset of the current 16-bit sample slot
     * For each slot it prefetches one address, loads from eight addresses
     * 0x800 bytes apart, and stores the low 16 bits of the elapsed TSC delta
     * into the slot at (ecx + edi).
     *
     * Both inputs use "m" constraints on lvalues: nearly every general
     * register is clobbered, so the operands are read from memory.
     */
    __asm__ volatile (
        "movl %0, %%ecx\n\t"
        "movl %1, %%ebx\n\t"
        /* 0x4000 = 8 * 0x800; original note: one way of the L1 cache
         * should be 0x800 bytes. */
        "subl $0x4000, %%ebx\n\t"
        "rdtsc\n\t"
        "mov %%eax, %%esi\n\t"
        "xor %%edi, %%edi\n"
        "1:\n\t"
        "prefetcht2 0x4800(%%ecx, %%edi, 1)\n\t"
        /* Eight loads at a 0x800-byte stride. Each adds a 16-bit buffer
         * value into cx, i.e. the low half of the base pointer, which is
         * why the buffer must hold zeros at these addresses. */
        "add 0x0000(%%ecx, %%edi, 1), %%cx\n\t"
        "add 0x0800(%%ecx, %%edi, 1), %%cx\n\t"
        "add 0x1000(%%ecx, %%edi, 1), %%cx\n\t"
        "add 0x1800(%%ecx, %%edi, 1), %%cx\n\t"
        /* imul by 1 leaves ecx's value unchanged; presumably it is here
         * for its latency/dependency effect -- TODO confirm. */
        "add 0x2000(%%ecx, %%edi, 1), %%cx\n\t"
        "imul $1, %%ecx\n\t"
        "add 0x2800(%%ecx, %%edi, 1), %%cx\n\t"
        "imul $1, %%ecx\n\t"
        "add 0x3000(%%ecx, %%edi, 1), %%cx\n\t"
        "imul $1, %%ecx\n\t"
        "add 0x3800(%%ecx, %%edi, 1), %%cx\n\t"
        "imul $1, %%ecx\n\t"
        /* Elapsed cycles since the previous reading; keep the low 16 bits. */
        "rdtsc\n\t"
        "sub %%esi, %%eax\n\t"
        "mov %%ax, (%%ecx, %%edi, 1)\n\t"
        /* esi += delta, so esi again holds the latest TSC low word. */
        "add %%eax, %%esi\n\t"
        "imul $1, %%ecx\n\t"
        /* Advance by one L1 cache line (64 bytes). */
        "add $0x40, %%edi\n\t"
        /* Bits 0x7C0 select the L1 cache set; the low zero bits are
         * determined by the cache line size. Loop over the 32 sets. */
        "test $0x7c0, %%edi\n\t"
        "jnz 1b\n\t"
        /* Rewind 0x800 and step 2 bytes: next 16-bit slot within each line. */
        "sub $0x7FE, %%edi\n\t"
        /* A line holds (line size) / (2 bytes per sample) slots, so this
         * repeats 32 times: 32 * 32 samples per 0x800-byte way. */
        "test $0x3E, %%edi\n\t"
        "jnz 1b\n\t"
        /* edi is 0x40 here; move it to the start of the next 0x800-byte way. */
        "add $0x7C0, %%edi\n\t"
        /* Each way holds 2 KiB; continue while the budget is non-negative. */
        "sub $0x800, %%ebx\n\t"
        "jge 1b"
        :
        /* the starting address of the sample array and the sample size */
        : "m"(a), "m"(sample_size_in_byte)
        : "eax", "ebx", "ecx", "edx", "esi", "edi", "memory", "cc"
    );

    /*
     * Dump the samples: one group per 2048-byte way (1024 slots), printed as
     * 32 rows of 32 values. Row j takes slot j of each of the 32 cache lines
     * in the way (lines are 32 slots apart).
     */
    for (unsigned int i = 0; i < (sample_size_in_byte / 2048); i++) {
        for (unsigned int j = 0; j < 32; j++) {
            printf("\n");
            for (unsigned int k = 0; k < 32; k++) {
                printf("%u\t", (unsigned int)a[i * 1024 + j + 32 * k]);
            }
        }
    }

    free(a);
    a = NULL;
    return 0;
}