=OProfile on Debian #install oprofile on Debian > apt-get install oprofile #do not profile the kernel (no vmlinux image). > opcontrol --no-vmlinux >vi array.c #include #include #define ROWS 1000 #define COLS 1000 #define table_ref(table,row,col) ((table)[(row)*COLS+(col)]) typedef int* Table; void touch_row_col(Table from, Table to) { int i, j; for(i=0;i gcc -g array.c > opcontrol --reset > opcontrol --start && ./a.out ; opcontrol --stop #view the result > opreport CPU: AMD64 processors, speed 1795.54 MHz (estimated) Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 100000 CPU_CLK_UNHALT...| samples| %| ------------------ 1810 49.2249 no-vmlinux 1344 36.5515 a.out 213 5.7928 bash 181 4.9225 libc-2.3.6.so 107 2.9100 ld-2.3.6.so 16 0.4351 oprofiled 2 0.0544 grep 2 0.0544 libpthread-2.3.6.so 1 0.0272 mawk 1 0.0272 tr # restrict the report to specific programs. > opreport a.out /bin/grep CPU: AMD64 processors, speed 1795.54 MHz (estimated) Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 100000 CPU_CLK_UNHALT...| samples| %| ------------------ 1344 99.8514 a.out 2 0.1486 grep #-l shows per-symbol information > opreport -l a.out CPU: AMD64 processors, speed 1795.54 MHz (estimated) Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 100000 samples % symbol name 1157 86.0863 touch_col_row 187 13.9137 touch_row_col #-c shows the call graph > opreport -c a.out CPU: AMD64 processors, speed 1795.54 MHz (estimated) Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 100000 samples % symbol name ------------------------------------------------------------------------------- 1157 86.0863 touch_col_row 1157 100.000 touch_col_row [self] ------------------------------------------------------------------------------- 187 13.9137 touch_row_col 187 100.000 touch_row_col [self] 
------------------------------------------------------------------------------- #-s shows the annotated source code > opannotate -s a.out /* * Command line: opannotate -s a.out * * Interpretation of command line: * Output annotated source file with samples * Output all files * * CPU: AMD64 processors, speed 1795.54 MHz (estimated) * Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 100000 */ /* * Total samples for file : "/root/misc/array.c" * * 1344 100.000 */ :#include :#include :#define ROWS 1000 :#define COLS 1000 :#define table_ref(table,row,col) ((table)[(row)*COLS+(col)]) :typedef int* Table; :void touch_row_col(Table from, Table to) { /* touch_row_col total: 187 13.9137 */ : int i, j; : for(i=0;i opannotate -a a.out /* * Command line: opannotate -a a.out * * Interpretation of command line: * Output annotated assembly listing with samples * * CPU: AMD64 processors, speed 1795.54 MHz (estimated) * Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 100000 */ : :/root/misc/a.out: file format elf64-x86-64 : :Disassembly of section .init: :Disassembly of section .plt: :Disassembly of section .text: : 00000000004004e3 : /* touch_col_row total: 1157 86.0863 */ : 4004e3: push %rbp : 4004e4: mov %rsp,%rbp : 4004e7: mov %rdi,0xffffffffffffffe8(%rbp) : 4004eb: mov %rsi,0xffffffffffffffe0(%rbp) : 4004ef: movl $0x0,0xfffffffffffffff8(%rbp) : 4004f6: jmp 400543 : 4004f8: movl $0x0,0xfffffffffffffffc(%rbp) : 4004ff: jmp 400537 45 3.3482 : 400501: mov 0xfffffffffffffffc(%rbp),%eax : 400504: imul $0x3e8,%eax,%eax 370 27.5298 : 40050a: add 0xfffffffffffffff8(%rbp),%eax : 40050d: cltq : 40050f: shl $0x2,%rax 415 30.8780 : 400513: mov %rax,%rdx : 400516: add 0xffffffffffffffe8(%rbp),%rdx : 40051a: mov 0xfffffffffffffffc(%rbp),%eax 91 6.7708 : 40051d: imul $0x3e8,%eax,%eax 33 2.4554 : 400523: add 0xfffffffffffffff8(%rbp),%eax : 400526: cltq : 400528: shl $0x2,%rax 28 
2.0833 : 40052c: add 0xffffffffffffffe0(%rbp),%rax : 400530: mov (%rax),%eax : 400532: mov %eax,(%rdx) 172 12.7976 : 400534: incl 0xfffffffffffffffc(%rbp) : 400537: cmpl $0x3e7,0xfffffffffffffffc(%rbp) : 40053e: jle 400501 : 400540: incl 0xfffffffffffffff8(%rbp) 3 0.2232 : 400543: cmpl $0x3e7,0xfffffffffffffff8(%rbp) : 40054a: jle 4004f8 : 40054c: leaveq : 40054d: retq :Disassembly of section .fini: : :/root/misc/a.out: file format elf64-x86-64 : :Disassembly of section .init: :Disassembly of section .plt: :Disassembly of section .text: : 0000000000400478 : /* touch_row_col total: 187 13.9137 */ : 400478: push %rbp : 400479: mov %rsp,%rbp : 40047c: mov %rdi,0xffffffffffffffe8(%rbp) : 400480: mov %rsi,0xffffffffffffffe0(%rbp) : 400484: movl $0x0,0xfffffffffffffff8(%rbp) : 40048b: jmp 4004d8 : 40048d: movl $0x0,0xfffffffffffffffc(%rbp) : 400494: jmp 4004cc 15 1.1161 : 400496: mov 0xfffffffffffffff8(%rbp),%eax : 400499: imul $0x3e8,%eax,%eax 28 2.0833 : 40049f: add 0xfffffffffffffffc(%rbp),%eax : 4004a2: cltq : 4004a4: shl $0x2,%rax 21 1.5625 : 4004a8: mov %rax,%rdx : 4004ab: add 0xffffffffffffffe8(%rbp),%rdx : 4004af: mov 0xfffffffffffffff8(%rbp),%eax 41 3.0506 : 4004b2: imul $0x3e8,%eax,%eax 10 0.7440 : 4004b8: add 0xfffffffffffffffc(%rbp),%eax : 4004bb: cltq : 4004bd: shl $0x2,%rax 18 1.3393 : 4004c1: add 0xffffffffffffffe0(%rbp),%rax 1 0.0744 : 4004c5: mov (%rax),%eax 2 0.1488 : 4004c7: mov %eax,(%rdx) 48 3.5714 : 4004c9: incl 0xfffffffffffffffc(%rbp) 3 0.2232 : 4004cc: cmpl $0x3e7,0xfffffffffffffffc(%rbp) : 4004d3: jle 400496 : 4004d5: incl 0xfffffffffffffff8(%rbp) : 4004d8: cmpl $0x3e7,0xfffffffffffffff8(%rbp) : 4004df: jle 40048d : 4004e1: leaveq : 4004e2: retq :Disassembly of section .fini: #opcontrol --event= selects the event to count. Here we look at data cache misses. > opcontrol --shutdown > opcontrol --reset > opcontrol --event=DATA_CACHE_MISSES:500 > opcontrol --start && ./a.out ; opcontrol --stop #you can see that most of the cache misses happen in touch_col_row. 
> opreport -l ./a.out CPU: AMD64 processors, speed 1795.54 MHz (estimated) Counted DATA_CACHE_MISSES events (Data cache misses) with a unit mask of 0x00 (No unit mask) count 500 samples % symbol name 3329 92.8850 touch_col_row 255 7.1150 touch_row_col #manual http://oprofile.sourceforge.net/doc/index.html