試玩 USDT

準備工作

裝上工具及 headers

$ sudo apt-get install systemtap-sdt-dev

systemtap 提供了一個叫 dtrace 的工具, 可用與 dtrace 相同語法的 .d 檔產生 tracepoint 定義, 如下

provider my_app
{
    probe loop0(int count);
};

產生 probes.o, 裡面是用來防止無謂的 tracepoint 參數準備用的 semaphore

$ dtrace -s probes.d -G

產生 probes.h 供程序 include

$ dtrace -s probes.d -h

實驗一

單純的 loop, 沒有優化 (-O0), 沒有 tracepoint

/* test.c */
#include "probes.h"

int main()
{
    int v = 0;
    for(; v < (1 << 30); v ++) {
        /*if(MY_APP_LOOP0_ENABLED()) {*/
            /*MY_APP_LOOP0(v);*/
        /*}*/
    }

    return 0;
}

Build

$ gcc -O0 -ggdb3 -o test test.c

產生的 assembly

00000000004004b6 <main>:
#include "probes.h"

int main()
{
  4004b6:       55                      push   %rbp
  4004b7:       48 89 e5                mov    %rsp,%rbp
    int v = 0;
  4004ba:       c7 45 fc 00 00 00 00    movl   $0x0,-0x4(%rbp)
    for(; v < (1 << 30); v ++) {
  4004c1:       eb 04                   jmp    4004c7 <main+0x11>
  4004c3:       83 45 fc 01             addl   $0x1,-0x4(%rbp)
  4004c7:       81 7d fc ff ff ff 3f    cmpl   $0x3fffffff,-0x4(%rbp)
  4004ce:       7e f3                   jle    4004c3 <main+0xd>
        /*if(MY_APP_LOOP0_ENABLED()) {*/
            /*MY_APP_LOOP0(v);*/
        /*}*/
    }

    return 0;
  4004d0:       b8 00 00 00 00          mov    $0x0,%eax
}
  4004d5:       5d                      pop    %rbp
  4004d6:       c3                      retq   

執行時間

$ time ./test 

real    0m2.351s
user    0m2.340s
sys     0m0.000s
$ time ./test 

real    0m2.353s
user    0m2.336s
sys     0m0.000s
$ time ./test 

real    0m2.361s
user    0m2.340s
sys     0m0.000s
$ time ./test 

real    0m2.346s
user    0m2.336s
sys     0m0.000s
$ time ./test 

real    0m2.357s
user    0m2.348s
sys     0m0.000s
$ time ./test 

real    0m2.353s
user    0m2.336s
sys     0m0.000s

去掉最高及最低的, 平均為 2.352

實驗二

loop + check + tracepoint, 沒有優化 (-O0)

/* test.c */
#include "probes.h"

int main()
{
    int v = 0;
    for(; v < (1 << 30); v ++) {
        if(MY_APP_LOOP0_ENABLED()) {
            MY_APP_LOOP0(v);
        }
    }

    return 0;
}

Build, 用上了 MY_APP_LOOP0_ENABLED(), 會參考到 semaphore, 要跟 probes.o link 一塊

$ gcc -O0 -ggdb3 -o test test.c probes.o

產生的 assembly

00000000004004b6 <main>:
#include "probes.h"

int main()
{
  4004b6:       55                      push   %rbp
  4004b7:       48 89 e5                mov    %rsp,%rbp
    int v = 0;
  4004ba:       c7 45 fc 00 00 00 00    movl   $0x0,-0x4(%rbp)
    for(; v < (1 << 30); v ++) {
  4004c1:       eb 14                   jmp    4004d7 <main+0x21>
        if(MY_APP_LOOP0_ENABLED()) {
  4004c3:       0f b7 05 2e 04 20 00    movzwl 0x20042e(%rip),%eax        # 6008f8 <__TMC_END__>
  4004ca:       0f b7 c0                movzwl %ax,%eax
  4004cd:       48 85 c0                test   %rax,%rax
  4004d0:       74 01                   je     4004d3 <main+0x1d>
            MY_APP_LOOP0(v);
  4004d2:       90                      nop
#include "probes.h"

int main()
{
    int v = 0;
    for(; v < (1 << 30); v ++) {
  4004d3:       83 45 fc 01             addl   $0x1,-0x4(%rbp)
  4004d7:       81 7d fc ff ff ff 3f    cmpl   $0x3fffffff,-0x4(%rbp)
  4004de:       7e e3                   jle    4004c3 <main+0xd>
        if(MY_APP_LOOP0_ENABLED()) {
            MY_APP_LOOP0(v);
        }
    }

    return 0;
  4004e0:       b8 00 00 00 00          mov    $0x0,%eax
}
  4004e5:       5d                      pop    %rbp
  4004e6:       c3                      retq   

執行時間

$ time ./test                                   

real    0m2.707s
user    0m2.692s
sys     0m0.000s
$ time ./test 

real    0m2.691s
user    0m2.680s
sys     0m0.000s
$ time ./test 

real    0m2.673s
user    0m2.664s
sys     0m0.000s
$ time ./test 

real    0m2.695s
user    0m2.680s
sys     0m0.000s
$ time ./test 

real    0m2.684s
user    0m2.680s
sys     0m0.000s
$ time ./test 

real    0m2.694s
user    0m2.680s
sys     0m0.000s

去掉最高及最低的, 平均為 2.685, 時間增加為無 check tracepoint 是否 enable 時的 114.12%

實驗三

loop + tracepoint, 沒有優化 (-O0)

/* test.c */
#include "probes.h"

int main()
{
    int v = 0;
    for(; v < (1 << 30); v ++) {
        /*if(MY_APP_LOOP0_ENABLED()) {*/
            MY_APP_LOOP0(v);
        /*}*/
    }

    return 0;
}

Build, 用上了 MY_APP_LOOP0_ENABLED(), 會參考到 semaphore, 要跟 probes.o link 一塊

$ gcc -O0 -ggdb3 -o test test.c probes.o

產生的 assembly

00000000004004b6 <main>:
#include "probes.h"

int main()
{
  4004b6:       55                      push   %rbp
  4004b7:       48 89 e5                mov    %rsp,%rbp
    int v = 0;
  4004ba:       c7 45 fc 00 00 00 00    movl   $0x0,-0x4(%rbp)
    for(; v < (1 << 30); v ++) {
  4004c1:       eb 05                   jmp    4004c8 <main+0x12>
        /*if(MY_APP_LOOP0_ENABLED()) {*/
            MY_APP_LOOP0(v);
  4004c3:       90                      nop
#include "probes.h"

int main()
{
    int v = 0;
    for(; v < (1 << 30); v ++) {
  4004c4:       83 45 fc 01             addl   $0x1,-0x4(%rbp)
  4004c8:       81 7d fc ff ff ff 3f    cmpl   $0x3fffffff,-0x4(%rbp)
  4004cf:       7e f2                   jle    4004c3 <main+0xd>
        /*if(MY_APP_LOOP0_ENABLED()) {*/
            MY_APP_LOOP0(v);
        /*}*/
    }

    return 0;
  4004d1:       b8 00 00 00 00          mov    $0x0,%eax
}
  4004d6:       5d                      pop    %rbp
  4004d7:       c3                      retq   

執行時間

$ time ./test                                                                                                    

real    0m2.468s
user    0m2.460s
sys     0m0.000s
$ time ./test 

real    0m2.476s
user    0m2.460s
sys     0m0.000s
$ time ./test 

real    0m2.454s
user    0m2.448s
sys     0m0.000s
$ time ./test 

real    0m2.473s
user    0m2.456s
sys     0m0.000s
$ time ./test 

real    0m2.477s
user    0m2.460s
sys     0m0.000s
$ time ./test 

real    0m2.462s
user    0m2.456s
sys     0m0.000s

去掉最高及最低的, 平均為 2.4675, 時間增加為實驗一的 104.9%, 為實驗二的 91.9%

實驗四

loop + tracepoint, -O1

/* test.c */
#include "probes.h"

int main()
{
    int v = 0;
    for(; v < (1 << 30); v ++) {
        /*if(MY_APP_LOOP0_ENABLED()) {*/
            MY_APP_LOOP0(v);
        /*}*/
    }

    return 0;
}

Build, 用上了 MY_APP_LOOP0_ENABLED(), 會參考到 semaphore, 要跟 probes.o link 一塊

$ gcc -O0 -ggdb3 -o test test.c probes.o

產生的 assembly

000000000004004b6 <main>:
#include "probes.h"

int main()
{
    int v = 0;
  4004b6:       b8 00 00 00 00          mov    $0x0,%eax
    for(; v < (1 << 30); v ++) {
        /*if(MY_APP_LOOP0_ENABLED()) {*/
            MY_APP_LOOP0(v);
  4004bb:       90                      nop
#include "probes.h"

int main()
{
    int v = 0;
    for(; v < (1 << 30); v ++) {
  4004bc:       83 c0 01                add    $0x1,%eax
  4004bf:       3d 00 00 00 40          cmp    $0x40000000,%eax
  4004c4:       75 f5                   jne    4004bb <main+0x5>
            MY_APP_LOOP0(v);
        /*}*/
    }

    return 0;
}
  4004c6:       b8 00 00 00 00          mov    $0x0,%eax
  4004cb:       c3                      retq   

執行時間

$ time ./test 

real    0m0.406s
user    0m0.400s
sys     0m0.000s
$ time ./test 

real    0m0.415s
user    0m0.408s
sys     0m0.000s
$ time ./test 

real    0m0.405s
user    0m0.404s
sys     0m0.000s
$ time ./test 

real    0m0.415s
user    0m0.412s
sys     0m0.000s
$ time ./test 

real    0m0.412s
user    0m0.408s
sys     0m0.000s
$ time ./test 

real    0m0.425s
user    0m0.424s
sys     0m0.000s

去掉最高及最低的, 平均為 0.412

實驗五

loop + tracepoint, -O2

/* test.c */
#include "probes.h"

int main()
{
    int v = 0;
    for(; v < (1 << 30); v ++) {
        /*if(MY_APP_LOOP0_ENABLED()) {*/
            MY_APP_LOOP0(v);
        /*}*/
    }

    return 0;
}

Build, 用上了 MY_APP_LOOP0_ENABLED(), 會參考到 semaphore, 要跟 probes.o link 一塊

$ gcc -O0 -ggdb3 -o test test.c probes.o

產生的 assembly

00000000004003c0 <main>:
#include "probes.h"

int main()
{
    int v = 0;
  4003c0:       31 c0                   xor    %eax,%eax
  4003c2:       66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
    for(; v < (1 << 30); v ++) {
        /*if(MY_APP_LOOP0_ENABLED()) {*/
            MY_APP_LOOP0(v);
  4003c8:       90                      nop
#include "probes.h"

int main()
{
    int v = 0;
    for(; v < (1 << 30); v ++) {
  4003c9:       83 c0 01                add    $0x1,%eax
  4003cc:       3d 00 00 00 40          cmp    $0x40000000,%eax
  4003d1:       75 f5                   jne    4003c8 <main+0x8>
            MY_APP_LOOP0(v);
        /*}*/
    }

    return 0;
}
  4003d3:       31 c0                   xor    %eax,%eax
  4003d5:       c3                      retq   

執行時間

$ time ./test 

real    0m0.398s
user    0m0.388s
sys     0m0.000s
$ time ./test 

real    0m0.384s
user    0m0.380s
sys     0m0.000s
$ time ./test 

real    0m0.386s
user    0m0.384s
sys     0m0.000s
$ time ./test 

real    0m0.380s
user    0m0.376s
sys     0m0.000s
$ time ./test 

real    0m0.382s
user    0m0.380s
sys     0m0.000s
$ time ./test 

real    0m0.381s
user    0m0.380s
sys     0m0.000s

去掉最高及最低的, 平均為 0.38325, 為實驗四的 93.02%

實驗六

loop + check + tracepoint, -O2

/* test.c */
#include "probes.h"

int main()
{
    int v = 0;
    for(; v < (1 << 30); v ++) {
        if(MY_APP_LOOP0_ENABLED()) {
            MY_APP_LOOP0(v);
        }
    }

    return 0;
}

Build, 用上了 MY_APP_LOOP0_ENABLED(), 會參考到 semaphore, 要跟 probes.o link 一塊

$ gcc -O0 -ggdb3 -o test test.c probes.o

產生的 assembly

00000000004003c0 <main>:
#include "probes.h"

int main()
{
  4003c0:       0f b7 15 29 05 20 00    movzwl 0x200529(%rip),%edx        # 6008f0 <__TMC_END__>
    int v = 0;
  4003c7:       31 c0                   xor    %eax,%eax
  4003c9:       eb 0f                   jmp    4003da <main+0x1a>
  4003cb:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
    for(; v < (1 << 30); v ++) {
  4003d0:       83 c0 01                add    $0x1,%eax
  4003d3:       3d 00 00 00 40          cmp    $0x40000000,%eax
  4003d8:       74 16                   je     4003f0 <main+0x30>
        if(MY_APP_LOOP0_ENABLED()) {
  4003da:       66 85 d2                test   %dx,%dx
  4003dd:       74 f1                   je     4003d0 <main+0x10>
            MY_APP_LOOP0(v);
  4003df:       90                      nop
#include "probes.h"

int main()
{
    int v = 0;
    for(; v < (1 << 30); v ++) {
  4003e0:       83 c0 01                add    $0x1,%eax
  4003e3:       3d 00 00 00 40          cmp    $0x40000000,%eax
  4003e8:       75 f0                   jne    4003da <main+0x1a>
  4003ea:       66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
            MY_APP_LOOP0(v);
        }
    }

    return 0;
}
  4003f0:       31 c0                   xor    %eax,%eax
  4003f2:       c3                      retq   

執行時間

$ time ./test 

real    0m0.388s
user    0m0.384s
sys     0m0.000s
$ time ./test 

real    0m0.393s
user    0m0.392s
sys     0m0.000s
$ time ./test 

real    0m0.385s
user    0m0.384s
sys     0m0.004s
$ time ./test 

real    0m0.387s
user    0m0.384s
sys     0m0.000s
$ time ./test 

real    0m0.394s
user    0m0.392s
sys     0m0.000s
$ time ./test 

real    0m0.383s
user    0m0.380s
sys     0m0.000s

去掉最高及最低的, 平均為 0.38825, 為實驗四的 94.235%, 實驗五的 101.3%

實驗七

loop, -O1

[email protected]:~/test-usdt$ time ./test 

real    0m0.397s
user    0m0.392s
sys     0m0.000s
[email protected]:~/test-usdt$ time ./test 

real    0m0.397s
user    0m0.396s
sys     0m0.000s
[email protected]:~/test-usdt$ time ./test 

real    0m0.389s
user    0m0.388s
sys     0m0.000s
[email protected]:~/test-usdt$ time ./test 

real    0m0.393s
user    0m0.392s
sys     0m0.000s
[email protected]:~/test-usdt$ time ./test 

real    0m0.399s
user    0m0.396s
sys     0m0.000s
[email protected]:~/test-usdt$ time ./test 

real    0m0.397s
user    0m0.396s
sys     0m0.000s

實驗八

loop, -O2, -fno-unroll-loops 還沒找到方法