Linux内核 eBPF基础
Tracepoint原理源码分析
荣涛
2021年5月10日

1. 基本原理

需要注意的几点:

  • 本文将从sched_switch相关的tracepoint展开;
  • 关于static_key,请详见本博客jump-label相关文章Linux Jump Label(x86)、Linux Jump Label/static-key机制详解;
  • 可参见内核注释版代码:https://github.com/Rtoax/linux-5.10.13

1.1. 源码头文件结构

首先看tracepoint源码文件结构:

  • include/linux/tracepoint-defs.h
    #include <linux/atomic.h>#include <linux/static_key.h>
  • include/linux/tracepoint.h
    #include <linux/tracepoint-defs.h>#include <linux/static_call.h>
  • include/linux/trace_events.h
    #include <linux/trace_seq.h>#include <linux/perf_event.h>#include <linux/tracepoint.h>
  • include/trace/trace_events.h
    #include <linux/trace_events.h>
  • include/trace/define_trace.h
    #ifdef TRACEPOINTS_ENABLED#include <trace/trace_events.h>#endif
  • include/trace/events/sched.h
    #include <linux/tracepoint.h>#include <trace/define_trace.h>
  • kernel/trace/trace_sched_switch.c
    #include <trace/events/sched.h>
  • kernel/sched/core.c
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
#undef CREATE_TRACE_POINTS

1.1.1. include/linux/tracepoint-defs.h

该源文件定义了基本数据结构,如struct tracepoint

struct tracepoint {         /* 跟踪点 */const char *name;        /* Tracepoint name */struct static_key key;  /* static_key */struct static_call_key *static_call_key;void *static_call_tramp;/*  */void *iterator;         /*  */int (*regfunc)(void);   /* 注册函数 */void (*unregfunc)(void);/* 注销函数 */struct tracepoint_func __rcu *funcs;
};

1.1.2. include/linux/tracepoint.h

该头文件定义了相关的注册注销api,如下:

extern int
tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data);
extern int
tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data);

并给出了关键的宏定义,如下:

  • __DO_TRACE:运行实际的trace函数;
  • __DECLARE_TRACE:声明tracepoint结构,定义静态的trace函数;
  • DEFINE_TRACE_FN:定义tracepoint数据结构,static-key数据结构;
  • DEFINE_TRACE:对DEFINE_TRACE_FN的封装;
  • DECLARE_TRACE:对__DECLARE_TRACE的封装;
  • DECLARE_TRACE_CONDITION:同上,多了条件判断(cond);

1.1.2.1. __DO_TRACE

#define __DO_TRACE(name, proto, args, cond, rcuidle)            \do {                               \struct tracepoint_func *it_func_ptr;           \int __maybe_unused __idx = 0;              \void *__data;                      \\if (!(cond))                      \return;                        \\/* srcu can't be used from NMI */           \WARN_ON_ONCE(rcuidle && in_nmi());          \\/* keep srcu and sched-rcu usage consistent */     \preempt_disable_notrace();                \\/*                            \* For rcuidle callers, use srcu since sched-rcu    \* doesn't work from the idle path.         \*/                          \if (rcuidle) {                     \__idx = srcu_read_lock_notrace(&tracepoint_srcu);\rcu_irq_enter_irqson();                \}                           \\it_func_ptr =                     \rcu_dereference_raw((&__tracepoint_##name)->funcs); \if (it_func_ptr) {                  \__data = (it_func_ptr)->data;          \__DO_TRACE_CALL(name)(args);            \}                           \\if (rcuidle) {                        \rcu_irq_exit_irqson();                \srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\}                            \\preempt_enable_notrace();                \} while (0)

这里我们只关注__DO_TRACE_CALL(name)(args),它的定义为:

#ifdef CONFIG_HAVE_STATIC_CALL
#define __DO_TRACE_CALL(name)   static_call(tp_func_##name)
#else
#define __DO_TRACE_CALL(name)   __traceiter_##name
#endif/* CONFIG_HAVE_STATIC_CALL */

其中__traceiter_##name函数是在DEFINE_TRACE_FN中定义的,static_call(tp_func_##name)也是在DEFINE_TRACE_FN中定义的。

1.1.2.2. __DECLARE_TRACE

#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \extern int __traceiter_##name(data_proto);         \DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name);   \extern struct tracepoint __tracepoint_##name;          \static inline void trace_##name(proto)             \{                              \if (static_key_false(&__tracepoint_##name.key))        \__DO_TRACE(name,               \TP_PROTO(data_proto),          \TP_ARGS(data_args),            \TP_CONDITION(cond), 0);            \if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {        \rcu_read_lock_sched_notrace();         \rcu_dereference_sched(__tracepoint_##name.funcs);\rcu_read_unlock_sched_notrace();     \}                          \}                              \__DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args),     \PARAMS(cond), PARAMS(data_proto), PARAMS(data_args))   \static inline int                      \register_trace_##name(void (*probe)(data_proto), void *data)   \{                              \return tracepoint_probe_register(&__tracepoint_##name, \(void *)probe, data);  \}                              \static inline int                      \register_trace_prio_##name(void (*probe)(data_proto), void *data,\int prio)                \{                              \return tracepoint_probe_register_prio(&__tracepoint_##name, \(void *)probe, data, prio); \}                                \static inline int                      \unregister_trace_##name(void (*probe)(data_proto), void *data) \{                              \return tracepoint_probe_unregister(&__tracepoint_##name,\(void *)probe, data); \}                              \static inline void                     \check_trace_callback_type_##name(void (*cb)(data_proto))   \{                              \}                              \static inline bool                     \trace_##name##_enabled(void)                   \{                              \return static_key_false(&__tracepoint_##name.key); \}

其定义的函数名如下(以sched_switch为例):

  • trace_sched_switch;
  • register_trace_sched_switch;
  • register_trace_prio_sched_switch;
  • unregister_trace_sched_switch;
  • check_trace_callback_type_sched_switch;
  • trace_sched_switch_enabled;

以上这些函数将在kernel/trace/trace_sched_switch.c中被调用。

1.1.2.3. DEFINE_TRACE_FN

#define DEFINE_TRACE_FN(_name, _reg, _unreg, proto, args)/*  */    \static const char __tpstrtab_##_name[]              \__section("__tracepoints_strings") = #_name;            \extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \int __traceiter_##_name(void *__data, proto);            \struct tracepoint __tracepoint_##_name  __used          \__section("__tracepoints") = {                  \.name = __tpstrtab_##_name,              \.key = STATIC_KEY_INIT_FALSE,                \.static_call_key = &STATIC_CALL_KEY(tp_func_##_name),  \.static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \.iterator = &__traceiter_##_name,         \.regfunc = _reg,                 \.unregfunc = _unreg,                 \.funcs = NULL };                  \__TRACEPOINT_ENTRY(_name);                    \int __traceiter_##_name(void *__data, proto)            \{                               \struct tracepoint_func *it_func_ptr;         \void *it_func;                       \\it_func_ptr =                     \rcu_dereference_raw((&__tracepoint_##_name)->funcs); \do {                         \it_func = (it_func_ptr)->func;         \__data = (it_func_ptr)->data;          \((void(*)(void *, proto))(it_func))(__data, args); \} while ((++it_func_ptr)->func);            \return 0;                      \}                               \DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name);

其定义的结构如下(以sched_switch为例):

  • struct tracepoint __tracepoint_sched_switch
  • 函数__traceiter_sched_switch
  • DEFINE_STATIC_CALL(tp_func_sched_switch, __traceiter_sched_switch)

这些函数是在__DO_TRACE中被调用的。

1.1.3. include/linux/trace_events.h

该文件中定义了以下数据结构

  • trace_iterator

它的注释为:

/** Trace iterator - used by printout routines who present trace* results to users and which routines might sleep, etc:*/

也就是说,它的出现是为了解决trace和user print之间的异步问题。

  • struct trace_event
struct trace_event {    /* trace 事件 */struct hlist_node     node;struct list_head     list;int              type;struct trace_event_functions *funcs;
};

声明了两个函数:

extern int register_trace_event(struct trace_event *event);
extern int unregister_trace_event(struct trace_event *event);

注册函数register_trace_event是将trace_event添加至链表(哈希表):

    list_add_tail(&event->list, list);hlist_add_head(&event->node, &event_hash[key]);

注销函数unregister_trace_event是将trace_event从链表中删除:

    hlist_del(&event->node);list_del(&event->list);
  • struct trace_event_class
  • struct trace_event_buffer
  • struct trace_event_call

具体情况具体分析。

1.1.4. include/trace/trace_events.h

文件中通过undef、define和TRACE_INCLUDE将该文件分为几个stage,下面分步说明。

首先看下trace文件目录结构:

include/trace
.
├── bpf_probe.h
├── define_trace.h
├── events
│   ├── 9p.h
│   ...
│   ├── xdp.h
│   └── xen.h
├── perf.h
├── syscall.h
└── trace_events.h -> 本文件include/trace/trace_events.h

1.1.4.1. Stage 1 of the trace events.

/** Stage 1 of the trace events.** Override the macros in the event tracepoint header <trace/events/XXX.h>* to include the following:** struct trace_event_raw_<call> {*    struct trace_entry      ent;*   <type>                <item>;*  <type2>               <item2>[<len>];*    [...]* };** The <type> <item> is created by the __field(type, item) macro or* the __array(type2, item2, len) macro.* We simply do "type item;", and that will create the fields* in the structure.*/

后面的我们会在具体sched的trace中讲解。

1.1.5. include/trace/define_trace.h

该文件中为一些宏定义,以及包含以下头文件:

#ifdef TRACEPOINTS_ENABLED
#include <trace/trace_events.h>
#include <trace/perf.h>
#include <trace/bpf_probe.h>
#endif

1.1.6. include/trace/events/sched.h

这是具体的函数和结构体的声明,如本文关注的:

/** Tracepoint for task switches, performed by the scheduler:*/
TRACE_EVENT(sched_switch,TP_PROTO(bool preempt,struct task_struct *prev,struct task_struct *next),TP_ARGS(preempt, prev, next),TP_STRUCT__entry(__array(    char, prev_comm,   TASK_COMM_LEN   )__field( pid_t,   prev_pid            )__field( int,  prev_prio           )__field( long, prev_state          )__array( char, next_comm,   TASK_COMM_LEN   )__field( pid_t,   next_pid            )__field( int,  next_prio           )),TP_fast_assign(memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);__entry->prev_pid   = prev->pid;__entry->prev_prio = prev->prio;__entry->prev_state   = __trace_sched_switch_state(preempt, prev);memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);__entry->next_pid = next->pid;__entry->next_prio = next->prio;/* XXX SCHED_DEADLINE */),TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,(__entry->prev_state & (TASK_REPORT_MAX - 1)) ?__print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",{ TASK_INTERRUPTIBLE, "S" },{ TASK_UNINTERRUPTIBLE, "D" },{ __TASK_STOPPED, "T" },{ __TASK_TRACED, "t" },{ EXIT_DEAD, "X" },{ EXIT_ZOMBIE, "Z" },{ TASK_PARKED, "P" },{ TASK_DEAD, "I" }) :"R",__entry->prev_state & TASK_REPORT_MAX ? "+" : "",__entry->next_comm, __entry->next_pid, __entry->next_prio)
);

1.1.7. kernel/sched/core.c

该文件包含了上节的头文件:

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
#undef CREATE_TRACE_POINTS

__schedule中将调用静态跟踪点trace_sched_switch:

static void __sched notrace __schedule(bool preempt)
{...trace_sched_switch(preempt, prev, next);...
}

1.2. 基本原理

下面从kernel/sched/core.c出发,从顶向下分析trace_sched_switch

  • kernel/sched/core.c中包含头文件include/trace/events/sched.h,并定义CREATE_TRACE_POINTS
  • include/trace/events/sched.h中包含了include/linux/tracepoint.h
  • 其余头文件按照以上小节中的顺序依次被包含;

首先在include/linux/tracepoint.h中定义了TRACE_EVENT和DECLARE_TRACE;经过一系列追踪,TRACE_EVENT(sched_switch, ...)也将进行一系列的展开。

TRACE_EVENT的几个关键定义位于以下文件:

  • include/trace/trace_events.h
  • include/trace/define_trace.h
  • include/linux/tracepoint.h

1.2.1. include/trace/define_trace.h

#define TRACE_EVENT(name, proto, args, tstruct, assign, print)  \DEFINE_TRACE(name, PARAMS(proto), PARAMS(args))
#define DEFINE_TRACE(name, proto, args)     \DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args));

展开后为:

static const char __section("__tracepoints_strings") __tpstrtab_sched_switch[] = "sched_switch";
extern struct static_call_key STATIC_CALL_KEY(tp_func_sched_switch);
int __traceiter_sched_switch(void *__data, bool preempt, struct task_struct *prev, struct task_struct *next);
struct tracepoint __used __section("__tracepoints") __tracepoint_sched_switch = {                  .name = __tpstrtab_sched_switch,             .key = STATIC_KEY_INIT_FALSE,               .static_call_key = &STATIC_CALL_KEY(tp_func_sched_switch),   .static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_sched_switch), .iterator = &__traceiter_sched_switch,.regfunc = NULL,                    .unregfunc = NULL,.funcs = NULL
};
asm("   .section \"__tracepoints_ptrs\", \"a\"      \n" \"   .balign 4                   \n" \"   .long   __tracepoint_sched_switch - .      \n" \"   .previous                   \n");int __traceiter_sched_switch(void *__data, bool preempt, struct task_struct *prev, struct task_struct *next)
{                               struct tracepoint_func *it_func_ptr;            void *it_func;                      it_func_ptr =                       rcu_dereference_raw((&__tracepoint_sched_switch)->funcs); do {                            it_func = (it_func_ptr)->func;          __data = (it_func_ptr)->data;           ((void(*)(void *, bool preempt, struct task_struct *prev, struct task_struct *next))(it_func))(__data, preempt, prev, next); } while ((++it_func_ptr)->func);            return 0;
}
DEFINE_STATIC_CALL(tp_func_sched_switch, __traceiter_sched_switch);

1.2.2. include/linux/tracepoint.h

#define TRACE_EVENT(name, proto, args, struct, assign, print)   \DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))

展开为:

extern int __traceiter_sched_switch(void *__data, bool preempt, struct task_struct *prev, struct task_struct *next);
DECLARE_STATIC_CALL(tp_func_sched_switch, __traceiter_sched_switch);
extern struct tracepoint __tracepoint_sched_switch; static inline void trace_sched_switch(bool preempt, struct task_struct *prev, struct task_struct *next)
{if (static_key_false(&__tracepoint_sched_switch.key)) {struct tracepoint_func *it_func_ptr;int __maybe_unused __idx = 0;void *__data;if (!(cpu_online(raw_smp_processor_id())))return;/* srcu can't be used from NMI */WARN_ON_ONCE(0 && in_nmi());/* keep srcu and sched-rcu usage consistent */preempt_disable_notrace()/** For rcuidle callers, use srcu since sched-rcu* doesn't work from the idle path.*/if (0) {__idx = srcu_read_lock_notrace(&tracepoint_srcu);rcu_irq_enter_irqson();}it_func_ptr =                      rcu_dereference_raw((&__tracepoint_sched_switch)->funcs); if (it_func_ptr) {                  __data = (it_func_ptr)->data;
//          __DO_TRACE_CALL(sched_switch)(__data, preempt, prev, next);__traceiter_sched_switch(__data, preempt, prev, next);  /* 这里最终对应这个函数 */}if (0) {                      rcu_irq_exit_irqson();             srcu_read_unlock_notrace(&tracepoint_srcu, __idx);}                          preempt_enable_notrace();              } if (IS_ENABLED(CONFIG_LOCKDEP) && (cpu_online(raw_smp_processor_id()))) {     rcu_read_lock_sched_notrace();          rcu_dereference_sched(__tracepoint_sched_switch.funcs);rcu_read_unlock_sched_notrace();        }
}                               #ifndef MODULE
static inline void trace_sched_switch_rcuidle(proto)
{                                if (static_key_false(&__tracepoint_sched_switch.key))     /* 同上static_key_false 部分代码  */
}
#endifstatic inline int
register_trace_sched_switch(void (*probe)(void *__data, bool preempt, struct task_struct *prev, struct task_struct *next), void *data)
{                               return tracepoint_probe_register(&__tracepoint_sched_switch,  (void *)probe, data);
}
static inline int
register_trace_prio_sched_switch(void (*probe)(void *__data, bool preempt, struct task_struct *prev, struct task_struct *next),void *data,int prio)
{                               return tracepoint_probe_register_prio(&__tracepoint_sched_switch, (void *)probe, data, prio);
}
static inline int
unregister_trace_sched_switch(void (*probe)(void *__data, bool preempt, struct task_struct *prev, struct task_struct *next), void *data)
{                               return tracepoint_probe_unregister(&__tracepoint_sched_switch,(void *)probe, data);
}
static inline void
check_trace_callback_type_sched_switch(void (*cb)(void *__data, bool preempt, struct task_struct *prev, struct task_struct *next))
{
}
static inline bool
trace_sched_switch_enabled(void)
{                               return static_key_false(&__tracepoint_sched_switch.key);
}

1.2.3. include/trace/trace_events.h

#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \DECLARE_EVENT_CLASS(name,                  \PARAMS(proto),             \PARAMS(args),              \PARAMS(tstruct),               \PARAMS(assign),            \PARAMS(print));            \DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args));

TODO

至此,trace_sched_switch的结构就简单明了了。

2. 实例分析TRACE_EVENT(sched_switch, ...)

在源代码include/linux/tracepoint.h中有这样的注释:

/** For use with the TRACE_EVENT macro:** We define a tracepoint, its arguments, its printk format* and its 'fast binary record' layout.** Firstly, name your tracepoint via TRACE_EVENT(name : the* 'subsystem_event' notation is fine.** Think about this whole construct as the* 'trace_sched_switch() function' from now on.***  TRACE_EVENT(sched_switch,**    **  * A function has a regular function arguments*  * prototype, declare it via TP_PROTO():*    *** TP_PROTO(struct rq *rq, struct task_struct *prev,*       struct task_struct *next),**   **  * Define the call signature of the 'function'.* * (Design sidenote: we use this instead of a*   *  TP_PROTO1/TP_PROTO2/TP_PROTO3 ugliness.)*    *** TP_ARGS(rq, prev, next),**  **  * Fast binary tracing: define the trace record via* * TP_STRUCT__entry(). You can think about it like a*    * regular C structure local variable definition.*   **  * This is how the trace record is structured and will*  * be saved into the ring buffer. These are the fields*  * that will be exposed to user-space in*    * /sys/kernel/debug/tracing/events/<*>/format.*   **  * The declared 'local variable' is called '__entry'*    **  * __field(pid_t, prev_prid) is equivalent to a standard declariton:*    **  *   pid_t   prev_pid;*  **  * __array(char, prev_comm, TASK_COMM_LEN) is equivalent to:*    **  *   char    prev_comm[TASK_COMM_LEN];*  *** TP_STRUCT__entry(*      __array(    char,   prev_comm,  TASK_COMM_LEN   )*      __field(    pid_t,  prev_pid            )*      __field(    int,    prev_prio           )*      __array(    char,   next_comm,  TASK_COMM_LEN   )*      __field(    pid_t,  next_pid            )*      __field(    int,    next_prio           )*  ),**    **  * Assign the entry into the trace record, by embedding* * a full C statement block into TP_fast_assign(). 
You*  * can refer to the trace record as '__entry' -* * otherwise you can put arbitrary C code in here.*  **  * Note: this C code will execute every time a trace event*  * happens, on an active tracepoint.*    *** TP_fast_assign(*        memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);*       __entry->prev_pid    = prev->pid;*        __entry->prev_prio   = prev->prio;*       memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);*       __entry->next_pid    = next->pid;*        __entry->next_prio   = next->prio;*   ),**    **  * Formatted output of a trace record via TP_printk().*  * This is how the tracepoint will appear under ftrace*  * plugins that make use of this tracepoint.*    **  * (raw-binary tracing wont actually perform this step.)*    *** TP_printk("task %s:%d [%d] ==> %s:%d [%d]",*     __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,*        __entry->next_comm, __entry->next_pid, __entry->next_prio),** );** This macro construct is thus used for the regular printk format* tracing setup, it is used to construct a function pointer based* tracepoint callback (this is used by programmatic plugins and* can also by used by generic instrumentation like SystemTap), and* it is also used to expose a structured trace record in* /sys/kernel/debug/tracing/events/.** A set of (un)registration functions can be passed to the variant* TRACE_EVENT_FN to perform any (un)registration work.*/

2.1. trace_sched_switch调用

__schedule中有对trace_sched_switch这样的调用:

static void __sched notrace __schedule(bool preempt)
{...trace_sched_switch(preempt, prev, next);...
}

首先,将判断if (static_key_false(&__tracepoint_sched_switch.key)),当trace未开启时,trace_sched_switch的开销是微乎其微的。那么trace开关是怎么开启的呢(开启方式见2.3节)?函数trace_sched_switch_enabled用于判断tracepoint是否被打开。

static inline bool
trace_sched_switch_enabled(void)
{return static_key_false(&__tracepoint_sched_switch.key);
}

2.2. 内核支持哪些追踪点

2.2.1. tplist命令

这里引入一个命令tplist,man如下:

tplist(8)                                    System Manager's Manual                                   tplist(8)NAMEtplist - Display kernel tracepoints or USDT probes and their formats.SYNOPSIStplist [-p PID] [-l LIB] [-v] [filter]DESCRIPTIONtplist  lists  all  kernel  tracepoints,  and can optionally print out the tracepoint format; namely, thevariables that you can trace when the tracepoint is hit.  tplist can also list USDT probes embedded in  aspecific  library  or  executable,  and  can  list USDT probes for all the libraries loaded by a specificprocess.  These features are usually used in conjunction with the argdist and/or trace tools.On a typical system, accessing the tracepoint list and format requires  root.   However,  accessing  USDTprobes does not require root.

首先用命令查看是否支持:

# tplist | grep sched_switch
sched:sched_switch

为了知道是如何查询的,用strace追踪一下:

stat("/sys/kernel/debug/tracing/events/sched/sched_switch", {st_mode=S_IFDIR|0755, st_size=0, ...}) = 0

可见tplist实际上是使用sys文件系统的/sys/kernel/debug/tracing/events/sched/sched_switch目录,看下目录下包含的文件:

/sys/kernel/debug/tracing/events/sched/sched_switch/
├── enable
├── filter
├── format
└── id

查看这个tracepoint的默认状态:

cat /sys/kernel/debug/tracing/events/sched/sched_switch/enable
0

2.2.2. perf list

在perf list中同样可以找到这个追踪点

sched:sched_switch                                 [Tracepoint event]

2.2.3. bpf

同样,使用bcc和bpftrace等bpf前端工具也是非常容易找到这个追踪点的,这里不再详述。

2.3. 跟踪点如何开启

通过上面的讨论可见,这个追踪点默认是关闭的,那么如何开启的呢?很简单

echo 1 > /sys/kernel/debug/tracing/events/sched/sched_switch/enable

2.4. 程序运行到跟踪点会执行什么

参见上面讲到的几个函数:

  • register_trace_sched_switch
  • register_trace_prio_sched_switch
  • unregister_trace_sched_switch

2.4.1. ftrace

ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
if (ret)pr_info("ftrace_graph: Couldn't activate tracepoint"" probe to kernel_sched_switch\n");

程序运行到跟踪点时,函数ftrace_graph_probe_sched_switch将被执行,完成一些追踪相关的处理:

static void
ftrace_graph_probe_sched_switch(void *ignore, bool preempt,struct task_struct *prev, struct task_struct *next)
{unsigned long long timestamp;int index;/** Does the user want to count the time a function was asleep.* If so, do not update the time stamps.*/if (fgraph_sleep_time)return;timestamp = trace_clock_local();prev->ftrace_timestamp = timestamp;/* only process tasks that we timestamped */if (!next->ftrace_timestamp)return;/** Update all the counters in next to make up for the* time next was sleeping.*/timestamp -= next->ftrace_timestamp;for (index = next->curr_ret_stack; index >= 0; index--)next->ret_stack[index].calltime += timestamp;
}

还有ftrace_filter_pid_sched_switch_probe

2.4.2. probe

    ret = register_trace_sched_switch(probe_sched_switch, NULL);if (ret) {pr_info("sched trace: Couldn't activate tracepoint"" probe to kernel_sched_switch\n");goto fail_deprobe_wake_new;}

还有另一个

    ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);if (ret) {pr_info("sched trace: Couldn't activate tracepoint"" probe to kernel_sched_switch\n");goto fail_deprobe_wake_new;}

其中probe_sched_switch如下:

static void
probe_sched_switch(void *ignore, bool preempt,struct task_struct *prev, struct task_struct *next)
{int flags;flags = (RECORD_TGID * !!sched_tgid_ref) +(RECORD_CMDLINE * !!sched_cmdline_ref);if (!flags)return;tracing_record_taskinfo_sched_switch(prev, next, flags);
}

具体的tracepoint调用的函数不在本文的介绍和研究范围之内。

3. 结论

综上,tracepoint是基于jump-label的,关于jump-label详情参见文末链接。同时,将tracing功能和文件系统挂钩,简化用户使用流程,要查看系统支持哪些tracepoint,可以使用perf list、tplist指令查看,或者直接查看debugfs或者proc文件系统。

4. 相关阅读

  • Linux Jump Label(x86)
  • Linux Jump Label/static-key机制详解;

Linux内核 eBPF基础:Tracepoint原理源码分析相关推荐

  1. Linux内核 eBPF基础:ftrace源码分析:过滤函数和开启追踪

    Linux内核 eBPF基础 ftrace基础:过滤函数和开启追踪 荣涛 2021年5月12日 本文相关注释代码:https://github.com/Rtoax/linux-5.10.13 上篇文章 ...

  2. Linux内核 eBPF基础:kprobe原理源码分析:源码分析

    Linux内核 eBPF基础 kprobe原理源码分析:源码分析 荣涛 2021年5月11日 在 <Linux内核 eBPF基础:kprobe原理源码分析:基本介绍与使用>中已经介绍了kp ...

  3. Linux内核 eBPF基础:kprobe原理源码分析:基本介绍与使用示例

    Linux内核 eBPF基础 kprobe原理源码分析:基本介绍与使用示例 荣涛 2021年5月11日 kprobe调试技术是为了便于跟踪内核函数执行状态所设计的一种轻量级内核调试技术. 利用kpro ...

  4. Linux内核 eBPF基础:ftrace基础-ftrace_init初始化

    Linux内核 eBPF基础 ftrace基础:ftrace_init初始化 荣涛 2021年5月12日 本文相关注释代码:https://github.com/Rtoax/linux-5.10.13 ...

  5. Linux内核 eBPF基础:perf(4)perf_event_open系统调用与用户手册详解

    Linux内核 eBPF基础 perf(4)perf_event_open系统调用与用户手册详解 荣涛 2021年5月19日 本文相关注释代码:https://github.com/Rtoax/lin ...

  6. Linux内核 eBPF基础:perf(1):perf_event在内核中的初始化

    Linux内核 eBPF基础 perf(1):perf_event在内核中的初始化 荣涛 2021年5月12日 本文相关注释代码:https://github.com/Rtoax/linux-5.10 ...

  7. Linux内核 eBPF基础:perf(2):perf性能管理单元PMU的注册

    Linux内核 eBPF基础 perf(2):性能管理单元PMU的注册 荣涛 2021年5月18日 本文相关注释代码:https://github.com/Rtoax/linux-5.10.13 Li ...

  8. 动态代理原理源码分析

    看了这篇文章非常不错转载:https://www.jianshu.com/p/4e14dd223897 Java设计模式(14)----------动态代理原理源码分析 上篇文章<Java设计模 ...

  9. Linux内核 eBPF基础:BCC (BPF Compiler Collection)

    目录 BCC包括的一些工具 安装BCC 常用工具示例 capable tcpconnect tcptop 扩展工具 简单示例 使用BPF_PERF_OUTPUT 用户自定义探针示例 参考 BPF Co ...

最新文章

  1. 我是如何利用“王宝强离婚”事件来吸粉的
  2. java 克隆对象工具类_关于dorado-core源码包中CloneUtils克隆工具类对对象进行克隆复制操作...
  3. 信息保护:从经典纠错到量子面膜
  4. pytorch 常用的 loss function
  5. 史上最全java堆,将知识点掰碎了给你嚼,还不信学不会。
  6. easyui关机图标_如何在Windows 10中创建关机图标
  7. 从零开始学Pytorch(十五)之数据增强
  8. mooc作业怎么上传附件_中国大学MOOC最新考试系统(老师视角)增加学生作弊成本...
  9. linux oracle查询乱码问题,linux中oracle中文乱码解决方法
  10. Windows核心编程MFC_gdi+发光字
  11. 一个简单的makefile编写(gcc)
  12. 如何使用Fastreport .将报表从 Crystal Reports 导入 FastReport .NET
  13. 卡巴斯基2010激活码
  14. 用phpexcel导入导出文件
  15. linux系统下mysql编码格式,Windows、Linux系统下mysql编码设置
  16. 妙用AccessibilityService黑科技实现微信自动加好友拉人进群聊
  17. 仿淘宝购买详情页购买缩小动画
  18. unity百度AI人体分析
  19. 一些有用的Google Hack
  20. selenium webdriver的testNG框架的介绍及使用

热门文章

  1. VS2010中水晶报表安装应用及实例
  2. ORA-00257 archiver error. 错误的处理方法
  3. php写的注册登录系统吗,php注册登录系统简化版_php技巧
  4. 最优化设置mysql的max_connections
  5. linux网络IO模型
  6. 基于XML的AOP配置
  7. gulp常用组件【study笔记】
  8. wordpress如何屏蔽wp-json(禁用REST API)
  9. ubuntu 设置php开机启动
  10. 可以种树吗_基本农田能干什么?种树?建房?搞种养殖?哪种可以?