可观测性 kprobe
介绍
kprobe是一种动态调试机制,能够动态地切入内核处理流程,并以无干扰的方式收集调试和性能信息。它可以在内核的绝大多数指定函数中动态插入探测点来收集所需的调试状态信息,而基本不影响内核原有的执行流程。kprobe不用修改内核源码,是指令级的探测技术。
有三种方式来达到kprobe的目的。
- 编写内核模块,增加kprobe hook。
- 使用ftrace 监控kprobe event。
- 使用ebpf 增加kprobe hook。
本篇只讲第一种。
工作原理
工作机制如图所示,
当注册一个kprobe的时候,把addr位置的指令修改为断点指令(x86 上为 int3,arm64 上为 brk)。
CPU执行到addr处trap到pre_handler执行。
pre_handler执行完毕后,单步执行被探测指令的副本,随后产生debug异常并执行post_handler。
CPU继续执行addr之后的指令。
kprobe的管理
初始化
init_kprobes
初始化kprobe hash表kprobe_table;填充探测黑名单 kprobe_blacklist(以及 kretprobe_blacklist);注册异常notifier 以及模块notifier。
/*
 * Boot-time setup for kprobes.
 * Excerpt: "..." marks code elided from this listing, including the local
 * declarations of i and err.
 */
static int __init init_kprobes(void)
{
/* Start every hash bucket empty and initialise the per-bucket kretprobe lock. */
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
INIT_HLIST_HEAD(&kprobe_table[i]);
INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
}
/* Build the blacklist from the range __start_kprobe_blacklist..__stop_kprobe_blacklist. */
err = populate_kprobe_blacklist(__start_kprobe_blacklist,
__stop_kprobe_blacklist);
...
...
/*
 * Arch-specific init first; each notifier is then registered only if
 * every previous step succeeded (err == 0).
 */
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
if (!err)
err = register_module_notifier(&kprobe_module_nb);
...
}
kprobe 实例存放在全局的hash表中,以探测点地址 addr 作为hash的键。
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
/*
 * One registered probe point. Instances are linked into kprobe_table
 * through @hlist.
 */
struct kprobe {
	struct hlist_node hlist;

	/* list of kprobes for multi-handler support */
	struct list_head list;

	/* count the number of times this probe was temporarily disarmed */
	unsigned long nmissed;

	/* location of the probe point */
	kprobe_opcode_t *addr;

	/* Allow user to indicate symbol name of the probe point */
	const char *symbol_name;

	/* Offset into the symbol */
	unsigned int offset;

	/* Called before addr is executed. */
	kprobe_pre_handler_t pre_handler;

	/* Called after addr is executed, unless... */
	kprobe_post_handler_t post_handler;

	/*
	 * ... called if executing addr causes a fault (eg. page fault).
	 * Return 1 if it handled fault, otherwise kernel will see it.
	 */
	kprobe_fault_handler_t fault_handler;

	/* Saved opcode (which has been replaced with breakpoint) */
	kprobe_opcode_t opcode;

	/* copy of the original instruction */
	struct arch_specific_insn ainsn;

	/*
	 * Indicates various status flags.
	 * Protected by kprobe_mutex after this kprobe is registered.
	 */
	u32 flags;
};
动态地注册一个 kprobe:将其加入到 kprobe_table hash表中,并把探测地址addr处的指令替换为断点指令。
/*
 * Register probe @p.
 * Excerpt: "..." marks code elided from this listing, including the
 * declaration of ret and the "out" label.
 */
int register_kprobe(struct kprobe *p)
{
...
/* Publish the probe in the bucket selected by hashing its address. */
INIT_HLIST_NODE(&p->hlist);
hlist_add_head_rcu(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
/* Arm right away unless all kprobes are disarmed or this one is disabled. */
if (!kprobes_all_disarmed && !kprobe_disabled(p)) {
ret = arm_kprobe(p);
if (ret) {
/* Arming failed: unlink the probe again and wait for RCU readers before bailing out. */
hlist_del_rcu(&p->hlist);
synchronize_rcu();
goto out;
}
}
...
}
kprobe的处理是分体系结构的,不同架构的断点指令不同。以x86为例,在arch_arm_kprobe中把addr处指令的第一个字节替换成int3断点指令(0xcc)。
/arch/x86/include/asm/kprobes.h
#define BREAKPOINT_INSTRUCTION 0xcc
arm_kprobe ---> __arm_kprobe ---> arch_arm_kprobe
/* Arm probe @p by writing the one-byte breakpoint opcode at p->addr. */
void arch_arm_kprobe(struct kprobe *p)
{
	const unsigned char breakpoint = BREAKPOINT_INSTRUCTION;

	text_poke(p->addr, &breakpoint, 1);
}
操作API
注册/去注册
int register_kprobe(struct kprobe *p);
void unregister_kprobe(struct kprobe *p);
使能/去使能
int enable_kprobe(struct kprobe *kp);
int disable_kprobe(struct kprobe *kp);
获取kprobe
/* Get the kprobe at this addr (if any) - called with preemption disabled */
struct kprobe *get_kprobe(void *addr);
/* kprobe_running() will just return the current_kprobe on this CPU */
static inline struct kprobe *kprobe_running(void)
{
return (__this_cpu_read(current_kprobe));
}
hook点
hook的选取原则
内核函数很多,但是并不是所有的函数都适合做hook,inline函数无法作为hook、static函数有可能会被优化。所以hook一般会在 /proc/kallsyms 中选取。
hook的执行
When a kprobe is registered, Kprobes makes a copy of the probed instruction and replaces the first byte(s) of the probed instruction with a breakpoint instruction (e.g., int3 on i386 and x86_64).
注册 kprobe 后,Kprobes 会复制被探测的指令,并用断点指令(例如 i386 和 x86_64 上的 int3)替换被探测指令的第一个字节。
When a CPU hits the breakpoint instruction, a trap occurs, the CPU’s registers are saved, and control passes to Kprobes via the notifier_call_chain mechanism. Kprobes executes the “pre_handler” associated with the kprobe, passing the handler the addresses of the kprobe struct and the saved registers.
当 CPU 遇到断点指令时,会发生陷阱,保存 CPU 的寄存器,并通过 notifier_call_chain 机制将控制权传递给 Kprobes。 Kprobes 在断点处理函数(x86 上为 kprobe_int3_handler,arm64 上为 kprobe_handler)中执行与 kprobe 相关的“pre_handler”,将 kprobe 结构的地址和保存的寄存器传递给处理程序。 注意:在 x86、arm64 上,断点异常入口(如 x86 的 do_int3)会直接调用 Kprobes 的处理函数,notifier_call_chain 通知链回调已不再起实际作用。
Next, Kprobes single-steps its copy of the probed instruction. (It would be simpler to single-step the actual instruction in place, but then Kprobes would have to temporarily remove the breakpoint instruction. This would open a small time window when another CPU could sail right past the probepoint.)
接下来,Kprobes 单步执行其探测指令的副本。(单步执行实际指令会更简单,但 Kprobes 将不得不暂时删除断点指令,这将打开一个小的时间窗口,此时另一个 CPU 可以直接越过探测点。)
After the instruction is single-stepped, Kprobes executes the “post_handler,” if any, that is associated with the kprobe. Execution then continues with the instruction following the probepoint.
在指令单步执行后,会产生debug异常,Kprobes 执行与 kprobe 关联的“post_handler”(如果有), 然后继续执行探测点之后的指令。
- pre_handler
在执行被探测指令之前调用,在
do_int3
中执行。
/*
 * int3 (breakpoint) trap entry.
 * Excerpt: "..." marks code elided from this listing, including the
 * "exit" label.
 */
dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
...
#ifdef CONFIG_KPROBES
/* kprobes is called directly, ahead of the die-notifier chain below. */
if (kprobe_int3_handler(regs))
goto exit;
#endif
/* Let registered die notifiers claim the breakpoint (NOTIFY_STOP ends handling). */
if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
goto exit;
/* Nobody claimed it: hand the trap to do_trap() with SIGTRAP. */
cond_local_irq_enable(regs);
do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL);
cond_local_irq_disable(regs);
...
}
/*
 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
 * remain disabled throughout this function.
 *
 * Excerpt: "..." marks code elided from this listing (including the local
 * declarations); what happens when no kprobe is found at the address is
 * not shown here.
 */
int kprobe_int3_handler(struct pt_regs *regs)
{
...
/* Candidate probe address: the saved ip minus the size of one opcode. */
addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
/*
 * We don't want to be preempted for the entire duration of kprobe
 * processing. Since int3 and debug trap disables irqs and we clear
 * IF while singlestepping, it must be no preemptible.
 */
kcb = get_kprobe_ctlblk();
p = get_kprobe(addr);
if (p) {
...
/* Record p as the current kprobe before running its pre_handler. */
set_current_kprobe(p, regs, kcb);
kcb->kprobe_status = KPROBE_HIT_ACTIVE;
/*
 * No pre_handler, or it returned 0: go on to single-step the probed
 * instruction. A non-zero return skips single-stepping and only
 * clears the current kprobe.
 */
if (!p->pre_handler || !p->pre_handler(p, regs))
setup_singlestep(p, regs, kcb, 0);
else
reset_current_kprobe();
/* Non-zero tells the caller the breakpoint was handled by kprobes. */
return 1;
}
}
/*
 * Prepare @regs for single-stepping the instruction probed by @p.
 * Excerpt: "..." marks code elided from this listing; the use of @kcb and
 * @reenter is not shown here.
 */
static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter)
{
...
/* Set TF for single-stepping and clear IF while the step runs. */
regs->flags |= X86_EFLAGS_TF;
regs->flags &= ~X86_EFLAGS_IF;
/* single step inline if the instruction is an int3 */
if (p->opcode == BREAKPOINT_INSTRUCTION)
regs->ip = (unsigned long)p->addr;
else
/* Otherwise resume at the saved copy of the original instruction. */
regs->ip = (unsigned long)p->ainsn.insn;
...
}
- post_handler
在被探测指令单步执行完成之后调用(并非在被探测函数返回之后),在
do_debug
中执行。
/*
 * Debug trap entry.
 * Excerpt: "..." marks code elided from this listing, including the
 * declaration of dr6 and the "exit" label.
 */
dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
{
...
#ifdef CONFIG_KPROBES
/* kprobes is called directly, ahead of the die-notifier chain below. */
if (kprobe_debug_handler(regs))
goto exit;
#endif
/* Let registered die notifiers claim the trap (NOTIFY_STOP ends handling). */
if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
SIGTRAP) == NOTIFY_STOP)
goto exit;
...
}
/*
 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
 * remain disabled throughout this function.
 *
 * Excerpt: "..." marks code elided from this listing, including the
 * declaration of kcb.
 */
int kprobe_debug_handler(struct pt_regs *regs)
{
/* kprobe_running() yields the current kprobe of this CPU. */
struct kprobe *cur = kprobe_running();
...
/*
 * Run the post_handler if one is set, except when the status is
 * KPROBE_REENTER. The status is moved to KPROBE_HIT_SSDONE first.
 */
if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
kcb->kprobe_status = KPROBE_HIT_SSDONE;
cur->post_handler(cur, regs, 0);
}
...
}
黑名单
标记
标记函数不被probe的步骤:
- include linux/kprobes.h
- use NOKPROBE_SYMBOL() macro
使用宏 NOKPROBE_SYMBOL 标记函数为_kprobe_blacklist section,该函数地址不被probe。
/*
 * Blacklist generating macro. Use it to specify functions which must not
 * be probed: it stores the address of fname in a variable placed in the
 * "_kprobe_blacklist" section.
 */
# define __NOKPROBE_SYMBOL(fname) \
static unsigned long __used \
__attribute__((__section__("_kprobe_blacklist"))) \
_kbl_addr_##fname = (unsigned long)fname;
/* Public wrapper; the extra level lets fname be macro-expanded before it is pasted. */
# define NOKPROBE_SYMBOL(fname) __NOKPROBE_SYMBOL(fname)
/* Use this to forbid a kprobes attach on very low level functions */
# define __kprobes __attribute__((__section__(".kprobes.text")))
/* Helpers marked nokprobe_inline are always inlined into their caller. */
# define nokprobe_inline __always_inline
初始化
在kprobe 初始化的时候 init_kprobes 遍历 _kprobe_blacklist section 区域,加入到全局链表中 kprobe_blacklist
struct kprobe_blacklist_entry {
struct list_head list;
unsigned long start_addr;
unsigned long end_addr;
};
/* Blacklist -- list of struct kprobe_blacklist_entry */
static LIST_HEAD(kprobe_blacklist);
debugfs 接口
目录下 /sys/kernel/debug/kprobes/
- blacklist 显示不被probe的函数地址,
显示格式为
起始地址-结束地址 符号表名
如果
/proc/kallsyms
不显示地址,则也不显示地址
cat /sys/kernel/debug/kprobes/blacklist
0xffffffff8b400920-0xffffffff8b4009e8 interrupt_entry
0xffffffff8b4009e8-0xffffffff8b400a00 common_spurious
0xffffffff8b400a00-0xffffffff8b400a0f common_interrupt
0xffffffff8b400bb0-0xffffffff8b400bd0 irq_move_cleanup_interrupt
0xffffffff8b400bd0-0xffffffff8b400bf0 reboot_interrupt
0xffffffff8b400bf0-0xffffffff8b400c10 uv_bau_message_intr1
0xffffffff8b401c60-0xffffffff8b401c80 apic_timer_interrupt
- enabled 启用/停用probe,有效的值为Y、y、1或者N、n、0
- list 显示probe详细信息,显示格式同样分为两种: 1.地址 probe类型 函数名+偏移量 模块名 状态 2.地址 probe类型 地址 状态
probe类型: k代表kprobe r表示为kretprobe
状态有:
GONE DISABLED OPTIMIZED FTRACE
/* Kprobe status flags */
#define KPROBE_FLAG_GONE 1 /* breakpoint has already gone */
#define KPROBE_FLAG_DISABLED 2 /* probe is temporarily disabled */
#define KPROBE_FLAG_OPTIMIZED 4 /*
* probe is really optimized.
* NOTE:
* this flag is only for optimized_kprobe.
*/
#define KPROBE_FLAG_FTRACE 8 /* probe is using ftrace */
/* Has this kprobe gone ? */
static inline int kprobe_gone(struct kprobe *p)
{
return p->flags & KPROBE_FLAG_GONE;
}
/* Is this kprobe disabled ? */
static inline int kprobe_disabled(struct kprobe *p)
{
return p->flags & (KPROBE_FLAG_DISABLED | KPROBE_FLAG_GONE);
}
/* Is this kprobe really running optimized path ? */
static inline int kprobe_optimized(struct kprobe *p)
{
return p->flags & KPROBE_FLAG_OPTIMIZED;
}
/* Is this kprobe uses ftrace ? */
static inline int kprobe_ftrace(struct kprobe *p)
{
return p->flags & KPROBE_FLAG_FTRACE;
}
list 输出函数
/*
 * Write one line describing probe @p to the seq_file @pi.
 *
 * With a resolved symbol the line is "addr type sym+offset modname flags";
 * without one, the address is printed through %pS instead. The address
 * column is NULL when kallsyms_show_value() refuses the reader's
 * credentials. GONE/DISABLED are taken from @p, while OPTIMIZED/FTRACE
 * are taken from @pp, falling back to @p when @pp is NULL.
 */
static void report_probe(struct seq_file *pi, struct kprobe *p,
		const char *sym, int offset, char *modname, struct kprobe *pp)
{
	/* "r" for a kretprobe (recognised by its pre_handler), "k" otherwise. */
	const char *type_tag =
		(p->pre_handler == pre_handler_kretprobe) ? "r" : "k";
	void *shown_addr =
		kallsyms_show_value(pi->file->f_cred) ? p->addr : NULL;
	struct kprobe *flag_src = pp ? pp : p;

	if (sym) {
		seq_printf(pi, "%px %s %s+0x%x %s ",
			   shown_addr, type_tag, sym, offset,
			   modname ? modname : " ");
	} else {
		/* try to use %pS */
		seq_printf(pi, "%px %s %pS ",
			   shown_addr, type_tag, p->addr);
	}

	seq_printf(pi, "%s%s%s%s\n",
		   kprobe_gone(p) ? "[GONE]" : "",
		   (kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : "",
		   kprobe_optimized(flag_src) ? "[OPTIMIZED]" : "",
		   kprobe_ftrace(flag_src) ? "[FTRACE]" : "");
}
kprobes sysctl 接口
/proc/sys/debug/kprobes-optimization kprobe优化开关,默认优化开启。
/*
 * sysctl entries for /proc/sys/debug. Only the kprobes-optimization switch
 * is shown: an int restricted to the range SYSCTL_ZERO..SYSCTL_ONE and
 * handled by proc_kprobes_optimization_handler.
 */
static struct ctl_table debug_table[] = {
#if defined(CONFIG_OPTPROBES)
	{
		.procname	= "kprobes-optimization",
		.data		= &sysctl_kprobes_optimization,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_kprobes_optimization_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
#endif
	{ }	/* sentinel: an empty entry terminates the table */
};
优化
优化是由一个内核工作线程完成的。除了开关控制之外,在注册kprobe的时候也会尝试优化,默认是优化处理的。
/* Optimization staging list, protected by kprobe_mutex */
static LIST_HEAD(optimizing_list);
static LIST_HEAD(unoptimizing_list);
static LIST_HEAD(freeing_list);
static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
#define OPTIMIZE_DELAY 5
不是所有的probe都可以优化,带post_handler的kprobe不能被优化。
kprobes-optimization 开关使能之后,遍历所有可以优化的kprobe加入到可优化的链表optimizing_list中,调用关系如下。
proc_kprobes_optimization_handler ---> optimize_all_kprobes ---> optimize_kprobe
/*
 * Optimize kprobe if p is ready to be optimized.
 * Excerpt: "..." marks code elided from this listing.
 */
static void optimize_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
...
/* kprobes with post_handler can not be optimized */
if (p->post_handler)
return;
/* p is the kp member of an optimized_kprobe; recover the containing object. */
op = container_of(p, struct optimized_kprobe, kp);
...
op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
/* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */
/* If op is already linked on a list, warn once and leave it alone. */
if (WARN_ON_ONCE(!list_empty(&op->list)))
return;
/* Queue op on optimizing_list, then kick the optimizer (presumably schedules optimizing_work). */
list_add(&op->list, &optimizing_list);
kick_kprobe_optimizer();
}
在optimize_kprobe 函数中把可以优化的kprobe加入到优化列表optimizing_list,然后唤醒工作队列。
在工作队列的处理函数中,把int3指令替换成指向优化代码的相对跳转(jump)指令。
/*
 * Replace breakpoints (int3) with relative jumps.
 * Caller must call with locking kprobe_mutex and text_mutex.
 *
 * Every entry on @oplist is patched and then removed from the list.
 */
void arch_optimize_kprobes(struct list_head *oplist)
{
struct optimized_kprobe *op, *tmp;
u8 insn_buff[RELATIVEJUMP_SIZE];
/* The _safe iterator is needed because each entry is unlinked inside the loop. */
list_for_each_entry_safe(op, tmp, oplist, list) {
/* Displacement from the end of the jump placed at kp.addr to op->optinsn.insn. */
s32 rel = (s32)((long)op->optinsn.insn -
((long)op->kp.addr + RELATIVEJUMP_SIZE));
/* A disabled probe is not expected on this list. */
WARN_ON(kprobe_disabled(&op->kp));
/* Backup instructions which will be replaced by jump address */
memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
RELATIVE_ADDR_SIZE);
/* Assemble the jump: opcode byte followed by the 32-bit displacement. */
insn_buff[0] = RELATIVEJUMP_OPCODE;
*(s32 *)(&insn_buff[1]) = rel;
/* Patch the assembled jump over the probe site. */
text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
op->optinsn.insn);
/* Done with this entry: take it off the list and leave it reinitialised. */
list_del_init(&op->list);
}
}
实验
探测ping包的发送。
基于 samples/kprobes/kprobe_example.c 修改如下
static char symbol[MAX_SYMBOL_LEN] = "ping_v4_sendmsg";
module_param_string(symbol, symbol, sizeof(symbol), 0644);
/* For each probe you need to allocate a kprobe structure */
static struct kprobe kp = {
.symbol_name = symbol,
};
/* kprobe pre_handler: called just before the probed instruction is executed */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
#ifdef CONFIG_X86
pr_info("<%s> pre_handler: p->addr = 0x%p, start ping\n",
p->symbol_name, p->addr);
}
/* kprobe post_handler: called after the probed instruction is executed */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
unsigned long flags)
{
#ifdef CONFIG_X86
pr_info("<%s> post_handler: p->addr = 0x%p, ping done\n",
p->symbol_name, p->addr);
}
编译成ko,加载到内核。
执行ping
PING github.com (20.205.243.166) 56(84) bytes of data.
64 bytes from 20.205.243.166 (20.205.243.166): icmp_seq=1 ttl=128 time=113 ms
64 bytes from 20.205.243.166 (20.205.243.166): icmp_seq=2 ttl=128 time=112 ms
^C
--- github.com ping statistics ---
dmesg信息
[94402.119451] <ping_v4_sendmsg> pre_handler: p->addr = 0x00000000dd88dbce, start ping
[94402.119452] <ping_v4_sendmsg> post_handler: p->addr = 0x00000000dd88dbce, ping done
[94403.129446] <ping_v4_sendmsg> pre_handler: p->addr = 0x00000000dd88dbce, start ping
[94403.129450] <ping_v4_sendmsg> post_handler: p->addr = 0x00000000dd88dbce, ping done
参考
https://lwn.net/Articles/132196/ https://www.kernel.org/doc/Documentation/kprobes.txt
欢迎大家转发分享。未经授权,严禁任何复制、转载、摘编或以其它方式进行使用,转载须注明来自eBPFLAB并附上本文链接。如果有侵犯到您权益的地方,请及时联系我删除。