可观测性 kprobe


介绍

kprobe是一种动态调试机制,能够动态地切入内核的处理流程,并在不中断内核运行的情况下收集调试和性能信息。它可以在内核绝大多数函数的指定位置动态插入探测点来收集所需的调试状态信息,而基本不影响内核原有的执行流程。kprobe不需要修改内核源码,是指令级的探测技术。

有三种方式来达到kprobe的目的。

  1. 编写内核模块,增加kprobe hook。
  2. 使用ftrace 监控kprobe event。
  3. 使用ebpf 增加kprobe hook。

本篇只讲第一种。

工作原理

工作机制如图所示,

  1. 当注册一个kprobe的时候,把addr位置的指令修改为brk指令。

  2. CPU执行到addr处trap到pre_handler执行。

  3. pre_handler执行完毕后,单步执行被探测的原指令;单步完成后产生debug异常,执行post_handler。

  4. CPU继续执行addr之后的指令。

kprobe的管理

初始化

init_kprobes初始化kprobe hash表kprobe_table;采集黑名单 kretprobe_blacklist; 异常notifier 以及模块notifier。

/*
 * Excerpt of the kprobes boot-time initialisation (elided parts are marked
 * with "..."). The visible steps initialise the hash buckets and their
 * locks, populate the kprobe blacklist, run the arch-specific init and
 * register the die and module notifiers.
 */
static int __init init_kprobes(void)
{
	/* Initialise each bucket of both hash tables and its kretprobe lock. */
	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
		INIT_HLIST_HEAD(&kprobe_table[i]);
		INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
		raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
	}

	/*
	 * Build the blacklist from the range delimited by
	 * __start_kprobe_blacklist and __stop_kprobe_blacklist.
	 */
	err = populate_kprobe_blacklist(__start_kprobe_blacklist,
					__stop_kprobe_blacklist);
...						
...
	err = arch_init_kprobes();
	/* Each notifier is registered only if the preceding step succeeded. */
	if (!err)
		err = register_die_notifier(&kprobe_exceptions_nb);
	if (!err)
		err = register_module_notifier(&kprobe_module_nb);
...
}

kprobe 实例存放在全局的hash表中,以函数的地址做hash值。

/* The kprobe hash table has 1 << KPROBE_HASH_BITS (i.e. 64) buckets. */
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)

/* Global array of hlist heads, one per hash bucket. */
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];

/*
 * Descriptor of a single kprobe: where the probe sits (addr, or
 * symbol_name + offset), the handlers to run around the probed
 * instruction, and the saved original instruction.
 */
struct kprobe {
	/* node used to link this probe into a hash-table bucket */
	struct hlist_node hlist;

	/* list of kprobes for multi-handler support */
	struct list_head list;

	/* count the number of times this probe was temporarily disarmed */
	unsigned long nmissed;

	/* location of the probe point */
	kprobe_opcode_t *addr;

	/* Allow user to indicate symbol name of the probe point */
	const char *symbol_name;

	/* Offset into the symbol */
	unsigned int offset;

	/* Called before addr is executed. */
	kprobe_pre_handler_t pre_handler;

	/* Called after addr is executed, unless... */
	kprobe_post_handler_t post_handler;

	/*
	 * ... called if executing addr causes a fault (eg. page fault).
	 * Return 1 if it handled fault, otherwise kernel will see it.
	 */
	kprobe_fault_handler_t fault_handler;

	/* Saved opcode (which has been replaced with breakpoint) */
	kprobe_opcode_t opcode;

	/* copy of the original instruction */
	struct arch_specific_insn ainsn;

	/*
	 * Indicates various status flags.
	 * Protected by kprobe_mutex after this kprobe is registered.
	 */
	u32 flags;
};

动态的注册一个 kprobe, 加入到 kprobe_table hash表中,把要采集的函数地址addr替换为brk指令。

/*
 * Excerpt of kprobe registration (elided parts are marked with "..."):
 * links the probe into its kprobe_table bucket and arms it unless all
 * kprobes are disarmed or this probe is disabled.
 */
int register_kprobe(struct kprobe *p)
{
    ...
	INIT_HLIST_NODE(&p->hlist);
	/* The bucket is chosen by hashing the probe address. */
	hlist_add_head_rcu(&p->hlist,
		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
	if (!kprobes_all_disarmed && !kprobe_disabled(p)) {
		ret = arm_kprobe(p);
		if (ret) {
			/* Arming failed: take the probe back out of the table. */
			hlist_del_rcu(&p->hlist);
			synchronize_rcu();
			goto out;
		}
	}
    ...
}

kprobe的处理是分体系结构的,不同的架构断点指令不同。以x86为例,在arch_arm_kprobe中把addr处指令的第一个字节替换成断点指令int3(0xcc)。

/arch/x86/include/asm/kprobes.h
/* One-byte opcode (0xcc) used as the x86 breakpoint instruction. */
#define BREAKPOINT_INSTRUCTION	0xcc

arm_kprobe ---> __arm_kprobe ---> arch_arm_kprobe

/*
 * Arm a probe: write the one-byte breakpoint opcode over the first byte
 * at p->addr using text_poke().
 */
void arch_arm_kprobe(struct kprobe *p)
{
	unsigned char bp_insn[] = { BREAKPOINT_INSTRUCTION };

	text_poke(p->addr, bp_insn, sizeof(bp_insn));
}

操作API

注册/去注册

/* Register / unregister the probe described by p. */
int register_kprobe(struct kprobe *p);
void unregister_kprobe(struct kprobe *p);

使能/去使能

/* Enable / disable the probe kp. */
int enable_kprobe(struct kprobe *kp);
int disable_kprobe(struct kprobe *kp);

获取kprobe

/* Get the kprobe at this addr (if any) - called with preemption disabled */
struct kprobe *get_kprobe(void *addr);
/* Return the kprobe stored in this CPU's current_kprobe per-CPU variable. */
static inline struct kprobe *kprobe_running(void)
{
	struct kprobe *active = __this_cpu_read(current_kprobe);

	return active;
}

hook点

hook的选取原则

内核函数很多,但是并不是所有的函数都适合做hook,inline函数无法作为hook、static函数有可能会被优化。所以hook一般会在 /proc/kallsyms 中选取。

hook的执行

When a kprobe is registered, Kprobes makes a copy of the probed instruction and replaces the first byte(s) of the probed instruction with a breakpoint instruction (e.g., int3 on i386 and x86_64).

注册 kprobe 后,Kprobes 会复制被探测的指令,并用断点指令(例如 i386 和 x86_64 上的 int3)替换被探测指令的第一个字节。

When a CPU hits the breakpoint instruction, a trap occurs, the CPU’s registers are saved, and control passes to Kprobes via the notifier_call_chain mechanism. Kprobes executes the “pre_handler” associated with the kprobe, passing the handler the addresses of the kprobe struct and the saved registers.

当 CPU 遇到断点指令时,会发生陷阱,保存 CPU 的寄存器,并通过 notifier_call_chain 机制将控制权传递给 Kprobes。 Kprobes 在函数 kprobe_handler中 执行与 kprobe 相关的“pre_handler”,将 kprobe 结构的地址和保存的寄存器传递给处理程序。 对于x86、arm64体系架构 notifier_call_chain 通知链回调函数并没有实际的作用了。

Next, Kprobes single-steps its copy of the probed instruction. (It would be simpler to single-step the actual instruction in place, but then Kprobes would have to temporarily remove the breakpoint instruction. This would open a small time window when another CPU could sail right past the probepoint.)

接下来,Kprobes 单步执行其探测指令的副本。(单步执行实际指令会更简单,但 Kprobes 将不得不暂时删除断点指令,这将打开一个小的时间窗口,此时另一个 CPU 可以直接越过探测点。)

After the instruction is single-stepped, Kprobes executes the “post_handler,” if any, that is associated with the kprobe. Execution then continues with the instruction following the probepoint.

在指令单步执行后,会产生debug异常,Kprobes 执行与 kprobe 关联的“post_handler”(如果有), 然后继续执行探测点之后的指令。

  1. pre_handler 在执行被探测指令之前调用,在do_int3中执行。
/*
 * Excerpt of the x86 int3 (breakpoint) trap handler; elided parts are
 * marked with "...". With CONFIG_KPROBES, kprobe_int3_handler() is tried
 * first, before the die notifier chain and the generic trap path.
 */
dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
	...
#ifdef CONFIG_KPROBES
	/* A non-zero return means kprobes consumed this breakpoint. */
	if (kprobe_int3_handler(regs))
		goto exit;
#endif

	/* Otherwise offer the trap to the die notifier chain. */
	if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
			SIGTRAP) == NOTIFY_STOP)
		goto exit;

	/* Nobody claimed it: fall back to do_trap() with SIGTRAP. */
	cond_local_irq_enable(regs);
	do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL);
	cond_local_irq_disable(regs);	
	...
}

/*
 * Excerpt of the kprobes breakpoint handler; elided parts are "...".
 * Looks up the kprobe for the trapping address, runs its pre_handler and,
 * unless the pre_handler returns non-zero, prepares the single-step.
 *
 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
 * remain disabled throughout this function.
 */
int kprobe_int3_handler(struct pt_regs *regs)
{
	...
	/* The address looked up is regs->ip minus the size of one opcode. */
	addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
	/*
	 * We don't want to be preempted for the entire duration of kprobe
	 * processing. Since int3 and debug trap disables irqs and we clear
	 * IF while singlestepping, it must not be preemptible.
	 */

	kcb = get_kprobe_ctlblk();
	p = get_kprobe(addr);
	if (p) {
		...
			set_current_kprobe(p, regs, kcb);
			kcb->kprobe_status = KPROBE_HIT_ACTIVE;

			/*
			 * No pre_handler, or it returned 0: go on to
			 * single-step the probed instruction. Otherwise just
			 * clear the current kprobe.
			 */
			if (!p->pre_handler || !p->pre_handler(p, regs))
				setup_singlestep(p, regs, kcb, 0);
			else
				reset_current_kprobe();
			return 1;
	}	
}

/*
 * Excerpt (elided parts are "..."): prepare the saved registers so that
 * one instruction is single-stepped on return from the trap.
 */
static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter)
{
	...
		/* Set the trap flag (TF) and clear the interrupt flag (IF). */
		regs->flags |= X86_EFLAGS_TF;
	regs->flags &= ~X86_EFLAGS_IF;
	/* single step inline if the instruction is an int3 */
	if (p->opcode == BREAKPOINT_INSTRUCTION)
		regs->ip = (unsigned long)p->addr;
	else
		/* otherwise step the saved copy of the original instruction */
		regs->ip = (unsigned long)p->ainsn.insn;
	...
}
  2. post_handler 在被探测指令单步执行之后调用,在do_debug中执行。
/*
 * Excerpt of the x86 debug trap handler; elided parts are "...".
 * With CONFIG_KPROBES, kprobe_debug_handler() is tried before the die
 * notifier chain.
 */
dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
{
	...
#ifdef CONFIG_KPROBES
	/* A non-zero return means kprobes consumed this debug trap. */
	if (kprobe_debug_handler(regs))
		goto exit;
#endif

	/* Otherwise offer the trap to the die notifier chain. */
	if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
							SIGTRAP) == NOTIFY_STOP)
		goto exit;	
	...
}

/*
 * Excerpt of the kprobes debug-trap handler; elided parts are "...".
 * Runs the post_handler of the kprobe currently active on this CPU.
 *
 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
 * remain disabled throughout this function.
 */
int kprobe_debug_handler(struct pt_regs *regs)
{
	struct kprobe *cur = kprobe_running();
	...
	/* The post_handler is skipped when the status is KPROBE_REENTER. */
	if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
		kcb->kprobe_status = KPROBE_HIT_SSDONE;
		cur->post_handler(cur, regs, 0);
	}
	...
}

黑名单

标记

标记函数不被probe的步骤:

  1. include linux/kprobes.h
  2. use NOKPROBE_SYMBOL() macro

使用宏 NOKPROBE_SYMBOL 标记函数为_kprobe_blacklist section,该函数地址不被probe。

/*
 * Blacklist-generating macro. It defines a variable _kbl_addr_<fname>,
 * placed in the "_kprobe_blacklist" section, that holds the address of
 * fname. Use it to specify functions which must not be probed.
 */
# define __NOKPROBE_SYMBOL(fname)				\
static unsigned long __used					\
	__attribute__((__section__("_kprobe_blacklist")))	\
	_kbl_addr_##fname = (unsigned long)fname;
# define NOKPROBE_SYMBOL(fname)	__NOKPROBE_SYMBOL(fname)
/*
 * Use this to forbid a kprobes attach on very low level functions;
 * it places the function in the ".kprobes.text" section.
 */
# define __kprobes	__attribute__((__section__(".kprobes.text")))
# define nokprobe_inline	__always_inline

初始化

在kprobe 初始化的时候 init_kprobes 遍历 _kprobe_blacklist section 区域,加入到全局链表中 kprobe_blacklist

/* One blacklisted address range, linked into kprobe_blacklist via list. */
struct kprobe_blacklist_entry {
	struct list_head list;
	unsigned long start_addr;	/* start of the blacklisted range */
	unsigned long end_addr;		/* end of the blacklisted range */
};

/* Blacklist -- list of struct kprobe_blacklist_entry */
static LIST_HEAD(kprobe_blacklist);

debugfs 接口

目录下 /sys/kernel/debug/kprobes/

  1. blacklist 显示不被probe的函数地址, 显示格式为 起始地址-结束地址 符号表名 如果/proc/kallsyms 不显示地址,则也不显示地址
cat /sys/kernel/debug/kprobes/blacklist
0xffffffff8b400920-0xffffffff8b4009e8   interrupt_entry
0xffffffff8b4009e8-0xffffffff8b400a00   common_spurious
0xffffffff8b400a00-0xffffffff8b400a0f   common_interrupt
0xffffffff8b400bb0-0xffffffff8b400bd0   irq_move_cleanup_interrupt
0xffffffff8b400bd0-0xffffffff8b400bf0   reboot_interrupt
0xffffffff8b400bf0-0xffffffff8b400c10   uv_bau_message_intr1
0xffffffff8b401c60-0xffffffff8b401c80   apic_timer_interrupt
  2. enabled 启用/停用probe 有效的值为Y、y、1或者N、n、0

  3. list 显示probe详细信息 显示格式同样分为两种格式 1.地址 probe类型 函数名+偏移量 模块名 状态 2.地址 probe类型 地址 状态

probe类型: k代表kprobe r表示为kretprobe

状态有: GONE DISABLED OPTIMIZED FTRACE

/* Kprobe status flags: distinct bits tested against a kprobe's flags field */
#define KPROBE_FLAG_GONE	1 /* breakpoint has already gone */
#define KPROBE_FLAG_DISABLED	2 /* probe is temporarily disabled */
#define KPROBE_FLAG_OPTIMIZED	4 /*
				   * probe is really optimized.
				   * NOTE:
				   * this flag is only for optimized_kprobe.
				   */
#define KPROBE_FLAG_FTRACE	8 /* probe is using ftrace */

/* Non-zero when KPROBE_FLAG_GONE is set in p->flags. */
static inline int kprobe_gone(struct kprobe *p)
{
	int gone_bit = KPROBE_FLAG_GONE & p->flags;

	return gone_bit;
}

/*
 * Non-zero when the probe is disabled; a probe with KPROBE_FLAG_GONE set
 * also counts as disabled.
 */
static inline int kprobe_disabled(struct kprobe *p)
{
	int off_bits = (KPROBE_FLAG_DISABLED | KPROBE_FLAG_GONE) & p->flags;

	return off_bits;
}

/* Non-zero when KPROBE_FLAG_OPTIMIZED is set in p->flags. */
static inline int kprobe_optimized(struct kprobe *p)
{
	int optimized_bit = KPROBE_FLAG_OPTIMIZED & p->flags;

	return optimized_bit;
}

/* Non-zero when KPROBE_FLAG_FTRACE is set in p->flags. */
static inline int kprobe_ftrace(struct kprobe *p)
{
	int ftrace_bit = KPROBE_FLAG_FTRACE & p->flags;

	return ftrace_bit;
}

list 输出函数

/*
 * Print one line describing probe p to the seq_file pi.
 *
 * With a symbol the line is "addr type sym+offset modname", otherwise
 * "addr type %pS"; both are followed by the status tags. The OPTIMIZED and
 * FTRACE tags are taken from pp when it is non-NULL, otherwise from p.
 */
static void report_probe(struct seq_file *pi, struct kprobe *p,
		const char *sym, int offset, char *modname, struct kprobe *pp)
{
	/* "r" when the pre_handler is pre_handler_kretprobe, "k" otherwise. */
	char *kprobe_type =
		(p->pre_handler == pre_handler_kretprobe) ? "r" : "k";
	/* Print NULL when kallsyms_show_value() rejects the file's creds. */
	void *addr = kallsyms_show_value(pi->file->f_cred) ? p->addr : NULL;

	if (!sym) {
		/* try to use %pS */
		seq_printf(pi, "%px  %s  %pS ",
			addr, kprobe_type, p->addr);
	} else {
		seq_printf(pi, "%px  %s  %s+0x%x  %s ",
			addr, kprobe_type, sym, offset,
			(modname ? modname : " "));
	}

	if (pp == NULL)
		pp = p;

	/* [DISABLED] is suppressed when the probe is already GONE. */
	seq_printf(pi, "%s%s%s%s\n",
		(kprobe_gone(p) ? "[GONE]" : ""),
		((kprobe_disabled(p) && !kprobe_gone(p)) ?  "[DISABLED]" : ""),
		(kprobe_optimized(pp) ? "[OPTIMIZED]" : ""),
		(kprobe_ftrace(pp) ? "[FTRACE]" : ""));
}

kprobes sysctl 接口

/proc/sys/debug/kprobes-optimization kprobe优化开关,默认优化开启。

/*
 * sysctl table holding the "kprobes-optimization" entry (only when
 * CONFIG_OPTPROBES is defined): an int backed by
 * sysctl_kprobes_optimization, mode 0644, handled by
 * proc_kprobes_optimization_handler, with SYSCTL_ZERO / SYSCTL_ONE passed
 * as extra1 / extra2 (presumably the allowed minimum and maximum).
 */
static struct ctl_table debug_table[] = {
#if defined(CONFIG_OPTPROBES)
	{
		.procname	= "kprobes-optimization",
		.data		= &sysctl_kprobes_optimization,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_kprobes_optimization_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
#endif
	{ }	/* empty sentinel entry terminating the table */
};

优化

优化是由一个内核工作线程完成的。除了开关控制之外,在注册kprobe的时候也会尝试优化,默认是优化处理的。

/* Optimization staging lists, protected by kprobe_mutex */
static LIST_HEAD(optimizing_list);
static LIST_HEAD(unoptimizing_list);
static LIST_HEAD(freeing_list);

/* Delayed work item whose handler is kprobe_optimizer(). */
static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
/* NOTE(review): presumably the delay used when scheduling optimizing_work; unit not shown here */
#define OPTIMIZE_DELAY 5

不是所有的probe都可以优化,带post_handler的kprobe不能被优化。

kprobes-optimization 开关使能之后,遍历所有可以优化的kprobe加入到可优化的链表optimizing_list中,调用关系如下。

proc_kprobes_optimization_handler ---> optimize_all_kprobes ---> optimize_kprobe

/*
 * Optimize kprobe if p is ready to be optimized.
 * Excerpt; elided parts are marked with "...". The visible part marks the
 * probe OPTIMIZED, queues it on optimizing_list and kicks the optimizer.
 */
static void optimize_kprobe(struct kprobe *p)
{
	struct optimized_kprobe *op;

...

	/* kprobes with post_handler can not be optimized */
	if (p->post_handler)
		return;

	/* p is the kp member embedded in a struct optimized_kprobe. */
	op = container_of(p, struct optimized_kprobe, kp);

...
	op->kp.flags |= KPROBE_FLAG_OPTIMIZED;

	/* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */
	if (WARN_ON_ONCE(!list_empty(&op->list)))
		return;

	list_add(&op->list, &optimizing_list);
	kick_kprobe_optimizer();
}

在optimize_kprobe 函数中把可以优化的kprobe加入到优化列表optimizing_list,然后唤醒工作队列。

在工作队列的处理函数中,把int3指令替换成相对跳转(jump)指令。

/*
 * Replace breakpoints (int3) with relative jumps.
 * Caller must call with locking kprobe_mutex and text_mutex.
 *
 * Every entry on oplist is patched and then removed from the list.
 */
void arch_optimize_kprobes(struct list_head *oplist)
{
	struct optimized_kprobe *op, *tmp;
	u8 insn_buff[RELATIVEJUMP_SIZE];

	list_for_each_entry_safe(op, tmp, oplist, list) {
		/*
		 * Displacement from the end of the jump placed at kp.addr
		 * to op->optinsn.insn.
		 */
		s32 rel = (s32)((long)op->optinsn.insn -
			((long)op->kp.addr + RELATIVEJUMP_SIZE));

		WARN_ON(kprobe_disabled(&op->kp));

		/* Backup instructions which will be replaced by jump address */
		memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
		       RELATIVE_ADDR_SIZE);

		/* Build the jump: opcode byte followed by the s32 displacement. */
		insn_buff[0] = RELATIVEJUMP_OPCODE;
		*(s32 *)(&insn_buff[1]) = rel;

		text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
			     op->optinsn.insn);

		/* Done with this probe: unlink it and reinitialise its node. */
		list_del_init(&op->list);
	}
}

实验

探测ping包的发送。

基于 samples/kprobes/kprobe_example.c 修改如下

/* Symbol to probe; defaults to ping_v4_sendmsg, exposed as module parameter "symbol". */
static char symbol[MAX_SYMBOL_LEN] = "ping_v4_sendmsg";
module_param_string(symbol, symbol, sizeof(symbol), 0644);

/* For each probe you need to allocate a kprobe structure */
static struct kprobe kp = {
        .symbol_name    = symbol,
};

/*
 * kprobe pre_handler: called just before the probed instruction is executed.
 * On CONFIG_X86 builds it logs the probed symbol name and address.
 *
 * Returns 0 so that Kprobes goes on to single-step the probed instruction.
 */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
#ifdef CONFIG_X86
        pr_info("<%s> pre_handler: p->addr = 0x%p, start ping\n",
                p->symbol_name, p->addr);
#endif
        return 0;
}

/*
 * kprobe post_handler: called after the probed instruction is executed.
 * On CONFIG_X86 builds it logs the probed symbol name and address.
 */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
                                unsigned long flags)
{
#ifdef CONFIG_X86
        pr_info("<%s> post_handler: p->addr = 0x%p, ping done\n",
                p->symbol_name, p->addr);
#endif
}

编译成ko,加载到内核。

执行ping

PING github.com (20.205.243.166) 56(84) bytes of data.
64 bytes from 20.205.243.166 (20.205.243.166): icmp_seq=1 ttl=128 time=113 ms
64 bytes from 20.205.243.166 (20.205.243.166): icmp_seq=2 ttl=128 time=112 ms
^C
--- github.com ping statistics ---

dmesg信息

[94402.119451] <ping_v4_sendmsg> pre_handler: p->addr = 0x00000000dd88dbce, start ping
[94402.119452] <ping_v4_sendmsg> post_handler: p->addr = 0x00000000dd88dbce, ping done
[94403.129446] <ping_v4_sendmsg> pre_handler: p->addr = 0x00000000dd88dbce, start ping
[94403.129450] <ping_v4_sendmsg> post_handler: p->addr = 0x00000000dd88dbce, ping done

参考

https://lwn.net/Articles/132196/ https://www.kernel.org/doc/Documentation/kprobes.txt

欢迎大家转发分享。未经授权,严禁任何复制、转载、摘编或以其它方式进行使用,转载须注明来自eBPFLAB并附上本文链接。如果有侵犯到您权益的地方,请及时联系我删除。