| rjw | 1f88458 | 2022-01-06 17:20:42 +0800 | [diff] [blame] | 1 | Kernel level exception handling in Linux | 
|  | 2 | Commentary by Joerg Pommnitz <joerg@raleigh.ibm.com> | 
|  | 3 |  | 
|  | 4 | When a process runs in kernel mode, it often has to access user | 
|  | 5 | mode memory whose address has been passed by an untrusted program. | 
|  | 6 | To protect itself the kernel has to verify this address. | 
|  | 7 |  | 
|  | 8 | In older versions of Linux this was done with the | 
|  | 9 | int verify_area(int type, const void * addr, unsigned long size) | 
|  | 10 | function (which has since been replaced by access_ok()). | 
|  | 11 |  | 
|  | 12 | This function verified that the memory area starting at address | 
|  | 13 | 'addr' and of size 'size' was accessible for the operation specified | 
|  | 14 | in type (read or write). To do this, verify_area had to look up the | 
|  | 15 | virtual memory area (vma) that contained the address addr. In the | 
|  | 16 | normal case (correctly working program), this test was successful. | 
|  | 17 | It only failed for a few buggy programs. In some kernel profiling | 
|  | 18 | tests, this normally unneeded verification used up a considerable | 
|  | 19 | amount of time. | 
|  | 20 |  | 
|  | 21 | To overcome this situation, Linus decided to let the virtual memory | 
|  | 22 | hardware present in every Linux-capable CPU handle this test. | 
|  | 23 |  | 
|  | 24 | How does this work? | 
|  | 25 |  | 
|  | 26 | Whenever the kernel tries to access an address that is currently not | 
|  | 27 | accessible, the CPU generates a page fault exception and calls the | 
|  | 28 | page fault handler | 
|  | 29 |  | 
|  | 30 | void do_page_fault(struct pt_regs *regs, unsigned long error_code) | 
|  | 31 |  | 
|  | 32 | in arch/x86/mm/fault.c. The parameters on the stack are set up by | 
|  | 33 | the low level assembly glue in arch/x86/kernel/entry_32.S. The parameter | 
|  | 34 | regs is a pointer to the saved registers on the stack, error_code | 
|  | 35 | contains a reason code for the exception. | 
|  | 36 |  | 
|  | 37 | do_page_fault first obtains the inaccessible address from the CPU | 
|  | 38 | control register CR2. If the address is within the virtual address | 
|  | 39 | space of the process, the fault probably occurred because the page | 
|  | 40 | was not swapped in, was write protected, or something similar. However, | 
|  | 41 | we are interested in the other case: the address is not valid, there | 
|  | 42 | is no vma that contains this address. In this case, the kernel jumps | 
|  | 43 | to the bad_area label. | 
|  | 44 |  | 
|  | 45 | There it uses the address of the instruction that caused the exception | 
|  | 46 | (i.e. regs->eip) to find an address where the execution can continue | 
|  | 47 | (fixup). If this search is successful, the fault handler modifies the | 
|  | 48 | return address (again regs->eip) and returns. The execution will | 
|  | 49 | continue at the address in fixup. | 
|  | 50 |  | 
|  | 51 | Where does fixup point to? | 
|  | 52 |  | 
|  | 53 | Since we jump to the contents of fixup, fixup obviously points | 
|  | 54 | to executable code. This code is hidden inside the user access macros. | 
|  | 55 | I have picked the get_user macro defined in arch/x86/include/asm/uaccess.h | 
|  | 56 | as an example. The definition is somewhat hard to follow, so let's peek at | 
|  | 57 | the code generated by the preprocessor and the compiler. I selected | 
|  | 58 | the get_user call in drivers/char/sysrq.c for a detailed examination. | 
|  | 59 |  | 
|  | 60 | The original code in sysrq.c line 587: | 
|  | 61 | get_user(c, buf); | 
|  | 62 |  | 
|  | 63 | The preprocessor output (edited to become somewhat readable): | 
|  | 64 |  | 
|  | 65 | ( | 
|  | 66 | { | 
|  | 67 | long __gu_err = - 14 , __gu_val = 0; | 
|  | 68 | const __typeof__(*( (  buf ) )) *__gu_addr = ((buf)); | 
|  | 69 | if (((((0 + current_set[0])->tss.segment) == 0x18 )  || | 
|  | 70 | (((sizeof(*(buf))) <= 0xC0000000UL) && | 
|  | 71 | ((unsigned long)(__gu_addr ) <= 0xC0000000UL - (sizeof(*(buf))))))) | 
|  | 72 | do { | 
|  | 73 | __gu_err  = 0; | 
|  | 74 | switch ((sizeof(*(buf)))) { | 
|  | 75 | case 1: | 
|  | 76 | __asm__ __volatile__( | 
|  | 77 | "1:      mov" "b" " %2,%" "b" "1\n" | 
|  | 78 | "2:\n" | 
|  | 79 | ".section .fixup,\"ax\"\n" | 
|  | 80 | "3:      movl %3,%0\n" | 
|  | 81 | "        xor" "b" " %" "b" "1,%" "b" "1\n" | 
|  | 82 | "        jmp 2b\n" | 
|  | 83 | ".section __ex_table,\"a\"\n" | 
|  | 84 | "        .align 4\n" | 
|  | 85 | "        .long 1b,3b\n" | 
|  | 86 | ".text"        : "=r"(__gu_err), "=q" (__gu_val): "m"((*(struct __large_struct *) | 
|  | 87 | (   __gu_addr   )) ), "i"(- 14 ), "0"(  __gu_err  )) ; | 
|  | 88 | break; | 
|  | 89 | case 2: | 
|  | 90 | __asm__ __volatile__( | 
|  | 91 | "1:      mov" "w" " %2,%" "w" "1\n" | 
|  | 92 | "2:\n" | 
|  | 93 | ".section .fixup,\"ax\"\n" | 
|  | 94 | "3:      movl %3,%0\n" | 
|  | 95 | "        xor" "w" " %" "w" "1,%" "w" "1\n" | 
|  | 96 | "        jmp 2b\n" | 
|  | 97 | ".section __ex_table,\"a\"\n" | 
|  | 98 | "        .align 4\n" | 
|  | 99 | "        .long 1b,3b\n" | 
|  | 100 | ".text"        : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *) | 
|  | 101 | (   __gu_addr   )) ), "i"(- 14 ), "0"(  __gu_err  )); | 
|  | 102 | break; | 
|  | 103 | case 4: | 
|  | 104 | __asm__ __volatile__( | 
|  | 105 | "1:      mov" "l" " %2,%" "" "1\n" | 
|  | 106 | "2:\n" | 
|  | 107 | ".section .fixup,\"ax\"\n" | 
|  | 108 | "3:      movl %3,%0\n" | 
|  | 109 | "        xor" "l" " %" "" "1,%" "" "1\n" | 
|  | 110 | "        jmp 2b\n" | 
|  | 111 | ".section __ex_table,\"a\"\n" | 
|  | 112 | "        .align 4\n"        "        .long 1b,3b\n" | 
|  | 113 | ".text"        : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *) | 
|  | 114 | (   __gu_addr   )) ), "i"(- 14 ), "0"(__gu_err)); | 
|  | 115 | break; | 
|  | 116 | default: | 
|  | 117 | (__gu_val) = __get_user_bad(); | 
|  | 118 | } | 
|  | 119 | } while (0) ; | 
|  | 120 | ((c)) = (__typeof__(*((buf))))__gu_val; | 
|  | 121 | __gu_err; | 
|  | 122 | } | 
|  | 123 | ); | 
|  | 124 |  | 
|  | 125 | WOW! Black GCC/assembly magic. This is impossible to follow, so let's | 
|  | 126 | see what code gcc generates: | 
|  | 127 |  | 
|  | 128 | >         xorl %edx,%edx | 
|  | 129 | >         movl current_set,%eax | 
|  | 130 | >         cmpl $24,788(%eax) | 
|  | 131 | >         je .L1424 | 
|  | 132 | >         cmpl $-1073741825,64(%esp) | 
|  | 133 | >         ja .L1423 | 
|  | 134 | > .L1424: | 
|  | 135 | >         movl %edx,%eax | 
|  | 136 | >         movl 64(%esp),%ebx | 
|  | 137 | > #APP | 
|  | 138 | > 1:      movb (%ebx),%dl                /* this is the actual user access */ | 
|  | 139 | > 2: | 
|  | 140 | > .section .fixup,"ax" | 
|  | 141 | > 3:      movl $-14,%eax | 
|  | 142 | >         xorb %dl,%dl | 
|  | 143 | >         jmp 2b | 
|  | 144 | > .section __ex_table,"a" | 
|  | 145 | >         .align 4 | 
|  | 146 | >         .long 1b,3b | 
|  | 147 | > .text | 
|  | 148 | > #NO_APP | 
|  | 149 | > .L1423: | 
|  | 150 | >         movzbl %dl,%esi | 
|  | 151 |  | 
|  | 152 | The optimizer does a good job and gives us something we can actually | 
|  | 153 | understand. Can we? The actual user access is quite obvious. Thanks | 
|  | 154 | to the unified address space we can just access the address in user | 
|  | 155 | memory. But what does the .section stuff do????? | 
|  | 156 |  | 
|  | 157 | To understand this we have to look at the final kernel: | 
|  | 158 |  | 
|  | 159 | > objdump --section-headers vmlinux | 
|  | 160 | > | 
|  | 161 | > vmlinux:     file format elf32-i386 | 
|  | 162 | > | 
|  | 163 | > Sections: | 
|  | 164 | > Idx Name          Size      VMA       LMA       File off  Algn | 
|  | 165 | >   0 .text         00098f40  c0100000  c0100000  00001000  2**4 | 
|  | 166 | >                   CONTENTS, ALLOC, LOAD, READONLY, CODE | 
|  | 167 | >   1 .fixup        000016bc  c0198f40  c0198f40  00099f40  2**0 | 
|  | 168 | >                   CONTENTS, ALLOC, LOAD, READONLY, CODE | 
|  | 169 | >   2 .rodata       0000f127  c019a5fc  c019a5fc  0009b5fc  2**2 | 
|  | 170 | >                   CONTENTS, ALLOC, LOAD, READONLY, DATA | 
|  | 171 | >   3 __ex_table    000015c0  c01a9724  c01a9724  000aa724  2**2 | 
|  | 172 | >                   CONTENTS, ALLOC, LOAD, READONLY, DATA | 
|  | 173 | >   4 .data         0000ea58  c01abcf0  c01abcf0  000abcf0  2**4 | 
|  | 174 | >                   CONTENTS, ALLOC, LOAD, DATA | 
|  | 175 | >   5 .bss          00018e21  c01ba748  c01ba748  000ba748  2**2 | 
|  | 176 | >                   ALLOC | 
|  | 177 | >   6 .comment      00000ec4  00000000  00000000  000ba748  2**0 | 
|  | 178 | >                   CONTENTS, READONLY | 
|  | 179 | >   7 .note         00001068  00000ec4  00000ec4  000bb60c  2**0 | 
|  | 180 | >                   CONTENTS, READONLY | 
|  | 181 |  | 
|  | 182 | There are obviously two non-standard ELF sections in the generated object | 
|  | 183 | file. But first we want to find out what happened to our code in the | 
|  | 184 | final kernel executable: | 
|  | 185 |  | 
|  | 186 | > objdump --disassemble --section=.text vmlinux | 
|  | 187 | > | 
|  | 188 | > c017e785 <do_con_write+c1> xorl   %edx,%edx | 
|  | 189 | > c017e787 <do_con_write+c3> movl   0xc01c7bec,%eax | 
|  | 190 | > c017e78c <do_con_write+c8> cmpl   $0x18,0x314(%eax) | 
|  | 191 | > c017e793 <do_con_write+cf> je     c017e79f <do_con_write+db> | 
|  | 192 | > c017e795 <do_con_write+d1> cmpl   $0xbfffffff,0x40(%esp,1) | 
|  | 193 | > c017e79d <do_con_write+d9> ja     c017e7a7 <do_con_write+e3> | 
|  | 194 | > c017e79f <do_con_write+db> movl   %edx,%eax | 
|  | 195 | > c017e7a1 <do_con_write+dd> movl   0x40(%esp,1),%ebx | 
|  | 196 | > c017e7a5 <do_con_write+e1> movb   (%ebx),%dl | 
|  | 197 | > c017e7a7 <do_con_write+e3> movzbl %dl,%esi | 
|  | 198 |  | 
|  | 199 | The whole user memory access is reduced to 10 x86 machine instructions. | 
|  | 200 | The instructions bracketed in the .section directives are no longer | 
|  | 201 | in the normal execution path. They are located in a different section | 
|  | 202 | of the executable file: | 
|  | 203 |  | 
|  | 204 | > objdump --disassemble --section=.fixup vmlinux | 
|  | 205 | > | 
|  | 206 | > c0199ff5 <.fixup+10b5> movl   $0xfffffff2,%eax | 
|  | 207 | > c0199ffa <.fixup+10ba> xorb   %dl,%dl | 
|  | 208 | > c0199ffc <.fixup+10bc> jmp    c017e7a7 <do_con_write+e3> | 
|  | 209 |  | 
|  | 210 | And finally: | 
|  | 211 | > objdump --full-contents --section=__ex_table vmlinux | 
|  | 212 | > | 
|  | 213 | >  c01aa7c4 93c017c0 e09f19c0 97c017c0 99c017c0  ................ | 
|  | 214 | >  c01aa7d4 f6c217c0 e99f19c0 a5e717c0 f59f19c0  ................ | 
|  | 215 | >  c01aa7e4 080a18c0 01a019c0 0a0a18c0 04a019c0  ................ | 
|  | 216 |  | 
|  | 217 | or in human readable byte order: | 
|  | 218 |  | 
|  | 219 | >  c01aa7c4 c017c093 c0199fe0 c017c097 c017c099  ................ | 
|  | 220 | >  c01aa7d4 c017c2f6 c0199fe9 c017e7a5 c0199ff5  ................ | 
|  | 221 | ^^^^^^^^^^^^^^^^^ | 
|  | 222 | this is the interesting part! | 
|  | 223 | >  c01aa7e4 c0180a08 c019a001 c0180a0a c019a004  ................ | 
|  | 224 |  | 
|  | 225 | What happened? The assembly directives | 
|  | 226 |  | 
|  | 227 | .section .fixup,"ax" | 
|  | 228 | .section __ex_table,"a" | 
|  | 229 |  | 
|  | 230 | told the assembler to move the following code to the specified | 
|  | 231 | sections in the ELF object file. So the instructions | 
|  | 232 | 3:      movl $-14,%eax | 
|  | 233 | xorb %dl,%dl | 
|  | 234 | jmp 2b | 
|  | 235 | ended up in the .fixup section of the object file and the addresses | 
|  | 236 | .long 1b,3b | 
|  | 237 | ended up in the __ex_table section of the object file. 1b and 3b | 
|  | 238 | are local labels. The local label 1b (1b stands for next label 1 | 
|  | 239 | backward) is the address of the instruction that might fault, i.e. | 
|  | 240 | in our case the address of the label 1 is c017e7a5: | 
|  | 241 | the original assembly code: > 1:      movb (%ebx),%dl | 
|  | 242 | and linked in vmlinux     : > c017e7a5 <do_con_write+e1> movb   (%ebx),%dl | 
|  | 243 |  | 
|  | 244 | The local label 3 (backwards again) is the address of the code to handle | 
|  | 245 | the fault, in our case the actual value is c0199ff5: | 
|  | 246 | the original assembly code: > 3:      movl $-14,%eax | 
|  | 247 | and linked in vmlinux     : > c0199ff5 <.fixup+10b5> movl   $0xfffffff2,%eax | 
|  | 248 |  | 
|  | 249 | The assembly code | 
|  | 250 | > .section __ex_table,"a" | 
|  | 251 | >         .align 4 | 
|  | 252 | >         .long 1b,3b | 
|  | 253 |  | 
|  | 254 | becomes the value pair | 
|  | 255 | >  c01aa7d4 c017c2f6 c0199fe9 c017e7a5 c0199ff5  ................ | 
|  | 256 | ^this is ^this is | 
|  | 257 | 1b       3b | 
|  | 258 | c017e7a5,c0199ff5 in the exception table of the kernel. | 
|  | 259 |  | 
|  | 260 | So, what actually happens if a fault from kernel mode with no suitable | 
|  | 261 | vma occurs? | 
|  | 262 |  | 
|  | 263 | 1.) access to invalid address: | 
|  | 264 | > c017e7a5 <do_con_write+e1> movb   (%ebx),%dl | 
|  | 265 | 2.) MMU generates exception | 
|  | 266 | 3.) CPU calls do_page_fault | 
|  | 267 | 4.) do_page_fault calls search_exception_table (regs->eip == c017e7a5); | 
|  | 268 | 5.) search_exception_table looks up the address c017e7a5 in the | 
|  | 269 | exception table (i.e. the contents of the ELF section __ex_table) | 
|  | 270 | and returns the address of the associated fault handle code c0199ff5. | 
|  | 271 | 6.) do_page_fault modifies its own return address to point to the fault | 
|  | 272 | handle code and returns. | 
|  | 273 | 7.) execution continues in the fault handling code. | 
|  | 274 | 8.) 8a) EAX becomes -EFAULT (== -14) | 
|  | 275 | 8b) DL  becomes zero (the value we "read" from user space) | 
|  | 276 | 8c) execution continues at local label 2 (address of the | 
|  | 277 | instruction immediately after the faulting user access). | 
|  | 278 |  | 
|  | 279 | The steps 8a to 8c in a certain way emulate the faulting instruction. | 
|  | 280 |  | 
|  | 281 | That's it, mostly. If you look at our example, you might ask why | 
|  | 282 | we set EAX to -EFAULT in the exception handler code. Well, the | 
|  | 283 | get_user macro actually returns a value: 0, if the user access was | 
|  | 284 | successful, -EFAULT on failure. Our original code did not test this | 
|  | 285 | return value, however the inline assembly code in get_user tries to | 
|  | 286 | return -EFAULT. GCC selected EAX to return this value. | 
|  | 287 |  | 
|  | 288 | NOTE: | 
|  | 289 | Due to the way that the exception table is built and needs to be ordered, | 
|  | 290 | only use exceptions for code in the .text section.  Any other section | 
|  | 291 | will cause the exception table to not be sorted correctly, and the | 
|  | 292 | exceptions will fail. | 
|  | 293 |  | 
|  | 294 | Things changed when 64-bit support was added to x86 Linux. Rather than | 
|  | 295 | double the size of the exception table by expanding the two entries | 
|  | 296 | from 32 bits to 64 bits, a clever trick was used to store addresses | 
|  | 297 | as relative offsets from the table itself. The assembly code changed | 
|  | 298 | from: | 
|  | 299 | .long 1b,3b | 
|  | 300 | to: | 
|  | 301 | .long (from) - . | 
|  | 302 | .long (to) - . | 
|  | 303 |  | 
|  | 304 | and the C-code that uses these values converts back to absolute addresses | 
|  | 305 | like this: | 
|  | 306 |  | 
|  | 307 | ex_insn_addr(const struct exception_table_entry *x) | 
|  | 308 | { | 
|  | 309 | return (unsigned long)&x->insn + x->insn; | 
|  | 310 | } | 
|  | 311 |  | 
|  | 312 | In v4.6 the exception table entry was expanded with a new field "handler". | 
|  | 313 | This is also 32 bits wide and contains a third relative function | 
|  | 314 | pointer which points to one of: | 
|  | 315 |  | 
|  | 316 | 1) int ex_handler_default(const struct exception_table_entry *fixup) | 
|  | 317 | This is the legacy case that just jumps to the fixup code | 
|  | 318 | 2) int ex_handler_fault(const struct exception_table_entry *fixup) | 
|  | 319 | This case provides the fault number of the trap that occurred at | 
|  | 320 | entry->insn. It is used to distinguish page faults from machine | 
|  | 321 | checks. | 
|  | 322 | 3) int ex_handler_ext(const struct exception_table_entry *fixup) | 
|  | 323 | This case is used for uaccess_err ... we need to set a flag | 
|  | 324 | in the task structure. Before the handler functions existed this | 
|  | 325 | case was handled by adding a large offset to the fixup to tag | 
|  | 326 | it as special. | 
|  | 327 | More functions can easily be added. |