arch/x86/kernel/ftrace.c

   1 /*
   2  * Code for replacing ftrace calls with jumps.
   3  *
   4  * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
   5  *
   6  * Thanks goes to Ingo Molnar, for suggesting the idea.
   7  * Mathieu Desnoyers, for suggesting postponing the modifications.
   8  * Arjan van de Ven, for keeping me straight, and explaining to me
   9  * the dangers of modifying code on the run.
  10  */
  11
  12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  13
  14 #include <linux/spinlock.h>
  15 #include <linux/hardirq.h>
  16 #include <linux/uaccess.h>
  17 #include <linux/ftrace.h>
  18 #include <linux/percpu.h>
  19 #include <linux/sched.h>
  20 #include <linux/init.h>
  21 #include <linux/list.h>
  22
  23 #include <trace/syscall.h>
  24
  25 #include <asm/cacheflush.h>
  26 #include <asm/ftrace.h>
  27 #include <asm/nops.h>
  28 #include <asm/nmi.h>
  29
  30
  31 #ifdef CONFIG_DYNAMIC_FTRACE
  32
  33 /*
  34  * modifying_code is set to notify NMIs that they need to use
  35  * memory barriers when entering or exiting. But we don't want
  36  * to burden NMIs with unnecessary memory barriers when code
  37  * modification is not being done (which is most of the time).
  38  *
  39  * A mutex is already held when ftrace_arch_code_modify_prepare
  40  * and post_process are called. No locks need to be taken here.
  41  *
  42  * Stop machine will make sure currently running NMIs are done
  43  * and new NMIs will see the updated variable before we need
  44  * to worry about NMIs doing memory barriers.
  45  */
   46 static int modifying_code __read_mostly;		/* 1 between ftrace_arch_code_modify_prepare() and ftrace_arch_code_modify_post_process(), else 0 */
   47 static DEFINE_PER_CPU(int, save_modifying_code);	/* value of modifying_code stored by ftrace_nmi_enter(), re-read by ftrace_nmi_exit() */
  48
  49 int ftrace_arch_code_modify_prepare(void)
  50 {
  51         set_kernel_text_rw();
  52         modifying_code = 1;
  53         return 0;
  54 }
  55
  56 int ftrace_arch_code_modify_post_process(void)
  57 {
  58         modifying_code = 0;
  59         set_kernel_text_ro();
  60         return 0;
  61 }
  62
  63 union ftrace_code_union {
  64         char code[MCOUNT_INSN_SIZE];
  65         struct {
  66                 char e8;
  67                 int offset;
  68         } __attribute__((packed));
  69 };
  70
  71 static int ftrace_calc_offset(long ip, long addr)
  72 {
  73         return (int)(addr - ip);
  74 }
  75
  76 static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
  77 {
  78         static union ftrace_code_union calc;
  79
  80         calc.e8         = 0xe8;
  81         calc.offset     = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
  82
  83         /*
  84          * No locking needed, this must be called via kstop_machine
  85          * which in essence is like running on a uniprocessor machine.
  86          */
  87         return calc.code;
  88 }
  89
  90 /*
  91  * Modifying code must take extra care. On an SMP machine, if
  92  * the code being modified is also being executed on another CPU
  93  * that CPU will have undefined results and possibly take a GPF.
   94  * We use kstop_machine to stop other CPUs from executing code.
  95  * But this does not stop NMIs from happening. We still need
  96  * to protect against that. We separate out the modification of
  97  * the code to take care of this.
  98  *
  99  * Two buffers are added: An IP buffer and a "code" buffer.
 100  *
 101  * 1) Put the instruction pointer into the IP buffer
 102  *    and the new code into the "code" buffer.
 103  * 2) Wait for any running NMIs to finish and set a flag that says
 104  *    we are modifying code, it is done in an atomic operation.
 105  * 3) Write the code
 106  * 4) clear the flag.
 107  * 5) Wait for any running NMIs to finish.
 108  *
 109  * If an NMI is executed, the first thing it does is to call
 110  * "ftrace_nmi_enter". This will check if the flag is set to write
 111  * and if it is, it will write what is in the IP and "code" buffers.
 112  *
 113  * The trick is, it does not matter if everyone is writing the same
 114  * content to the code location. Also, if a CPU is executing code
 115  * it is OK to write to that code location if the contents being written
 116  * are the same as what exists.
 117  */
 118
 119 #define MOD_CODE_WRITE_FLAG (1 << 31)   /* set when NMI should do the write */
 120 static atomic_t nmi_running = ATOMIC_INIT(0);
 121 static int mod_code_status;             /* holds return value of text write */
 122 static void *mod_code_ip;               /* holds the IP to write to */
 123 static void *mod_code_newcode;          /* holds the text to write to the IP */
 124
 125 static unsigned nmi_wait_count;
 126 static atomic_t nmi_update_count = ATOMIC_INIT(0);
 127
 128 int ftrace_arch_read_dyn_info(char *buf, int size)
 129 {
 130         int r;
 131
 132         r = snprintf(buf, size, "%u %u",
 133                      nmi_wait_count,
 134                      atomic_read(&nmi_update_count));
 135         return r;
 136 }
 137
 138 static void clear_mod_flag(void)
 139 {
 140         int old = atomic_read(&nmi_running);
 141
 142         for (;;) {
 143                 int new = old & ~MOD_CODE_WRITE_FLAG;
 144
 145                 if (old == new)
 146                         break;
 147
 148                 old = atomic_cmpxchg(&nmi_running, old, new);
 149         }
 150 }
 151
 152 static void ftrace_mod_code(void)
 153 {
 154         /*
 155          * Yes, more than one CPU process can be writing to mod_code_status.
 156          *    (and the code itself)
 157          * But if one were to fail, then they all should, and if one were
 158          * to succeed, then they all should.
 159          */
 160         mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
 161                                              MCOUNT_INSN_SIZE);
 162
 163         /* if we fail, then kill any new writers */
 164         if (mod_code_status)
 165                 clear_mod_flag();
 166 }
 167
 168 void ftrace_nmi_enter(void)
 169 {
 170         __get_cpu_var(save_modifying_code) = modifying_code;
 171
 172         if (!__get_cpu_var(save_modifying_code))
 173                 return;
 174
 175         if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
 176                 smp_rmb();
 177                 ftrace_mod_code();
 178                 atomic_inc(&nmi_update_count);
 179         }
 180         /* Must have previous changes seen before executions */
 181         smp_mb();
 182 }
 183
 184 void ftrace_nmi_exit(void)
 185 {
 186         if (!__get_cpu_var(save_modifying_code))
 187                 return;
 188
 189         /* Finish all executions before clearing nmi_running */
 190         smp_mb();
 191         atomic_dec(&nmi_running);
 192 }
 193
 194 static void wait_for_nmi_and_set_mod_flag(void)
 195 {
 196         if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
 197                 return;
 198
 199         do {
 200                 cpu_relax();
 201         } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
 202
 203         nmi_wait_count++;
 204 }
 205
 206 static void wait_for_nmi(void)
 207 {
 208         if (!atomic_read(&nmi_running))
 209                 return;
 210
 211         do {
 212                 cpu_relax();
 213         } while (atomic_read(&nmi_running));
 214
 215         nmi_wait_count++;
 216 }
 217
 218 static inline int
 219 within(unsigned long addr, unsigned long start, unsigned long end)
 220 {
 221         return addr >= start && addr < end;
 222 }
 223
 224 static int
 225 do_ftrace_mod_code(unsigned long ip, void *new_code)
 226 {
 227         /*
 228          * On x86_64, kernel text mappings are mapped read-only with
 229          * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
 230          * of the kernel text mapping to modify the kernel text.
 231          *
 232          * For 32bit kernels, these mappings are same and we can use
 233          * kernel identity mapping to modify code.
 234          */
 235         if (within(ip, (unsigned long)_text, (unsigned long)_etext))
 236                 ip = (unsigned long)__va(__pa(ip));
 237
 238         mod_code_ip = (void *)ip;
 239         mod_code_newcode = new_code;
 240
 241         /* The buffers need to be visible before we let NMIs write them */
 242         smp_mb();
 243
 244         wait_for_nmi_and_set_mod_flag();
 245
 246         /* Make sure all running NMIs have finished before we write the code */
 247         smp_mb();
 248
 249         ftrace_mod_code();
 250
 251         /* Make sure the write happens before clearing the bit */
 252         smp_mb();
 253
 254         clear_mod_flag();
 255         wait_for_nmi();
 256
 257         return mod_code_status;
 258 }
 259
 260 static unsigned char *ftrace_nop_replace(void)
 261 {
 262         return ideal_nop5;
 263 }
 264
 265 static int
 266 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 267                    unsigned char *new_code)
 268 {
 269         unsigned char replaced[MCOUNT_INSN_SIZE];
 270
 271         /*
 272          * Note: Due to modules and __init, code can
 273          *  disappear and change, we need to protect against faulting
 274          *  as well as code changing. We do this by using the
 275          *  probe_kernel_* functions.
 276          *
 277          * No real locking needed, this code is run through
 278          * kstop_machine, or before SMP starts.
 279          */
 280
 281         /* read the text we want to modify */
 282         if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
 283                 return -EFAULT;
 284
 285         /* Make sure it is what we expect it to be */
 286         if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
 287                 return -EINVAL;
 288
 289         /* replace the text with the new text */
 290         if (do_ftrace_mod_code(ip, new_code))
 291                 return -EPERM;
 292
 293         sync_core();
 294
 295         return 0;
 296 }
 297
 298 int ftrace_make_nop(struct module *mod,
 299                     struct dyn_ftrace *rec, unsigned long addr)
 300 {
 301         unsigned char *new, *old;
 302         unsigned long ip = rec->ip;
 303
 304         old = ftrace_call_replace(ip, addr);
 305         new = ftrace_nop_replace();
 306
 307         return ftrace_modify_code(rec->ip, old, new);
 308 }
 309
 310 int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 311 {
 312         unsigned char *new, *old;
 313         unsigned long ip = rec->ip;
 314
 315         old = ftrace_nop_replace();
 316         new = ftrace_call_replace(ip, addr);
 317
 318         return ftrace_modify_code(rec->ip, old, new);
 319 }
 320
 321 int ftrace_update_ftrace_func(ftrace_func_t func)
 322 {
 323         unsigned long ip = (unsigned long)(&ftrace_call);
 324         unsigned char old[MCOUNT_INSN_SIZE], *new;
 325         int ret;
 326
 327         memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
 328         new = ftrace_call_replace(ip, (unsigned long)func);
 329         ret = ftrace_modify_code(ip, old, new);
 330
 331         return ret;
 332 }
 333
 334 int __init ftrace_dyn_arch_init(void *data)
 335 {
 336         /* The return code is retured via data */
 337         *(unsigned long *)data = 0;
 338
 339         return 0;
 340 }
 341 #endif
 342
 343 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 344
 345 #ifdef CONFIG_DYNAMIC_FTRACE
  346 extern void ftrace_graph_call(void);	/* only its address is used below: the jump patched by ftrace_{enable,disable}_ftrace_graph_caller() */
 347
 348 static int ftrace_mod_jmp(unsigned long ip,
 349                           int old_offset, int new_offset)
 350 {
 351         unsigned char code[MCOUNT_INSN_SIZE];
 352
 353         if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
 354                 return -EFAULT;
 355
 356         if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
 357                 return -EINVAL;
 358
 359         *(int *)(&code[1]) = new_offset;
 360
 361         if (do_ftrace_mod_code(ip, &code))
 362                 return -EPERM;
 363
 364         return 0;
 365 }
 366
 367 int ftrace_enable_ftrace_graph_caller(void)
 368 {
 369         unsigned long ip = (unsigned long)(&ftrace_graph_call);
 370         int old_offset, new_offset;
 371
 372         old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
 373         new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
 374
 375         return ftrace_mod_jmp(ip, old_offset, new_offset);
 376 }
 377
 378 int ftrace_disable_ftrace_graph_caller(void)
 379 {
 380         unsigned long ip = (unsigned long)(&ftrace_graph_call);
 381         int old_offset, new_offset;
 382
 383         old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
 384         new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
 385
 386         return ftrace_mod_jmp(ip, old_offset, new_offset);
 387 }
 388
  389 #endif /* CONFIG_DYNAMIC_FTRACE */
 390
 391 /*
 392  * Hook the return address and push it in the stack of return addrs
 393  * in current thread info.
 394  */
 395 void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
 396                            unsigned long frame_pointer)
 397 {
 398         unsigned long old;
 399         int faulted;
 400         struct ftrace_graph_ent trace;
 401         unsigned long return_hooker = (unsigned long)
 402                                 &return_to_handler;
 403
 404         if (unlikely(atomic_read(&current->tracing_graph_pause)))
 405                 return;
 406
 407         /*
 408          * Protect against fault, even if it shouldn't
 409          * happen. This tool is too much intrusive to
 410          * ignore such a protection.
 411          */
 412         asm volatile(
 413                 "1: " _ASM_MOV " (%[parent]), %[old]\n"
 414                 "2: " _ASM_MOV " %[return_hooker], (%[parent])\n"
 415                 "   movl $0, %[faulted]\n"
 416                 "3:\n"
 417
 418                 ".section .fixup, \"ax\"\n"
 419                 "4: movl $1, %[faulted]\n"
 420                 "   jmp 3b\n"
 421                 ".previous\n"
 422
 423                 _ASM_EXTABLE(1b, 4b)
 424                 _ASM_EXTABLE(2b, 4b)
 425
 426                 : [old] "=&r" (old), [faulted] "=r" (faulted)
 427                 : [parent] "r" (parent), [return_hooker] "r" (return_hooker)
 428                 : "memory"
 429         );
 430
 431         if (unlikely(faulted)) {
 432                 ftrace_graph_stop();
 433                 WARN_ON(1);
 434                 return;
 435         }
 436
 437         if (ftrace_push_return_trace(old, self_addr, &trace.depth,
 438                     frame_pointer) == -EBUSY) {
 439                 *parent = old;
 440                 return;
 441         }
 442
 443         trace.func = self_addr;
 444
 445         /* Only trace if the calling function expects to */
 446         if (!ftrace_graph_entry(&trace)) {
 447                 current->curr_ret_stack--;
 448                 *parent = old;
 449         }
 450 }
 451 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */