// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2002 Richard Henderson
 * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
 * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
 * Copyright (C) 2024 Mike Rapoport IBM.
 */

#define pr_fmt(fmt) "execmem: " fmt

#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/execmem.h>
#include <linux/maple_tree.h>
#include <linux/set_memory.h>
#include <linux/moduleloader.h>
#include <linux/text-patching.h>

#include <asm/tlbflush.h>

#include "internal.h"

static struct execmem_info *execmem_info __ro_after_init;
static struct execmem_info default_execmem_info __ro_after_init;

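/*
 * Allocation helpers. execmem_vmalloc() services an allocation from the
 * address range described by an execmem_range: the primary window
 * (range->start..range->end) is tried first and, when that fails and a
 * fallback window is defined, fallback_start..fallback_end is tried next.
 * EXECMEM_KASAN_SHADOW additionally allocates KASAN shadow for the mapping.
 */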
#ifdef CONFIG_MMU
static void *execmem_vmalloc(struct execmem_range *range, size_t size,
			     pgprot_t pgprot, unsigned long vm_flags)
{
	bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
	unsigned int align = range->alignment;
	unsigned long start = range->start;
	unsigned long end = range->end;
	void *p;

	if (kasan)
		vm_flags |= VM_DEFER_KMEMLEAK;

	if (vm_flags & VM_ALLOW_HUGE_VMAP)
		align = PMD_SIZE;

	p = __vmalloc_node_range(size, align, start, end, gfp_flags,
				 pgprot, vm_flags, NUMA_NO_NODE,
				 __builtin_return_address(0));
	if (!p && range->fallback_start) {
		start = range->fallback_start;
		end = range->fallback_end;
		p = __vmalloc_node_range(size, align, start, end, gfp_flags,
					 pgprot, vm_flags, NUMA_NO_NODE,
					 __builtin_return_address(0));
	}

	if (!p) {
		pr_warn_ratelimited("unable to allocate memory\n");
		return NULL;
	}

	if (kasan && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) {
		vfree(p);
		return NULL;
	}

	return p;
}

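/*
 * execmem_vmap() only reserves virtual address space (no backing pages)
 * from the EXECMEM_MODULE_DATA range, again falling back to the range's
 * fallback window when the primary window cannot satisfy the request.
 */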
struct vm_struct *execmem_vmap(size_t size)
{
	struct execmem_range *range = &execmem_info->ranges[EXECMEM_MODULE_DATA];
	struct vm_struct *area;

	area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC,
				  range->start, range->end, NUMA_NO_NODE,
				  GFP_KERNEL, __builtin_return_address(0));
	if (!area && range->fallback_start)
		area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC,
					  range->fallback_start, range->fallback_end,
					  NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0));

	return area;
}
#else
static void *execmem_vmalloc(struct execmem_range *range, size_t size,
			     pgprot_t pgprot, unsigned long vm_flags)
{
	return vmalloc(size);
}
#endif /* CONFIG_MMU */

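/*
 * When the architecture supports it (CONFIG_ARCH_HAS_EXECMEM_ROX), execmem
 * keeps a cache of large, ROX-mapped areas backed by huge pages. Busy and
 * free chunks are tracked in two maple trees indexed by address, both
 * serialized by a single external mutex.
 */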
#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
struct execmem_cache {
	struct mutex mutex;
	struct maple_tree busy_areas;
	struct maple_tree free_areas;
};

static struct execmem_cache execmem_cache = {
	.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
	.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
				     execmem_cache.mutex),
	.free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN,
				     execmem_cache.mutex),
};

static inline unsigned long mas_range_len(struct ma_state *mas)
{
	return mas->last - mas->index + 1;
}

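/*
 * Flip direct map validity for all pages backing @vm, in blocks that match
 * the area's page order. If any page fails, the pages already updated are
 * flipped back so the direct map is left unchanged.
 */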
static int execmem_set_direct_map_valid(struct vm_struct *vm, bool valid)
{
	unsigned int nr = (1 << get_vm_area_page_order(vm));
	unsigned int updated = 0;
	int err = 0;

	for (int i = 0; i < vm->nr_pages; i += nr) {
		err = set_direct_map_valid_noflush(vm->pages[i], nr, valid);
		if (err)
			goto err_restore;
		updated += nr;
	}

	return 0;

err_restore:
	for (int i = 0; i < updated; i += nr)
		set_direct_map_valid_noflush(vm->pages[i], nr, !valid);

	return err;
}

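/*
 * Deferred cleanup of free_areas: fully PMD-aligned, PMD-sized free areas
 * are restored in the direct map, dropped from the tree and returned to
 * vmalloc.
 */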
static void execmem_cache_clean(struct work_struct *work)
{
	struct maple_tree *free_areas = &execmem_cache.free_areas;
	struct mutex *mutex = &execmem_cache.mutex;
	MA_STATE(mas, free_areas, 0, ULONG_MAX);
	void *area;

	mutex_lock(mutex);
	mas_for_each(&mas, area, ULONG_MAX) {
		size_t size = mas_range_len(&mas);

		if (IS_ALIGNED(size, PMD_SIZE) &&
		    IS_ALIGNED(mas.index, PMD_SIZE)) {
			struct vm_struct *vm = find_vm_area(area);

			execmem_set_direct_map_valid(vm, true);
			mas_store_gfp(&mas, NULL, GFP_KERNEL);
			vfree(area);
		}
	}
	mutex_unlock(mutex);
}

static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);

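/*
 * Return [ptr, ptr + size) to free_areas, merging with a free area that
 * ends immediately before @ptr and/or one that starts immediately after
 * it, so neighbouring free chunks coalesce into a single entry.
 */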
static int execmem_cache_add(void *ptr, size_t size)
{
	struct maple_tree *free_areas = &execmem_cache.free_areas;
	struct mutex *mutex = &execmem_cache.mutex;
	unsigned long addr = (unsigned long)ptr;
	MA_STATE(mas, free_areas, addr - 1, addr + 1);
	unsigned long lower, upper;
	void *area = NULL;
	int err;

	lower = addr;
	upper = addr + size - 1;

	mutex_lock(mutex);
	area = mas_walk(&mas);
	if (area && mas.last == addr - 1)
		lower = mas.index;

	area = mas_next(&mas, ULONG_MAX);
	if (area && mas.index == addr + size)
		upper = mas.last;

	mas_set_range(&mas, lower, upper);
	err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL);
	mutex_unlock(mutex);
	if (err)
		return err;

	return 0;
}

static bool within_range(struct execmem_range *range, struct ma_state *mas,
			 size_t size)
{
	unsigned long addr = mas->index;

	if (addr >= range->start && addr + size < range->end)
		return true;

	if (range->fallback_start &&
	    addr >= range->fallback_start && addr + size < range->fallback_end)
		return true;

	return false;
}

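/*
 * Carve @size bytes out of the first free area that is large enough and
 * lies within @range: the carved chunk moves to busy_areas and whatever
 * remains of the free area is re-inserted into free_areas.
 */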
static void *__execmem_cache_alloc(struct execmem_range *range, size_t size)
{
	struct maple_tree *free_areas = &execmem_cache.free_areas;
	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
	MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
	MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
	struct mutex *mutex = &execmem_cache.mutex;
	unsigned long addr, last, area_size = 0;
	void *area, *ptr = NULL;
	int err;

	mutex_lock(mutex);
	mas_for_each(&mas_free, area, ULONG_MAX) {
		area_size = mas_range_len(&mas_free);

		if (area_size >= size && within_range(range, &mas_free, size))
			break;
	}

	if (area_size < size)
		goto out_unlock;

	addr = mas_free.index;
	last = mas_free.last;

	/* insert allocated size to busy_areas at range [addr, addr + size) */
	mas_set_range(&mas_busy, addr, addr + size - 1);
	err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL);
	if (err)
		goto out_unlock;

	mas_store_gfp(&mas_free, NULL, GFP_KERNEL);
	if (area_size > size) {
		void *ptr = (void *)(addr + size);

		/*
		 * re-insert remaining free size to free_areas at range
		 * [addr + size, last]
		 */
		mas_set_range(&mas_free, addr + size, last);
		err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL);
		if (err) {
			mas_store_gfp(&mas_busy, NULL, GFP_KERNEL);
			goto out_unlock;
		}
	}
	ptr = (void *)addr;

out_unlock:
	mutex_unlock(mutex);
	return ptr;
}

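/*
 * Grow the cache: allocate a PMD-aligned chunk with huge-page mappings,
 * fill it with trapping instructions, drop its pages from the direct map
 * and remap the chunk with the range's (ROX) protections before handing
 * it to execmem_cache_add().
 */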
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
	unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
	unsigned long start, end;
	struct vm_struct *vm;
	size_t alloc_size;
	int err = -ENOMEM;
	void *p;

	alloc_size = round_up(size, PMD_SIZE);
	p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
	if (!p)
		return err;

	vm = find_vm_area(p);
	if (!vm)
		goto err_free_mem;

	/* fill memory with instructions that will trap */
	execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);

	start = (unsigned long)p;
	end = start + alloc_size;

	vunmap_range(start, end);

	err = execmem_set_direct_map_valid(vm, false);
	if (err)
		goto err_free_mem;

	err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
				       PMD_SHIFT);
	if (err)
		goto err_free_mem;

	err = execmem_cache_add(p, alloc_size);
	if (err)
		goto err_free_mem;

	return 0;

err_free_mem:
	vfree(p);
	return err;
}

static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
	void *p;
	int err;

	p = __execmem_cache_alloc(range, size);
	if (p)
		return p;

	err = execmem_cache_populate(range, size);
	if (err)
		return NULL;

	return __execmem_cache_alloc(range, size);
}

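/*
 * If @ptr came from the ROX cache, poison it with trapping instructions,
 * move it from busy_areas back to free_areas and schedule the deferred
 * cleanup work. Returns false for pointers the cache does not own, so the
 * caller can fall back to vfree().
 */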
static bool execmem_cache_free(void *ptr)
{
	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
	struct mutex *mutex = &execmem_cache.mutex;
	unsigned long addr = (unsigned long)ptr;
	MA_STATE(mas, busy_areas, addr, addr);
	size_t size;
	void *area;

	mutex_lock(mutex);
	area = mas_walk(&mas);
	if (!area) {
		mutex_unlock(mutex);
		return false;
	}
	size = mas_range_len(&mas);

	mas_store_gfp(&mas, NULL, GFP_KERNEL);
	mutex_unlock(mutex);

	execmem_fill_trapping_insns(ptr, size, /* writable = */ false);

	execmem_cache_add(ptr, size);

	schedule_work(&execmem_cache_clean_work);

	return true;
}
#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
	return NULL;
}

static bool execmem_cache_free(void *ptr)
{
	return false;
}
#endif /* CONFIG_ARCH_HAS_EXECMEM_ROX */

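/*
 * Allocation entry point. A caller picks an execmem_type and pairs the
 * allocation with execmem_free(); for illustration only, a module-style
 * user might do:
 *
 *	void *p = execmem_alloc(EXECMEM_MODULE_TEXT, size);
 *	if (!p)
 *		return -ENOMEM;
 *	...
 *	execmem_free(p);
 */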
void *execmem_alloc(enum execmem_type type, size_t size)
{
	struct execmem_range *range = &execmem_info->ranges[type];
	bool use_cache = range->flags & EXECMEM_ROX_CACHE;
	unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
	pgprot_t pgprot = range->pgprot;
	void *p;

	if (use_cache)
		p = execmem_cache_alloc(range, size);
	else
		p = execmem_vmalloc(range, size, pgprot, vm_flags);

	return kasan_reset_tag(p);
}

void execmem_free(void *ptr)
{
	/*
	 * This memory may be RO, and freeing RO memory in an interrupt is not
	 * supported by vmalloc.
	 */
	WARN_ON(in_interrupt());

	if (!execmem_cache_free(ptr))
		vfree(ptr);
}

void *execmem_update_copy(void *dst, const void *src, size_t size)
{
	return text_poke_copy(dst, src, size);
}

bool execmem_is_rox(enum execmem_type type)
{
	return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE);
}

static bool execmem_validate(struct execmem_info *info)
{
	struct execmem_range *r = &info->ranges[EXECMEM_DEFAULT];

	if (!r->alignment || !r->start || !r->end || !pgprot_val(r->pgprot)) {
		pr_crit("Invalid parameters for execmem allocator, module loading will fail");
		return false;
	}

	if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX)) {
		for (int i = EXECMEM_DEFAULT; i < EXECMEM_TYPE_MAX; i++) {
			r = &info->ranges[i];

			if (r->flags & EXECMEM_ROX_CACHE) {
				pr_warn_once("ROX cache is not supported\n");
				r->flags &= ~EXECMEM_ROX_CACHE;
			}
		}
	}

	return true;
}

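/*
 * Any range the architecture left unset inherits its parameters from the
 * EXECMEM_DEFAULT range; EXECMEM_MODULE_DATA gets non-executable
 * PAGE_KERNEL protections instead of the default pgprot.
 */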
static void execmem_init_missing(struct execmem_info *info)
{
	struct execmem_range *default_range = &info->ranges[EXECMEM_DEFAULT];

	for (int i = EXECMEM_DEFAULT + 1; i < EXECMEM_TYPE_MAX; i++) {
		struct execmem_range *r = &info->ranges[i];

		if (!r->start) {
			if (i == EXECMEM_MODULE_DATA)
				r->pgprot = PAGE_KERNEL;
			else
				r->pgprot = default_range->pgprot;
			r->alignment = default_range->alignment;
			r->start = default_range->start;
			r->end = default_range->end;
			r->flags = default_range->flags;
			r->fallback_start = default_range->fallback_start;
			r->fallback_end = default_range->fallback_end;
		}
	}
}

struct execmem_info * __weak execmem_arch_setup(void)
{
	return NULL;
}

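/*
 * Pick up the architecture's execmem_info or, failing that, fall back to a
 * single default range spanning the vmalloc area; then validate it and
 * fill in any ranges the architecture did not describe.
 */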
static void __init __execmem_init(void)
{
	struct execmem_info *info = execmem_arch_setup();

	if (!info) {
		info = execmem_info = &default_execmem_info;
		info->ranges[EXECMEM_DEFAULT].start = VMALLOC_START;
		info->ranges[EXECMEM_DEFAULT].end = VMALLOC_END;
		info->ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL_EXEC;
		info->ranges[EXECMEM_DEFAULT].alignment = 1;
	}

	if (!execmem_validate(info))
		return;

	execmem_init_missing(info);

	execmem_info = info;
}

#ifdef CONFIG_ARCH_WANTS_EXECMEM_LATE
static int __init execmem_late_init(void)
{
	__execmem_init();
	return 0;
}
core_initcall(execmem_late_init);
#else
void __init execmem_init(void)
{
	__execmem_init();
}
#endif