/* smp.c: Sparc64 SMP support.
 *
 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/threads.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/spinlock.h>

#include <asm/ptrace.h>
#include <asm/atomic.h>

#include <asm/pgtable.h>
#include <asm/oplib.h>
#include <asm/hardirq.h>
#include <asm/softirq.h>
#include <asm/uaccess.h>
#include <asm/timer.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
extern int linux_num_cpus;
extern void calibrate_delay(void);
extern unsigned prom_cpu_nodes[];
struct cpuinfo_sparc cpu_data[NR_CPUS] __attribute__ ((aligned (64)));

volatile int cpu_number_map[NR_CPUS] __attribute__ ((aligned (64)));
volatile int __cpu_logical_map[NR_CPUS] __attribute__ ((aligned (64)));
/* Please don't make this stuff initdata!!!  --DaveM */
static unsigned char boot_cpu_id = 0;
static int smp_activated = 0;

spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED;

volatile int smp_processors_ready = 0;
unsigned long cpu_present_map = 0;

int smp_threads_ready = 0;
void __init smp_setup(char *str, int *ints)
{
	/* XXX implement me XXX */
}
int smp_info(char *buf)
{
	int len = 7, i;	/* 7 == strlen("State:\n") */

	strcpy(buf, "State:\n");
	for (i = 0; i < NR_CPUS; i++)
		if (cpu_present_map & (1UL << i))
			len += sprintf(buf + len,
				       "CPU%d:\t\tonline\n", i);
	return len;
}
int smp_bogo(char *buf)
{
	int len = 0, i;

	for (i = 0; i < NR_CPUS; i++)
		if (cpu_present_map & (1UL << i))
			len += sprintf(buf + len,
				       "Cpu%dBogo\t: %lu.%02lu\n",
				       i, cpu_data[i].udelay_val / 500000,
				       (cpu_data[i].udelay_val / 5000) % 100);
	return len;
}
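/* The BogoMIPS value printed above is derived from udelay_val
 * (loops_per_sec): the integer part is udelay_val / 500000 and the
 * hundredths are (udelay_val / 5000) % 100.  A hypothetical cpu with
 * udelay_val == 99328000 would print "198.65".
 */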
void __init smp_store_cpu_info(int id)
{
	int i;

	cpu_data[id].irq_count = 0;
	cpu_data[id].bh_count = 0;
	/* multiplier and counter set by
	   smp_setup_percpu_timer()  */
	cpu_data[id].udelay_val = loops_per_sec;

	cpu_data[id].pgcache_size = 0;
	cpu_data[id].pte_cache = NULL;
	cpu_data[id].pgdcache_size = 0;
	cpu_data[id].pgd_cache = NULL;
	cpu_data[id].idle_volume = 1;

	for (i = 0; i < 16; i++)
		cpu_data[id].irq_worklists[i] = 0;
}
void __init smp_commence(void)
{
}

static void smp_setup_percpu_timer(void);
static void smp_tune_scheduling(void);

static volatile unsigned long callin_flag = 0;

extern void inherit_locked_prom_mappings(int save_p);
extern void cpu_probe(void);
void __init smp_callin(void)
{
	int cpuid = hard_smp_processor_id();

	inherit_locked_prom_mappings(0);
	/* Master did this already, now is the time for us to do it. */
	__asm__ __volatile__("
	sethi	%%hi(0x80000000), %%g1
	sllx	%%g1, 32, %%g1
	rd	%%tick, %%g2
	add	%%g2, 6, %%g2
	andn	%%g2, %%g1, %%g2
	wrpr	%%g2, 0, %%tick
"	: : : "g1", "g2");
	smp_setup_percpu_timer();

	smp_store_cpu_info(cpuid);

	__asm__ __volatile__("membar #Sync\n\t"
			     "flush  %%g6" : : : "memory");

	/* Clear this or we will die instantly when we
	 * schedule back to this idler...
	 */
	current->thread.flags &= ~(SPARC_FLAG_NEWCHILD);

	/* Attach to the address space of init_task. */
	atomic_inc(&init_mm.mm_count);
	current->active_mm = &init_mm;

	while (!smp_processors_ready)
		membar("#LoadLoad");
}
extern int cpu_idle(void);
extern void init_IRQ(void);

void initialize_secondary(void)
{
}

int start_secondary(void *unused)
{
	printk("CPU[%d]: Returns from cpu_idle!\n", smp_processor_id());
	panic("SMP bolixed\n");
}
extern struct prom_cpuinfo linux_cpus[64];

extern unsigned long smp_trampoline;

/* The OBP cpu startup callback truncates the 3rd arg cookie to
 * 32-bits (I think) so to be safe we have it read the pointer
 * contained here so we work on >4GB machines. -DaveM
 */
static struct task_struct *cpu_new_task = NULL;
void __init smp_boot_cpus(void)
{
	int cpucount = 0, i;

	printk("Entering UltraSMPenguin Mode...\n");
	smp_store_cpu_info(boot_cpu_id);
	smp_tune_scheduling();

	if (linux_num_cpus == 1)
		return;

	for (i = 0; i < NR_CPUS; i++) {
		if (i == boot_cpu_id)
			continue;

		if (cpu_present_map & (1UL << i)) {
			unsigned long entry = (unsigned long)(&smp_trampoline);
			unsigned long cookie = (unsigned long)(&cpu_new_task);
			struct task_struct *p;
			int timeout, no;
			extern unsigned long phys_base;

			entry += phys_base - KERNBASE;
			cookie += phys_base - KERNBASE;
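			/* entry and cookie are handed to the firmware as
			 * physical addresses (KERNBASE-virtual converted via
			 * phys_base); the trampoline loads the real 64-bit
			 * task pointer from *cookie, so only cpu_new_task's
			 * address has to survive the 32-bit cookie truncation
			 * described above cpu_new_task.
			 */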
			kernel_thread(start_secondary, NULL, CLONE_PID);

			p = init_task.prev_task;
			init_tasks[cpucount] = p;

			p->has_cpu = 1; /* we schedule the first task manually */

			del_from_runqueue(p);
			for (no = 0; no < linux_num_cpus; no++)
				if (linux_cpus[no].mid == i)
					break;

			prom_startcpu(linux_cpus[no].prom_node,
				      entry, cookie);
			for (timeout = 0; timeout < 5000000; timeout++) {
				if (callin_flag)
					break;
				udelay(100);
			}
			if (callin_flag) {
				cpu_number_map[i] = cpucount;
				__cpu_logical_map[cpucount] = i;
				prom_cpu_nodes[i] = linux_cpus[no].prom_node;
			} else {
				cpucount--;
				printk("Processor %d is stuck.\n", i);
			}
		}
		if (!callin_flag) {
			cpu_present_map &= ~(1UL << i);
			cpu_number_map[i] = -1;
		}
	}
	if (cpucount == 0) {
		printk("Error: only one processor found.\n");
		cpu_present_map = (1UL << smp_processor_id());
	} else {
		unsigned long bogosum = 0;

		for (i = 0; i < NR_CPUS; i++) {
			if (cpu_present_map & (1UL << i))
				bogosum += cpu_data[i].udelay_val;
		}
		printk("Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
		       cpucount + 1,
		       (bogosum + 2500)/500000,
		       ((bogosum + 2500)/5000)%100);
	}
	smp_num_cpus = cpucount + 1;

	smp_processors_ready = 1;
	membar("#StoreStore | #StoreLoad");
}
/* #define XCALL_DEBUG */

static inline void xcall_deliver(u64 data0, u64 data1, u64 data2, u64 pstate, unsigned long cpu)
{
	u64 result, target = (cpu << 14) | 0x70;
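	/* target encodes the destination cpu's interrupt MID in the upper
	 * address bits (<< 14); 0x70 selects the interrupt vector dispatch
	 * register used with the ASI_UDB_INTR_W stores below.
	 */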
	printk("CPU[%d]: xcall(data[%016lx:%016lx:%016lx],tgt[%016lx])\n",
	       smp_processor_id(), data0, data1, data2, target);
	__asm__ __volatile__("
	wrpr	%1, %2, %%pstate
	: "r" (pstate), "i" (PSTATE_IE), "i" (ASI_UDB_INTR_W),
	  "r" (data0), "r" (data1), "r" (data2), "r" (target), "r" (0x10), "0" (tmp));
	/* NOTE: PSTATE_IE is still clear. */
	do {
		__asm__ __volatile__("ldxa [%%g0] %1, %0"
				     : "=r" (result)
				     : "i" (ASI_INTR_DISPATCH_STAT));

		__asm__ __volatile__("wrpr %0, 0x0, %%pstate"
				     : : "r" (pstate));
	} while (result & 0x1);
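	/* Bit 0 of the dispatch status polled above is the BUSY bit; once it
	 * drops, the paths below report either a stuck mondo or a NACK from
	 * the target cpu.
	 */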
	__asm__ __volatile__("wrpr %0, 0x0, %%pstate"
			     : : "r" (pstate));

	printk("CPU[%d]: mondo stuckage result[%016lx]\n",
	       smp_processor_id(), result);

	printk("CPU[%d]: Penguin %d NACK's master.\n", smp_processor_id(), cpu);
void smp_cross_call(unsigned long *func, u32 ctx, u64 data1, u64 data2)
{
	if (smp_processors_ready) {
		unsigned long mask = (cpu_present_map & ~(1UL << smp_processor_id()));
		u64 pstate, data0 = (((u64)ctx)<<32 | (((u64)func) & 0xffffffff));
		int i, ncpus = smp_num_cpus - 1;
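		/* data0 carries the MMU context in its upper 32 bits and the
		 * low 32 bits of the handler (one of the xcall_* routines
		 * declared below) in its lower half; data1 and data2 are
		 * delivered to the handler unchanged.
		 */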
		__asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
		for (i = 0; i < NR_CPUS; i++) {
			if (mask & (1UL << i)) {
				xcall_deliver(data0, data1, data2, pstate, i);
			}
		}

		/* NOTE: Caller runs local copy on master. */
	}
}
extern unsigned long xcall_flush_tlb_page;
extern unsigned long xcall_flush_tlb_mm;
extern unsigned long xcall_flush_tlb_range;
extern unsigned long xcall_flush_tlb_all;
extern unsigned long xcall_tlbcachesync;
extern unsigned long xcall_flush_cache_all;
extern unsigned long xcall_report_regs;
extern unsigned long xcall_receive_signal;
void smp_receive_signal(int cpu)
{
	if (smp_processors_ready &&
	    (cpu_present_map & (1UL<<cpu)) != 0) {
		u64 pstate, data0 = (((u64)&xcall_receive_signal) & 0xffffffff);

		__asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
		xcall_deliver(data0, 0, 0, pstate, cpu);
	}
}
void smp_report_regs(void)
{
	smp_cross_call(&xcall_report_regs, 0, 0, 0);
}

void smp_flush_cache_all(void)
{
	smp_cross_call(&xcall_flush_cache_all, 0, 0, 0);
}

void smp_flush_tlb_all(void)
{
	smp_cross_call(&xcall_flush_tlb_all, 0, 0, 0);
}
/* We know that the window frames of the user have been flushed
 * to the stack before we get here because all callers of us
 * are flush_tlb_*() routines, and these run after flush_cache_*()
 * which performs the flushw.
 *
 * XXX I diked out the fancy flush avoidance code for the
 * XXX swapping cases for now until the new MM code stabilizes. -DaveM
 *
 * The SMP TLB coherency scheme we use works as follows:
 *
 * 1) mm->cpu_vm_mask is a bit mask of which cpus an address
 *    space has (potentially) executed on, this is the heuristic
 *    we use to avoid doing cross calls.
 *
 * 2) TLB context numbers are shared globally across all processors
 *    in the system, this allows us to play several games to avoid
 *    cross calls.
 *
 *    One invariant is that when a cpu switches to a process, and
 *    that process's tsk->active_mm->cpu_vm_mask does not have the
 *    current cpu's bit set, that tlb context is flushed locally.
 *
 *    If the address space is non-shared (ie. mm->count == 1) we avoid
 *    cross calls when we want to flush the currently running process's
 *    tlb state.  This is done by clearing all cpu bits except the current
 *    processor's in current->active_mm->cpu_vm_mask and performing the
 *    flush locally only.  This will force any subsequent cpus which run
 *    this task to flush the context from the local tlb if the process
 *    migrates to another cpu (again).
 *
 * 3) For shared address spaces (threads) and swapping we bite the
 *    bullet for most cases and perform the cross call.
 *
 *    The performance gain from "optimizing" away the cross call for threads is
 *    questionable (in theory the big win for threads is the massive sharing of
 *    address space state across processors).
 *
 * For the swapping case the locking is difficult to get right, we'd have to
 * enforce strict ordered access to mm->cpu_vm_mask via a spinlock for example.
 * Then again one could argue that when you are swapping, the cost of a cross
 * call won't even show up on the performance radar.  But in any case we do get
 * rid of the cross-call when the task has a dead context or the task has only
 * ever run on the local cpu.
 */
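/* The local-only fast path shared by the three flush routines below is,
 * schematically:
 *
 *	if (mm == current->active_mm &&
 *	    atomic_read(&mm->mm_users) == 1 &&
 *	    mm->cpu_vm_mask == (1UL << smp_processor_id()))
 *		goto local_flush_and_out;	(no cross call at all)
 */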
void smp_flush_tlb_mm(struct mm_struct *mm)
{
	u32 ctx = CTX_HWBITS(mm->context);

	if (mm == current->active_mm &&
	    atomic_read(&mm->mm_users) == 1 &&
	    (mm->cpu_vm_mask == (1UL << smp_processor_id())))
		goto local_flush_and_out;

	smp_cross_call(&xcall_flush_tlb_mm, ctx, 0, 0);

local_flush_and_out:
	__flush_tlb_mm(ctx, SECONDARY_CONTEXT);
}
void smp_flush_tlb_range(struct mm_struct *mm, unsigned long start,
			 unsigned long end)
{
	u32 ctx = CTX_HWBITS(mm->context);

	if (mm == current->active_mm &&
	    atomic_read(&mm->mm_users) == 1 &&
	    (mm->cpu_vm_mask == (1UL << smp_processor_id())))
		goto local_flush_and_out;

	smp_cross_call(&xcall_flush_tlb_range, ctx, start, end);

local_flush_and_out:
	__flush_tlb_range(ctx, start, SECONDARY_CONTEXT, end, PAGE_SIZE, (end-start));
}
void smp_flush_tlb_page(struct mm_struct *mm, unsigned long page)
{
	u32 ctx = CTX_HWBITS(mm->context);

	if (mm == current->active_mm &&
	    atomic_read(&mm->mm_users) == 1 &&
	    (mm->cpu_vm_mask == (1UL << smp_processor_id()))) {
		goto local_flush_and_out;
	}

	smp_cross_call(&xcall_flush_tlb_page, ctx, page, 0);

local_flush_and_out:
	__flush_tlb_page(ctx, page, SECONDARY_CONTEXT);
}
/* #define CAPTURE_DEBUG */
extern unsigned long xcall_capture;

static atomic_t smp_capture_depth = ATOMIC_INIT(0);
static atomic_t smp_capture_registry = ATOMIC_INIT(0);
static unsigned long penguins_are_doing_time = 0;
void smp_capture(void)
{
	if (smp_processors_ready) {
		int result = atomic_add_return(1, &smp_capture_depth);

		membar("#StoreStore | #LoadStore");
		if (result == 1) {
			int ncpus = smp_num_cpus;

			printk("CPU[%d]: Sending penguins to jail...",
			       smp_processor_id());
			penguins_are_doing_time = 1;
			membar("#StoreStore | #LoadStore");
			atomic_inc(&smp_capture_registry);
			smp_cross_call(&xcall_capture, 0, 0, 0);
			while (atomic_read(&smp_capture_registry) != ncpus)
				membar("#LoadLoad");
		}
	}
}
void smp_release(void)
{
	if (smp_processors_ready) {
		if (atomic_dec_and_test(&smp_capture_depth)) {
			printk("CPU[%d]: Giving pardon to imprisoned penguins\n",
			       smp_processor_id());
			penguins_are_doing_time = 0;
			membar("#StoreStore | #StoreLoad");
			atomic_dec(&smp_capture_registry);
		}
	}
}
/* Imprisoned penguins run with %pil == 15, but PSTATE_IE set, so they
 * can service tlb flush xcalls...
 */
void smp_penguin_jailcell(void)
{
	atomic_inc(&smp_capture_registry);
	membar("#StoreLoad | #StoreStore");
	while (penguins_are_doing_time)
		membar("#LoadLoad");
	atomic_dec(&smp_capture_registry);
}
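/* Typical usage is a sketch only, no call sites live in this file:
 *
 *	smp_capture();
 *	... run with every other cpu parked in smp_penguin_jailcell() ...
 *	smp_release();
 *
 * smp_capture_depth makes the pair nestable: only the outermost
 * smp_capture() rounds the penguins up and only the matching outermost
 * smp_release() lets them go.
 */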
static inline void sparc64_do_profile(unsigned long pc, unsigned long g3)
{
	if (prof_buffer && current->pid) {
		extern int rwlock_impl_begin, rwlock_impl_end;
		extern int atomic_impl_begin, atomic_impl_end;

		if ((pc >= (unsigned long) &rwlock_impl_begin &&
		     pc < (unsigned long) &rwlock_impl_end) ||
		    (pc >= (unsigned long) &atomic_impl_begin &&
		     pc < (unsigned long) &atomic_impl_end))
			pc = g3;

		pc -= (unsigned long) &_stext;

		atomic_inc((atomic_t *)&prof_buffer[pc]);
	}
}
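/* The range checks above re-attribute profiling hits that land inside the
 * out-of-line rwlock/atomic stubs: for such samples the pc is replaced by
 * the %g3 value taken from the trap frame, presumably so the stub's caller
 * gets charged rather than the stub itself.
 */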
static unsigned long current_tick_offset;

#define prof_multiplier(__cpu)	cpu_data[(__cpu)].multiplier
#define prof_counter(__cpu)	cpu_data[(__cpu)].counter
extern void update_one_process(struct task_struct *p, unsigned long ticks,
			       unsigned long user, unsigned long system,
			       int cpu);
void smp_percpu_timer_interrupt(struct pt_regs *regs)
{
	unsigned long compare, tick;
	int cpu = smp_processor_id();
	int user = user_mode(regs);

	/*
	 * Check for level 14 softint.
	 */
	if (!(get_softint() & (1UL << 0))) {
		extern void handler_irq(int, struct pt_regs *);

		handler_irq(14, regs);
		return;
	}

	clear_softint((1UL << 0));
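	/* Bit zero of %softint is the %tick_cmpr (timer) bit.  If it was not
	 * set, the level-14 trap above came from an ordinary device interrupt
	 * and was handed to handler_irq(14, regs) instead of being treated as
	 * a timer/profiling tick.
	 */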
	do {
		sparc64_do_profile(regs->tpc, regs->u_regs[UREG_G3]);
		if (!--prof_counter(cpu)) {
			if (cpu == boot_cpu_id) {
/* XXX Keep this in sync with irq.c --DaveM */
#define irq_enter(cpu, irq)			\
do {	hardirq_enter(cpu);			\
	spin_unlock_wait(&global_irq_lock);	\
} while (0)
#define irq_exit(cpu, irq)	hardirq_exit(cpu)

				kstat.irqs[cpu][0]++;

				timer_tick_interrupt(regs);
			}
			unsigned int *inc, *inc2;

			update_one_process(current, 1, user, !user, cpu);
			if (--current->counter <= 0) {
				current->counter = 0;
				current->need_resched = 1;
			}

			if (user) {
				if (current->priority < DEF_PRIORITY) {
					inc = &kstat.cpu_nice;
					inc2 = &kstat.per_cpu_nice[cpu];
				} else {
					inc = &kstat.cpu_user;
					inc2 = &kstat.per_cpu_user[cpu];
				}
			} else {
				inc = &kstat.cpu_system;
				inc2 = &kstat.per_cpu_system[cpu];
			}
			atomic_inc((atomic_t *)inc);
			atomic_inc((atomic_t *)inc2);

			prof_counter(cpu) = prof_multiplier(cpu);
		}
		__asm__ __volatile__("rd	%%tick_cmpr, %0\n\t"
				     "add	%0, %2, %0\n\t"
				     "wr	%0, 0x0, %%tick_cmpr\n\t"
				     "rd	%%tick, %1"
				     : "=&r" (compare), "=r" (tick)
				     : "r" (current_tick_offset));
	} while (tick >= compare);
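	/* %tick is re-read after %tick_cmpr has been pushed forward; if the
	 * new compare value has already been passed, the whole tick is
	 * processed again rather than silently lost.
	 */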
}

static void __init smp_setup_percpu_timer(void)
{
	int cpu = smp_processor_id();

	prof_counter(cpu) = prof_multiplier(cpu) = 1;

	__asm__ __volatile__("rd	%%tick, %%g1\n\t"
			     "add	%%g1, %0, %%g1\n\t"
			     "wr	%%g1, 0x0, %%tick_cmpr"
			     : /* no outputs */
			     : "r" (current_tick_offset)
			     : "g1");
}
void __init smp_tick_init(void)
{
	int i;

	boot_cpu_id = hard_smp_processor_id();
	current_tick_offset = timer_tick_offset;

	for (i = 0; i < linux_num_cpus; i++)
		cpu_present_map |= (1UL << linux_cpus[i].mid);
	for (i = 0; i < NR_CPUS; i++) {
		cpu_number_map[i] = -1;
		__cpu_logical_map[i] = -1;
	}

	cpu_number_map[boot_cpu_id] = 0;
	prom_cpu_nodes[boot_cpu_id] = linux_cpus[0].prom_node;
	__cpu_logical_map[0] = boot_cpu_id;
	current->processor = boot_cpu_id;
	prof_counter(boot_cpu_id) = prof_multiplier(boot_cpu_id) = 1;
}
static inline unsigned long find_flush_base(unsigned long size)
{
	struct page *p = mem_map;
	unsigned long found, base;

	size = PAGE_ALIGN(size);

	base = page_address(p);

		if (p >= (mem_map + max_mapnr))
			return 0UL;

			base = page_address(p);

	return base;
}
cycles_t cacheflush_time;
static void __init smp_tune_scheduling(void)
{
	unsigned long flush_base, flags, *p;
	unsigned int ecache_size;
	cycles_t tick1, tick2, raw;

	/* Approximate heuristic for SMP scheduling.  It is an
	 * estimation of the time it takes to flush the L2 cache
	 * on the local processor.
	 *
	 * The ia32 chooses to use the L1 cache flush time instead,
	 * and I consider this complete nonsense.  The Ultra can service
	 * a miss to the L1 with a hit to the L2 in 7 or 8 cycles, and
	 * L2 misses are what create extra bus traffic (ie. the "cost"
	 * of moving a process from one cpu to another).
	 */
	printk("SMP: Calibrating ecache flush... ");
	ecache_size = prom_getintdefault(linux_cpus[0].prom_node,
					 "ecache-size", (512 * 1024));
	flush_base = find_flush_base(ecache_size << 1);

	if (flush_base != 0UL) {
		__save_and_cli(flags);

		/* Scan twice the size once just to get the TLB entries
		 * loaded and make sure the second scan measures pure misses.
		 */
		for (p = (unsigned long *)flush_base;
		     ((unsigned long)p) < (flush_base + (ecache_size << 1));
		     p += (64 / sizeof(unsigned long)))
			*((volatile unsigned long *)p);
		/* Now the real measurement. */
		__asm__ __volatile__("
	1:	ldx	[%2 + 0x000], %%g1
		ldx	[%2 + 0x040], %%g2
		ldx	[%2 + 0x080], %%g3
		ldx	[%2 + 0x0c0], %%g5
		: "=&r" (tick1), "=&r" (tick2), "=&r" (flush_base)
		: "2" (flush_base), "r" (flush_base + ecache_size)
		: "g1", "g2", "g3", "g5");

		__restore_flags(flags);
		raw = (tick2 - tick1);

		/* Dampen it a little, considering two processes
		 * sharing the cache and fitting.
		 */
		cacheflush_time = (raw - (raw >> 2));
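		/* i.e. keep 75% of the measured flush cost; with a
		 * hypothetical raw of 40000 ticks this yields a
		 * cacheflush_time of 30000.
		 */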
	} else
		cacheflush_time = ((ecache_size << 2) +

	printk("Using heuristic of %d cycles.\n",
	       (int) cacheflush_time);
}
/* /proc/profile writes can call this, don't __init it please. */
int setup_profiling_timer(unsigned int multiplier)
{
	unsigned long flags;
	int i;

	if ((!multiplier) || (timer_tick_offset / multiplier) < 1000)
		return -EINVAL;
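	/* i.e. a multiplier is rejected if it would re-arm the per-cpu timer
	 * more often than once every 1000 %tick cycles (current_tick_offset
	 * below becomes timer_tick_offset / multiplier).
	 */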
	save_and_cli(flags);
	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_present_map & (1UL << i))
			prof_multiplier(i) = multiplier;
	}
	current_tick_offset = (timer_tick_offset / multiplier);
	restore_flags(flags);

	return 0;
}