sys/arch/xen/x86/hypervisor_machdep.c (netbsd-mini2440.git)
/*	$NetBSD: hypervisor_machdep.c,v 1.12 2009/07/29 12:02:08 cegger Exp $	*/

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/******************************************************************************
 * hypervisor.c
 *
 * Communication to/from hypervisor.
 *
 * Copyright (c) 2002-2004, K A Fraser
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.12 2009/07/29 12:02:08 cegger Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>

#include <uvm/uvm_extern.h>

#include <machine/vmparam.h>
#include <machine/pmap.h>

#include <xen/xen.h>
#include <xen/hypervisor.h>
#include <xen/evtchn.h>
#include <xen/xenpmap.h>

#include "opt_xen.h"
/*
 * arch-dependent p2m frame lists list (L3 and L2)
 * used by Xen for save/restore mappings
 */
static unsigned long * l3_p2m_page;
static unsigned long * l2_p2m_page;
static int l2_p2m_page_size; /* size of L2 page, in pages */

static void build_p2m_frame_list_list(void);
static void update_p2m_frame_list_list(void);

// #define PORT_DEBUG 4
// #define EARLY_DEBUG_EVENT
int stipending(void);
int
stipending(void)
{
	unsigned long l1;
	unsigned long l2;
	unsigned int l1i, l2i, port;
	volatile shared_info_t *s = HYPERVISOR_shared_info;
	struct cpu_info *ci;
	volatile struct vcpu_info *vci;
	int ret;

	ret = 0;
	ci = curcpu();
	vci = ci->ci_vcpu;

#if 0
	if (HYPERVISOR_shared_info->events)
		printf("stipending events %08lx mask %08lx ilevel %d\n",
		    HYPERVISOR_shared_info->events,
		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel);
#endif

#ifdef EARLY_DEBUG_EVENT
	if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) {
		xen_debug_handler(NULL);
		xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port);
	}
#endif

	/*
	 * we're only called after STIC, so we know that we'll have to
	 * STI at the end
	 */
	while (vci->evtchn_upcall_pending) {
		cli();
		vci->evtchn_upcall_pending = 0;
		/* NB. No need for a barrier here -- XCHG is a barrier
		 * on x86. */
		l1 = xen_atomic_xchg(&vci->evtchn_pending_sel, 0);
		while ((l1i = xen_ffs(l1)) != 0) {
			l1i--;
			l1 &= ~(1UL << l1i);

			l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
			/*
			 * mask and clear event. More efficient than calling
			 * hypervisor_mask/clear_event for each event.
			 */
			xen_atomic_setbits_l(&s->evtchn_mask[l1i], l2);
			xen_atomic_clearbits_l(&s->evtchn_pending[l1i], l2);
			while ((l2i = xen_ffs(l2)) != 0) {
				l2i--;
				l2 &= ~(1UL << l2i);

				port = (l1i << LONG_SHIFT) + l2i;
				if (evtsource[port]) {
					hypervisor_set_ipending(
					    evtsource[port]->ev_imask,
					    l1i, l2i);
					evtsource[port]->ev_evcnt.ev_count++;
					if (ret == 0 && ci->ci_ilevel <
					    evtsource[port]->ev_maxlevel)
						ret = 1;
				}
#ifdef DOM0OPS
				else {
					/* set pending event */
					xenevt_setipending(l1i, l2i);
				}
#endif
			}
		}
		sti();
	}

#if 0
	if (ci->ci_ipending & 0x1)
		printf("stipending events %08lx mask %08lx ilevel %d ipending %08x\n",
		    HYPERVISOR_shared_info->events,
		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel,
		    ci->ci_ipending);
#endif

	return (ret);
}
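
/*
 * Illustrative sketch, not part of the original file: the comment in the
 * scan loops above notes that setting/clearing whole words with
 * xen_atomic_setbits_l()/xen_atomic_clearbits_l() is cheaper than touching
 * each event individually.  The per-event alternative it refers to would
 * look roughly like the hypothetical helper below -- one atomic pair per
 * pending event instead of one pair per word.
 */
#if 0
static void
example_mask_and_clear_each(unsigned int l1i, unsigned long pending)
{
	unsigned int l2i;

	while ((l2i = xen_ffs(pending)) != 0) {
		l2i--;
		pending &= ~(1UL << l2i);
		/* port = selector bit * bits-per-long + bit within word */
		hypervisor_mask_event((l1i << LONG_SHIFT) + l2i);
		hypervisor_clear_event((l1i << LONG_SHIFT) + l2i);
	}
}
#endif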
void
do_hypervisor_callback(struct intrframe *regs)
{
	unsigned long l1;
	unsigned long l2;
	unsigned int l1i, l2i, port;
	volatile shared_info_t *s = HYPERVISOR_shared_info;
	struct cpu_info *ci;
	volatile struct vcpu_info *vci;
	int level;

	ci = curcpu();
	vci = ci->ci_vcpu;
	level = ci->ci_ilevel;

	// DDD printf("do_hypervisor_callback\n");

#ifdef EARLY_DEBUG_EVENT
	if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) {
		xen_debug_handler(NULL);
		xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port);
	}
#endif

	while (vci->evtchn_upcall_pending) {
		vci->evtchn_upcall_pending = 0;
		/* NB. No need for a barrier here -- XCHG is a barrier
		 * on x86. */
		l1 = xen_atomic_xchg(&vci->evtchn_pending_sel, 0);
		while ((l1i = xen_ffs(l1)) != 0) {
			l1i--;
			l1 &= ~(1UL << l1i);

			l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
			/*
			 * mask and clear the pending events.
			 * Doing it here for all events that will be processed
			 * avoids a race with stipending (which can be called
			 * through evtchn_do_event->splx) that could cause an
			 * event to be both processed and marked pending.
			 */
			xen_atomic_setbits_l(&s->evtchn_mask[l1i], l2);
			xen_atomic_clearbits_l(&s->evtchn_pending[l1i], l2);

			while ((l2i = xen_ffs(l2)) != 0) {
				l2i--;
				l2 &= ~(1UL << l2i);

				port = (l1i << LONG_SHIFT) + l2i;
#ifdef PORT_DEBUG
				if (port == PORT_DEBUG)
					printf("do_hypervisor_callback event %d\n", port);
#endif
				if (evtsource[port])
					call_evtchn_do_event(port, regs);
#ifdef DOM0OPS
				else {
					if (ci->ci_ilevel < IPL_HIGH) {
						/* fast path */
						int oipl = ci->ci_ilevel;
						ci->ci_ilevel = IPL_HIGH;
						call_xenevt_event(port);
						ci->ci_ilevel = oipl;
					} else {
						/* set pending event */
						xenevt_setipending(l1i, l2i);
					}
				}
#endif
			}
		}
	}

#ifdef DIAGNOSTIC
	if (level != ci->ci_ilevel)
		printf("hypervisor done %08x level %d/%d ipending %08x\n",
		    (uint)vci->evtchn_pending_sel,
		    level, ci->ci_ilevel, ci->ci_ipending);
#endif
}
void
hypervisor_unmask_event(unsigned int ev)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
	volatile struct vcpu_info *vci = curcpu()->ci_vcpu;

#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_unmask_event %d\n", ev);
#endif

	xen_atomic_clear_bit(&s->evtchn_mask[0], ev);
	/*
	 * The following is basically the equivalent of
	 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the
	 * interrupt edge' if the channel is masked.
	 */
	if (xen_atomic_test_bit(&s->evtchn_pending[0], ev) &&
	    !xen_atomic_test_and_set_bit(&vci->evtchn_pending_sel, ev>>LONG_SHIFT)) {
		xen_atomic_set_bit(&vci->evtchn_upcall_pending, 0);
		if (!vci->evtchn_upcall_mask)
			hypervisor_force_callback();
	}
}
void
hypervisor_mask_event(unsigned int ev)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_mask_event %d\n", ev);
#endif

	xen_atomic_set_bit(&s->evtchn_mask[0], ev);
}
void
hypervisor_clear_event(unsigned int ev)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_clear_event %d\n", ev);
#endif

	xen_atomic_clear_bit(&s->evtchn_pending[0], ev);
}
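
/*
 * Illustrative sketch, not part of the original file: one plausible way a
 * caller could combine the three helpers above -- mask the channel while
 * servicing it, clear the pending bit, then unmask.  hypervisor_unmask_event()
 * re-raises the upcall if the channel became pending again while masked.
 * The helper name and port value are hypothetical.
 */
#if 0
static void
example_service_channel(unsigned int example_ev)
{
	hypervisor_mask_event(example_ev);
	hypervisor_clear_event(example_ev);
	/* ... service the device behind this event channel ... */
	hypervisor_unmask_event(example_ev);
}
#endif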
void
hypervisor_enable_ipl(unsigned int ipl)
{
	u_long l1, l2;
	int l1i, l2i;
	struct cpu_info *ci = curcpu();

	/*
	 * enable all events for ipl. As we only set an event in ipl_evt_mask
	 * for its lowest IPL, and pending IPLs are processed high to low,
	 * we know that all callbacks for this event have been processed.
	 */

	l1 = ci->ci_isources[ipl]->ipl_evt_mask1;
	ci->ci_isources[ipl]->ipl_evt_mask1 = 0;
	while ((l1i = xen_ffs(l1)) != 0) {
		l1i--;
		l1 &= ~(1UL << l1i);
		l2 = ci->ci_isources[ipl]->ipl_evt_mask2[l1i];
		ci->ci_isources[ipl]->ipl_evt_mask2[l1i] = 0;
		while ((l2i = xen_ffs(l2)) != 0) {
			int evtch;

			l2i--;
			l2 &= ~(1UL << l2i);

			evtch = (l1i << LONG_SHIFT) + l2i;
			hypervisor_enable_event(evtch);
		}
	}
}
void
hypervisor_set_ipending(uint32_t iplmask, int l1, int l2)
{
	int ipl;
	struct cpu_info *ci = curcpu();

	/* set pending bit for the appropriate IPLs */
	ci->ci_ipending |= iplmask;

	/*
	 * And set the event pending bit for the lowest IPL. As IPLs are
	 * handled from high to low, this ensures that all callbacks will
	 * have been called when we ack the event.
	 */
	ipl = ffs(iplmask);
	KASSERT(ipl > 0);
	ipl--;
	ci->ci_isources[ipl]->ipl_evt_mask1 |= 1UL << l1;
	ci->ci_isources[ipl]->ipl_evt_mask2[l1] |= 1UL << l2;
}
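
/*
 * Worked example, not part of the original file: with iplmask having the
 * bits for IPL 3 and IPL 6 set, i.e. iplmask == (1 << 3) | (1 << 6) == 0x48,
 * ffs(iplmask) returns 4, so ipl ends up as 3 -- the lowest pending IPL --
 * and the event bits are recorded only on that interrupt source, matching
 * the "handled from high to low" invariant described above.
 */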
void
hypervisor_machdep_attach(void)
{
	/* dom0 does not require the arch-dependent P2M translation table */
	if ( !xendomain_is_dom0() ) {
		build_p2m_frame_list_list();
	}
}
/*
 * Generate the p2m_frame_list_list table,
 * needed for guest save/restore
 */
static void
build_p2m_frame_list_list(void)
{
	int fpp; /* number of page (frame) pointers per page */
	unsigned long max_pfn;
	/*
	 * The p2m list is composed of three levels of indirection,
	 * each layer containing MFNs pointing to lower level pages.
	 * The indirection is used to convert a given PFN to its MFN.
	 * Each N level page can point to @fpp (N-1) level pages.
	 * For example, for x86 32bit, we have:
	 * - PAGE_SIZE: 4096 bytes
	 * - fpp: 1024 (one L3 page can address 1024 L2 pages)
	 * An L1 page contains the list of MFNs we are looking for.
	 */
	max_pfn = xen_start_info.nr_pages;
	fpp = PAGE_SIZE / sizeof(paddr_t);

	/* we only need one L3 page */
	l3_p2m_page = kmem_alloc(PAGE_SIZE, KM_NOSLEEP);
	if (l3_p2m_page == NULL)
		panic("could not allocate memory for l3_p2m_page");

	/*
	 * Determine how many L2 pages we need for the mapping.
	 * Each L2 can map a total of @fpp L1 pages.
	 */
	l2_p2m_page_size = howmany(max_pfn, fpp);

	l2_p2m_page = kmem_alloc(l2_p2m_page_size * PAGE_SIZE, KM_NOSLEEP);
	if (l2_p2m_page == NULL)
		panic("could not allocate memory for l2_p2m_page");

	/* We now have L3 and L2 pages ready, update L1 mapping */
	update_p2m_frame_list_list();
}
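
/*
 * Worked example, not part of the original file, using the x86 32-bit
 * numbers from the comment above: with PAGE_SIZE == 4096 and
 * sizeof(paddr_t) == 4, fpp = 4096 / 4 = 1024 frame pointers per page.
 * A domU with xen_start_info.nr_pages == 262144 (1 GiB of pseudo-physical
 * memory) therefore gets l2_p2m_page_size = howmany(262144, 1024) = 256,
 * while a single L3 page is always enough since it can reference up to
 * 1024 L2 entries.
 */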
/*
 * Update the L1 p2m_frame_list_list mapping (during guest boot or resume)
 */
static void
update_p2m_frame_list_list(void)
{
	int i;
	int fpp; /* number of page (frame) pointers per page */
	unsigned long max_pfn;

	max_pfn = xen_start_info.nr_pages;
	fpp = PAGE_SIZE / sizeof(paddr_t);

	for (i = 0; i < l2_p2m_page_size; i++) {
		/*
		 * Each time we start a new L2 page,
		 * store its MFN in the L3 page.
		 */
		if ((i % fpp) == 0) {
			l3_p2m_page[i/fpp] = vtomfn(
				(vaddr_t)&l2_p2m_page[i]);
		}
		/*
		 * We use a shortcut: since the
		 * @xpmap_phys_to_machine_mapping array
		 * already contains the PFN to MFN mapping, we just
		 * set the l2_p2m_page MFN pointer to the MFN of the
		 * corresponding frame of @xpmap_phys_to_machine_mapping.
		 */
		l2_p2m_page[i] = vtomfn((vaddr_t)
			&xpmap_phys_to_machine_mapping[i*fpp]);
	}

	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
	    vtomfn((vaddr_t)l3_p2m_page);
	HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
}
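
/*
 * Illustrative sketch, not part of the original file: given the structure
 * published above, an external save/restore tool can translate a PFN into
 * an MFN with three indexed reads.  The helper below is hypothetical and
 * only spells out the indexing implied by the assignments above; inside
 * this kernel the same answer is available directly from
 * xpmap_phys_to_machine_mapping[].
 */
#if 0
static unsigned long
example_pfn_to_mfn_frames(unsigned long pfn, int fpp)
{
	/*
	 * L3 entry pfn / (fpp * fpp)  -> MFN of an L2 frame
	 * L2 entry (pfn / fpp) % fpp  -> MFN of an L1 frame
	 * L1 entry pfn % fpp          -> MFN of the page itself
	 */
	return xpmap_phys_to_machine_mapping[pfn];
}
#endif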