linux-brain/arch/x86/xen/p2m.c

529 lines
13 KiB
C
Raw Normal View History

/*
* Xen leaves the responsibility for maintaining p2m mappings to the
* guests themselves, but it must also access and update the p2m array
* during suspend/resume when all the pages are reallocated.
*
* The p2m table is logically a flat array, but we implement it as a
* three-level tree to allow the address space to be sparse.
*
* Xen
* |
* p2m_top p2m_top_mfn
* / \ / \
* p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
* / \ / \ / /
* p2m p2m p2m p2m p2m p2m p2m ...
*
* The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
*
* The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
* maximum representable pseudo-physical address space is:
* P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
*
* P2M_PER_PAGE depends on the architecture, as a mfn is always
* unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
* 512 and 1024 entries respectively.
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <asm/cache.h>
#include <asm/setup.h>
#include <asm/xen/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include "xen-ops.h"
static void __init m2p_override_init(void);
unsigned long xen_max_p2m_pfn __read_mostly;
#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
/* Placeholders for holes in the address space */
static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
static inline unsigned p2m_top_index(unsigned long pfn)
{
BUG_ON(pfn >= MAX_P2M_PFN);
return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
}
static inline unsigned p2m_mid_index(unsigned long pfn)
{
return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}
static inline unsigned p2m_index(unsigned long pfn)
{
return pfn % P2M_PER_PAGE;
}
static void p2m_top_init(unsigned long ***top)
{
unsigned i;
for (i = 0; i < P2M_TOP_PER_PAGE; i++)
top[i] = p2m_mid_missing;
}
static void p2m_top_mfn_init(unsigned long *top)
{
unsigned i;
for (i = 0; i < P2M_TOP_PER_PAGE; i++)
top[i] = virt_to_mfn(p2m_mid_missing_mfn);
}
static void p2m_top_mfn_p_init(unsigned long **top)
{
unsigned i;
for (i = 0; i < P2M_TOP_PER_PAGE; i++)
top[i] = p2m_mid_missing_mfn;
}
static void p2m_mid_init(unsigned long **mid)
{
unsigned i;
for (i = 0; i < P2M_MID_PER_PAGE; i++)
mid[i] = p2m_missing;
}
static void p2m_mid_mfn_init(unsigned long *mid)
{
unsigned i;
for (i = 0; i < P2M_MID_PER_PAGE; i++)
mid[i] = virt_to_mfn(p2m_missing);
}
static void p2m_init(unsigned long *p2m)
{
unsigned i;
for (i = 0; i < P2M_MID_PER_PAGE; i++)
p2m[i] = INVALID_P2M_ENTRY;
}
/*
* Build the parallel p2m_top_mfn and p2m_mid_mfn structures
*
* This is called both at boot time, and after resuming from suspend:
* - At boot time we're called very early, and must use extend_brk()
* to allocate memory.
*
* - After resume we're called from within stop_machine, but the mfn
* tree should alreay be completely allocated.
*/
void xen_build_mfn_list_list(void)
{
unsigned long pfn;
/* Pre-initialize p2m_top_mfn to be completely missing */
if (p2m_top_mfn == NULL) {
p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_mfn_init(p2m_mid_missing_mfn);
p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_top_mfn_p_init(p2m_top_mfn_p);
p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_top_mfn_init(p2m_top_mfn);
} else {
/* Reinitialise, mfn's all change after migration */
p2m_mid_mfn_init(p2m_mid_missing_mfn);
}
for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
unsigned mididx = p2m_mid_index(pfn);
unsigned long **mid;
unsigned long *mid_mfn_p;
mid = p2m_top[topidx];
mid_mfn_p = p2m_top_mfn_p[topidx];
/* Don't bother allocating any mfn mid levels if
* they're just missing, just update the stored mfn,
* since all could have changed over a migrate.
*/
if (mid == p2m_mid_missing) {
BUG_ON(mididx);
BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
continue;
}
if (mid_mfn_p == p2m_mid_missing_mfn) {
/*
* XXX boot-time only! We should never find
* missing parts of the mfn tree after
* runtime. extend_brk() will BUG if we call
* it too late.
*/
mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_mfn_init(mid_mfn_p);
p2m_top_mfn_p[topidx] = mid_mfn_p;
}
p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
}
}
void xen_setup_mfn_list_list(void)
{
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
virt_to_mfn(p2m_top_mfn);
HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
}
/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
unsigned long pfn;
xen_max_p2m_pfn = max_pfn;
p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_init(p2m_missing);
p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_init(p2m_mid_missing);
p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_top_init(p2m_top);
/*
* The domain builder gives us a pre-constructed p2m array in
* mfn_list for all the pages initially given to us, so we just
* need to graft that into our tree structure.
*/
for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
unsigned mididx = p2m_mid_index(pfn);
if (p2m_top[topidx] == p2m_mid_missing) {
unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_init(mid);
p2m_top[topidx] = mid;
}
xen: p2m: correctly initialize partial p2m leaf After changing the p2m mapping to a tree by commit 58e05027b530ff081ecea68e38de8d59db8f87e0 xen: convert p2m to a 3 level tree and trying to boot a DomU with 615MB of memory, the following crash was observed in the dump: kernel direct mapping tables up to 26f00000 @ 1ec4000-1fff000 BUG: unable to handle kernel NULL pointer dereference at (null) IP: [<c0107397>] xen_set_pte+0x27/0x60 *pdpt = 0000000000000000 *pde = 0000000000000000 Adding further debug statements showed that when trying to set up pfn=0x26700 the returned mapping was invalid. pfn=0x266ff calling set_pte(0xc1fe77f8, 0x6b3003) pfn=0x26700 calling set_pte(0xc1fe7800, 0x3) Although the last_pfn obtained from the startup info is 0x26700, which should in turn not be hit, the additional 8MB which are added as extra memory normally seem to be ok. This lead to looking into the initial p2m tree construction, which uses the smaller value and assuming that there is other code handling the extra memory. When the p2m tree is set up, the leaves are directly pointed to the array which the domain builder set up. But if the mapping is not on a boundary that fits into one p2m page, this will result in the last leaf being only partially valid. And as the invalid entries are not initialized in that case, things go badly wrong. I am trying to fix that by checking whether the current leaf is a complete map and if not, allocate a completely new page and copy only the valid pointers there. This may not be the most efficient or elegant solution, but at least it seems to allow me booting DomUs with memory assignments all over the range. BugLink: http://bugs.launchpad.net/bugs/686692 [v2: Redid a bit of commit wording and fixed a compile warning] Signed-off-by: Stefan Bader <stefan.bader@canonical.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-01-20 23:38:23 +09:00
/*
* As long as the mfn_list has enough entries to completely
* fill a p2m page, pointing into the array is ok. But if
* not the entries beyond the last pfn will be undefined.
* And guessing that the 'what-ever-there-is' does not take it
* too kindly when changing it to invalid markers, a new page
* is allocated, initialized and filled with the valid part.
*/
if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
unsigned long p2midx;
unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_init(p2m);
for (p2midx = 0; pfn + p2midx < max_pfn; p2midx++) {
p2m[p2midx] = mfn_list[pfn + p2midx];
}
p2m_top[topidx][mididx] = p2m;
} else
p2m_top[topidx][mididx] = &mfn_list[pfn];
}
m2p_override_init();
}
unsigned long get_phys_to_machine(unsigned long pfn)
{
unsigned topidx, mididx, idx;
if (unlikely(pfn >= MAX_P2M_PFN))
return INVALID_P2M_ENTRY;
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
idx = p2m_index(pfn);
return p2m_top[topidx][mididx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
static void *alloc_p2m_page(void)
{
return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
}
static void free_p2m_page(void *p)
{
free_page((unsigned long)p);
}
/*
* Fully allocate the p2m structure for a given pfn. We need to check
* that both the top and mid levels are allocated, and make sure the
* parallel mfn tree is kept in sync. We may race with other cpus, so
* the new pages are installed with cmpxchg; if we lose the race then
* simply free the page we allocated and use the one that's there.
*/
static bool alloc_p2m(unsigned long pfn)
{
unsigned topidx, mididx;
unsigned long ***top_p, **mid;
unsigned long *top_mfn_p, *mid_mfn;
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
top_p = &p2m_top[topidx];
mid = *top_p;
if (mid == p2m_mid_missing) {
/* Mid level is missing, allocate a new one */
mid = alloc_p2m_page();
if (!mid)
return false;
p2m_mid_init(mid);
if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
free_p2m_page(mid);
}
top_mfn_p = &p2m_top_mfn[topidx];
mid_mfn = p2m_top_mfn_p[topidx];
BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
if (mid_mfn == p2m_mid_missing_mfn) {
/* Separately check the mid mfn level */
unsigned long missing_mfn;
unsigned long mid_mfn_mfn;
mid_mfn = alloc_p2m_page();
if (!mid_mfn)
return false;
p2m_mid_mfn_init(mid_mfn);
missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
mid_mfn_mfn = virt_to_mfn(mid_mfn);
if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
free_p2m_page(mid_mfn);
else
p2m_top_mfn_p[topidx] = mid_mfn;
}
if (p2m_top[topidx][mididx] == p2m_missing) {
/* p2m leaf page is missing */
unsigned long *p2m;
p2m = alloc_p2m_page();
if (!p2m)
return false;
p2m_init(p2m);
if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
free_p2m_page(p2m);
else
mid_mfn[mididx] = virt_to_mfn(p2m);
}
return true;
}
/* Try to install p2m mapping; fail if intermediate bits missing */
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
unsigned topidx, mididx, idx;
if (unlikely(pfn >= MAX_P2M_PFN)) {
BUG_ON(mfn != INVALID_P2M_ENTRY);
return true;
}
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
idx = p2m_index(pfn);
if (p2m_top[topidx][mididx] == p2m_missing)
return mfn == INVALID_P2M_ENTRY;
p2m_top[topidx][mididx][idx] = mfn;
return true;
}
bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
return true;
}
if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
if (!alloc_p2m(pfn))
return false;
if (!__set_phys_to_machine(pfn, mfn))
return false;
}
return true;
}
#define M2P_OVERRIDE_HASH_SHIFT 10
#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
static DEFINE_SPINLOCK(m2p_override_lock);
static void __init m2p_override_init(void)
{
unsigned i;
m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
sizeof(unsigned long));
for (i = 0; i < M2P_OVERRIDE_HASH; i++)
INIT_LIST_HEAD(&m2p_overrides[i]);
}
static unsigned long mfn_hash(unsigned long mfn)
{
return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
}
/* Add an MFN override for a particular page */
int m2p_add_override(unsigned long mfn, struct page *page)
{
unsigned long flags;
unsigned long pfn;
unsigned long address;
unsigned level;
pte_t *ptep = NULL;
pfn = page_to_pfn(page);
if (!PageHighMem(page)) {
address = (unsigned long)__va(pfn << PAGE_SHIFT);
ptep = lookup_address(address, &level);
if (WARN(ptep == NULL || level != PG_LEVEL_4K,
"m2p_add_override: pfn %lx not mapped", pfn))
return -EINVAL;
}
page->private = mfn;
page->index = pfn_to_mfn(pfn);
__set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
if (!PageHighMem(page))
/* Just zap old mapping for now */
pte_clear(&init_mm, address, ptep);
spin_lock_irqsave(&m2p_override_lock, flags);
list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
spin_unlock_irqrestore(&m2p_override_lock, flags);
return 0;
}
int m2p_remove_override(struct page *page)
{
unsigned long flags;
unsigned long mfn;
unsigned long pfn;
unsigned long address;
unsigned level;
pte_t *ptep = NULL;
pfn = page_to_pfn(page);
mfn = get_phys_to_machine(pfn);
if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
return -EINVAL;
if (!PageHighMem(page)) {
address = (unsigned long)__va(pfn << PAGE_SHIFT);
ptep = lookup_address(address, &level);
if (WARN(ptep == NULL || level != PG_LEVEL_4K,
"m2p_remove_override: pfn %lx not mapped", pfn))
return -EINVAL;
}
spin_lock_irqsave(&m2p_override_lock, flags);
list_del(&page->lru);
spin_unlock_irqrestore(&m2p_override_lock, flags);
__set_phys_to_machine(pfn, page->index);
if (!PageHighMem(page))
set_pte_at(&init_mm, address, ptep,
pfn_pte(pfn, PAGE_KERNEL));
/* No tlb flush necessary because the caller already
* left the pte unmapped. */
return 0;
}
struct page *m2p_find_override(unsigned long mfn)
{
unsigned long flags;
struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
struct page *p, *ret;
ret = NULL;
spin_lock_irqsave(&m2p_override_lock, flags);
list_for_each_entry(p, bucket, lru) {
if (p->private == mfn) {
ret = p;
break;
}
}
spin_unlock_irqrestore(&m2p_override_lock, flags);
return ret;
}
unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
{
struct page *p = m2p_find_override(mfn);
unsigned long ret = pfn;
if (p)
ret = page_to_pfn(p);
return ret;
}
EXPORT_SYMBOL_GPL(m2p_find_override_pfn);