linux-brain/drivers/kvm/mmu.c
Avi Kivity cea0f0e7ea [PATCH] KVM: MMU: Shadow page table caching
Define a hashtable for caching shadow page tables. Look up the cache on
context switch (cr3 change) or during page faults.

The key to the cache is a combination of
- the guest page table frame number
- the number of paging levels in the guest
   * we can cache real mode, 32-bit mode, pae, and long mode page
     tables simultaneously.  this is useful for smp bootup.
- the guest page table table
   * some kernels use a page as both a page table and a page directory.  this
     allows multiple shadow pages to exist for that page, one per level
- the "quadrant"
   * 32-bit mode page tables span 4MB, whereas a shadow page table spans
     2MB.  similarly, a 32-bit page directory spans 4GB, while a shadow
     page directory spans 1GB.  the quadrant allows caching up to 4 shadow page
     tables for one guest page in one level.
- a "metaphysical" bit
   * for real mode, and for pse pages, there is no guest page table, so set
     the bit to avoid write protecting the page.

Signed-off-by: Avi Kivity <avi@qumranet.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2007-01-05 23:55:24 -08:00

1020 lines
24 KiB
C

/*
* Kernel-based Virtual Machine driver for Linux
*
* This module enables machines with Intel VT-x extensions to run virtual
* machines without emulation or binary translation.
*
* MMU support
*
* Copyright (C) 2006 Qumranet, Inc.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
* Avi Kivity <avi@qumranet.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
*/
#include <linux/types.h>
#include <linux/string.h>
#include <asm/page.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include "vmx.h"
#include "kvm.h"
#define pgprintk(x...) do { printk(x); } while (0)
#define rmap_printk(x...) do { printk(x); } while (0)
#define ASSERT(x) \
if (!(x)) { \
printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
__FILE__, __LINE__, #x); \
}
#define PT64_PT_BITS 9
#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
#define PT32_PT_BITS 10
#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
#define PT_WRITABLE_SHIFT 1
#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
#define PT_USER_MASK (1ULL << 2)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_MASK (1ULL << 5)
#define PT_DIRTY_MASK (1ULL << 6)
#define PT_PAGE_SIZE_MASK (1ULL << 7)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_MASK (1ULL << 63)
#define PT_PAT_SHIFT 7
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
#define PT32_PTE_COPY_MASK \
(PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_GLOBAL_MASK)
#define PT64_PTE_COPY_MASK (PT64_NX_MASK | PT32_PTE_COPY_MASK)
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1)
#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT)
#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1)
#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT))
#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT)
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
#define PT64_LEVEL_BITS 9
#define PT64_LEVEL_SHIFT(level) \
( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
#define PT64_LEVEL_MASK(level) \
(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
#define PT64_INDEX(address, level)\
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
#define PT32_LEVEL_BITS 10
#define PT32_LEVEL_SHIFT(level) \
( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
#define PT32_LEVEL_MASK(level) \
(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
#define PT32_INDEX(address, level)\
(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & PAGE_MASK)
#define PT64_DIR_BASE_ADDR_MASK \
(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)
#define PT64_ROOT_LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1
#define RMAP_EXT 4
struct kvm_rmap_desc {
u64 *shadow_ptes[RMAP_EXT];
struct kvm_rmap_desc *more;
};
static int is_write_protection(struct kvm_vcpu *vcpu)
{
return vcpu->cr0 & CR0_WP_MASK;
}
static int is_cpuid_PSE36(void)
{
return 1;
}
static int is_present_pte(unsigned long pte)
{
return pte & PT_PRESENT_MASK;
}
static int is_writeble_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;
}
static int is_io_pte(unsigned long pte)
{
return pte & PT_SHADOW_IO_MARK;
}
static int is_rmap_pte(u64 pte)
{
return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
== (PT_WRITABLE_MASK | PT_PRESENT_MASK);
}
/*
* Reverse mapping data structures:
*
* If page->private bit zero is zero, then page->private points to the
* shadow page table entry that points to page_address(page).
*
* If page->private bit zero is one, (then page->private & ~1) points
* to a struct kvm_rmap_desc containing more mappings.
*/
static void rmap_add(struct kvm *kvm, u64 *spte)
{
struct page *page;
struct kvm_rmap_desc *desc;
int i;
if (!is_rmap_pte(*spte))
return;
page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
if (!page->private) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
page->private = (unsigned long)spte;
} else if (!(page->private & 1)) {
rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
desc = kzalloc(sizeof *desc, GFP_NOWAIT);
if (!desc)
BUG(); /* FIXME: return error */
desc->shadow_ptes[0] = (u64 *)page->private;
desc->shadow_ptes[1] = spte;
page->private = (unsigned long)desc | 1;
} else {
rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
desc = desc->more;
if (desc->shadow_ptes[RMAP_EXT-1]) {
desc->more = kzalloc(sizeof *desc->more, GFP_NOWAIT);
if (!desc->more)
BUG(); /* FIXME: return error */
desc = desc->more;
}
for (i = 0; desc->shadow_ptes[i]; ++i)
;
desc->shadow_ptes[i] = spte;
}
}
static void rmap_desc_remove_entry(struct page *page,
struct kvm_rmap_desc *desc,
int i,
struct kvm_rmap_desc *prev_desc)
{
int j;
for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
;
desc->shadow_ptes[i] = desc->shadow_ptes[j];
desc->shadow_ptes[j] = 0;
if (j != 0)
return;
if (!prev_desc && !desc->more)
page->private = (unsigned long)desc->shadow_ptes[0];
else
if (prev_desc)
prev_desc->more = desc->more;
else
page->private = (unsigned long)desc->more | 1;
kfree(desc);
}
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
struct page *page;
struct kvm_rmap_desc *desc;
struct kvm_rmap_desc *prev_desc;
int i;
if (!is_rmap_pte(*spte))
return;
page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
if (!page->private) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
BUG();
} else if (!(page->private & 1)) {
rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
if ((u64 *)page->private != spte) {
printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
spte, *spte);
BUG();
}
page->private = 0;
} else {
rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
prev_desc = NULL;
while (desc) {
for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
if (desc->shadow_ptes[i] == spte) {
rmap_desc_remove_entry(page, desc, i,
prev_desc);
return;
}
prev_desc = desc;
desc = desc->more;
}
BUG();
}
}
static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
{
struct kvm_mmu_page *page_head = page_header(page_hpa);
list_del(&page_head->link);
page_head->page_hpa = page_hpa;
list_add(&page_head->link, &vcpu->free_pages);
}
static int is_empty_shadow_page(hpa_t page_hpa)
{
u32 *pos;
u32 *end;
for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32);
pos != end; pos++)
if (*pos != 0)
return 0;
return 1;
}
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
return gfn;
}
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
u64 *parent_pte)
{
struct kvm_mmu_page *page;
if (list_empty(&vcpu->free_pages))
return NULL;
page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
list_del(&page->link);
list_add(&page->link, &vcpu->kvm->active_mmu_pages);
ASSERT(is_empty_shadow_page(page->page_hpa));
page->slot_bitmap = 0;
page->global = 1;
page->multimapped = 0;
page->parent_pte = parent_pte;
return page;
}
static void mmu_page_add_parent_pte(struct kvm_mmu_page *page, u64 *parent_pte)
{
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
int i;
if (!parent_pte)
return;
if (!page->multimapped) {
u64 *old = page->parent_pte;
if (!old) {
page->parent_pte = parent_pte;
return;
}
page->multimapped = 1;
pte_chain = kzalloc(sizeof(struct kvm_pte_chain), GFP_NOWAIT);
BUG_ON(!pte_chain);
INIT_HLIST_HEAD(&page->parent_ptes);
hlist_add_head(&pte_chain->link, &page->parent_ptes);
pte_chain->parent_ptes[0] = old;
}
hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
continue;
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
if (!pte_chain->parent_ptes[i]) {
pte_chain->parent_ptes[i] = parent_pte;
return;
}
}
pte_chain = kzalloc(sizeof(struct kvm_pte_chain), GFP_NOWAIT);
BUG_ON(!pte_chain);
hlist_add_head(&pte_chain->link, &page->parent_ptes);
pte_chain->parent_ptes[0] = parent_pte;
}
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
u64 *parent_pte)
{
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
int i;
if (!page->multimapped) {
BUG_ON(page->parent_pte != parent_pte);
page->parent_pte = NULL;
return;
}
hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
if (!pte_chain->parent_ptes[i])
break;
if (pte_chain->parent_ptes[i] != parent_pte)
continue;
while (i + 1 < NR_PTE_CHAIN_ENTRIES) {
pte_chain->parent_ptes[i]
= pte_chain->parent_ptes[i + 1];
++i;
}
pte_chain->parent_ptes[i] = NULL;
return;
}
BUG();
}
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
gfn_t gfn)
{
unsigned index;
struct hlist_head *bucket;
struct kvm_mmu_page *page;
struct hlist_node *node;
pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
bucket = &vcpu->kvm->mmu_page_hash[index];
hlist_for_each_entry(page, node, bucket, hash_link)
if (page->gfn == gfn && !page->role.metaphysical) {
pgprintk("%s: found role %x\n",
__FUNCTION__, page->role.word);
return page;
}
return NULL;
}
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
unsigned level,
int metaphysical,
u64 *parent_pte)
{
union kvm_mmu_page_role role;
unsigned index;
unsigned quadrant;
struct hlist_head *bucket;
struct kvm_mmu_page *page;
struct hlist_node *node;
role.word = 0;
role.glevels = vcpu->mmu.root_level;
role.level = level;
role.metaphysical = metaphysical;
if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
gfn, role.word);
index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
bucket = &vcpu->kvm->mmu_page_hash[index];
hlist_for_each_entry(page, node, bucket, hash_link)
if (page->gfn == gfn && page->role.word == role.word) {
mmu_page_add_parent_pte(page, parent_pte);
pgprintk("%s: found\n", __FUNCTION__);
return page;
}
page = kvm_mmu_alloc_page(vcpu, parent_pte);
if (!page)
return page;
pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
page->gfn = gfn;
page->role = role;
hlist_add_head(&page->hash_link, bucket);
return page;
}
static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page,
u64 *parent_pte)
{
mmu_page_remove_parent_pte(page, parent_pte);
}
static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
{
int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
struct kvm_mmu_page *page_head = page_header(__pa(pte));
__set_bit(slot, &page_head->slot_bitmap);
}
hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
hpa_t hpa = gpa_to_hpa(vcpu, gpa);
return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
}
hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
struct kvm_memory_slot *slot;
struct page *page;
ASSERT((gpa & HPA_ERR_MASK) == 0);
slot = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
if (!slot)
return gpa | HPA_ERR_MASK;
page = gfn_to_page(slot, gpa >> PAGE_SHIFT);
return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
| (gpa & (PAGE_SIZE-1));
}
hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
{
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
if (gpa == UNMAPPED_GVA)
return UNMAPPED_GVA;
return gpa_to_hpa(vcpu, gpa);
}
static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
int level)
{
u64 *pos;
u64 *end;
ASSERT(vcpu);
ASSERT(VALID_PAGE(page_hpa));
ASSERT(level <= PT64_ROOT_LEVEL && level > 0);
for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
pos != end; pos++) {
u64 current_ent = *pos;
if (is_present_pte(current_ent)) {
if (level != 1)
release_pt_page_64(vcpu,
current_ent &
PT64_BASE_ADDR_MASK,
level - 1);
else
rmap_remove(vcpu->kvm, pos);
}
*pos = 0;
}
kvm_mmu_free_page(vcpu, page_hpa);
}
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
{
int level = PT32E_ROOT_LEVEL;
hpa_t table_addr = vcpu->mmu.root_hpa;
for (; ; level--) {
u32 index = PT64_INDEX(v, level);
u64 *table;
u64 pte;
ASSERT(VALID_PAGE(table_addr));
table = __va(table_addr);
if (level == 1) {
pte = table[index];
if (is_present_pte(pte) && is_writeble_pte(pte))
return 0;
mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
page_header_update_slot(vcpu->kvm, table, v);
table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
PT_USER_MASK;
rmap_add(vcpu->kvm, &table[index]);
return 0;
}
if (table[index] == 0) {
struct kvm_mmu_page *new_table;
gfn_t pseudo_gfn;
pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
>> PAGE_SHIFT;
new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
v, level - 1,
1, &table[index]);
if (!new_table) {
pgprintk("nonpaging_map: ENOMEM\n");
return -ENOMEM;
}
table[index] = new_table->page_hpa | PT_PRESENT_MASK
| PT_WRITABLE_MASK | PT_USER_MASK;
}
table_addr = table[index] & PT64_BASE_ADDR_MASK;
}
}
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
int i;
#ifdef CONFIG_X86_64
if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->mmu.root_hpa;
ASSERT(VALID_PAGE(root));
vcpu->mmu.root_hpa = INVALID_PAGE;
return;
}
#endif
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->mmu.pae_root[i];
ASSERT(VALID_PAGE(root));
root &= PT64_BASE_ADDR_MASK;
vcpu->mmu.pae_root[i] = INVALID_PAGE;
}
vcpu->mmu.root_hpa = INVALID_PAGE;
}
static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
int i;
gfn_t root_gfn;
root_gfn = vcpu->cr3 >> PAGE_SHIFT;
#ifdef CONFIG_X86_64
if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->mmu.root_hpa;
ASSERT(!VALID_PAGE(root));
root = kvm_mmu_get_page(vcpu, root_gfn, 0,
PT64_ROOT_LEVEL, 0, NULL)->page_hpa;
vcpu->mmu.root_hpa = root;
return;
}
#endif
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->mmu.pae_root[i];
ASSERT(!VALID_PAGE(root));
if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL)
root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
else if (vcpu->mmu.root_level == 0)
root_gfn = 0;
root = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, !is_paging(vcpu),
NULL)->page_hpa;
vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
}
vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
}
static void nonpaging_flush(struct kvm_vcpu *vcpu)
{
hpa_t root = vcpu->mmu.root_hpa;
++kvm_stat.tlb_flush;
pgprintk("nonpaging_flush\n");
mmu_free_roots(vcpu);
mmu_alloc_roots(vcpu);
kvm_arch_ops->set_cr3(vcpu, root);
kvm_arch_ops->tlb_flush(vcpu);
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
return vaddr;
}
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
{
int ret;
gpa_t addr = gva;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
for (;;) {
hpa_t paddr;
paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
if (is_error_hpa(paddr))
return 1;
ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
if (ret) {
nonpaging_flush(vcpu);
continue;
}
break;
}
return ret;
}
static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
{
}
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
}
static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->mmu;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = nonpaging_page_fault;
context->inval_page = nonpaging_inval_page;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->free = nonpaging_free;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
mmu_alloc_roots(vcpu);
ASSERT(VALID_PAGE(context->root_hpa));
kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
return 0;
}
static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
++kvm_stat.tlb_flush;
kvm_arch_ops->tlb_flush(vcpu);
}
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
mmu_alloc_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
}
static void mark_pagetable_nonglobal(void *shadow_pte)
{
page_header(__pa(shadow_pte))->global = 0;
}
static inline void set_pte_common(struct kvm_vcpu *vcpu,
u64 *shadow_pte,
gpa_t gaddr,
int dirty,
u64 access_bits)
{
hpa_t paddr;
*shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
if (!dirty)
access_bits &= ~PT_WRITABLE_MASK;
if (access_bits & PT_WRITABLE_MASK) {
struct kvm_mmu_page *shadow;
shadow = kvm_mmu_lookup_page(vcpu, gaddr >> PAGE_SHIFT);
if (shadow)
pgprintk("%s: found shadow page for %lx, marking ro\n",
__FUNCTION__, (gfn_t)(gaddr >> PAGE_SHIFT));
if (shadow)
access_bits &= ~PT_WRITABLE_MASK;
}
if (access_bits & PT_WRITABLE_MASK)
mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
*shadow_pte |= access_bits;
paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
if (!(*shadow_pte & PT_GLOBAL_MASK))
mark_pagetable_nonglobal(shadow_pte);
if (is_error_hpa(paddr)) {
*shadow_pte |= gaddr;
*shadow_pte |= PT_SHADOW_IO_MARK;
*shadow_pte &= ~PT_PRESENT_MASK;
} else {
*shadow_pte |= paddr;
page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
rmap_add(vcpu->kvm, shadow_pte);
}
}
static void inject_page_fault(struct kvm_vcpu *vcpu,
u64 addr,
u32 err_code)
{
kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
}
static inline int fix_read_pf(u64 *shadow_ent)
{
if ((*shadow_ent & PT_SHADOW_USER_MASK) &&
!(*shadow_ent & PT_USER_MASK)) {
/*
* If supervisor write protect is disabled, we shadow kernel
* pages as user pages so we can trap the write access.
*/
*shadow_ent |= PT_USER_MASK;
*shadow_ent &= ~PT_WRITABLE_MASK;
return 1;
}
return 0;
}
static int may_access(u64 pte, int write, int user)
{
if (user && !(pte & PT_USER_MASK))
return 0;
if (write && !(pte & PT_WRITABLE_MASK))
return 0;
return 1;
}
/*
* Remove a shadow pte.
*/
static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
{
hpa_t page_addr = vcpu->mmu.root_hpa;
int level = vcpu->mmu.shadow_root_level;
++kvm_stat.invlpg;
for (; ; level--) {
u32 index = PT64_INDEX(addr, level);
u64 *table = __va(page_addr);
if (level == PT_PAGE_TABLE_LEVEL ) {
rmap_remove(vcpu->kvm, &table[index]);
table[index] = 0;
return;
}
if (!is_present_pte(table[index]))
return;
page_addr = table[index] & PT64_BASE_ADDR_MASK;
if (level == PT_DIRECTORY_LEVEL &&
(table[index] & PT_SHADOW_PS_MARK)) {
table[index] = 0;
release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL);
kvm_arch_ops->tlb_flush(vcpu);
return;
}
}
}
static void paging_free(struct kvm_vcpu *vcpu)
{
nonpaging_free(vcpu);
}
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE
#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE
static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
struct kvm_mmu *context = &vcpu->mmu;
ASSERT(is_pae(vcpu));
context->new_cr3 = paging_new_cr3;
context->page_fault = paging64_page_fault;
context->inval_page = paging_inval_page;
context->gva_to_gpa = paging64_gva_to_gpa;
context->free = paging_free;
context->root_level = level;
context->shadow_root_level = level;
mmu_alloc_roots(vcpu);
ASSERT(VALID_PAGE(context->root_hpa));
kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
(vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
return 0;
}
static int paging64_init_context(struct kvm_vcpu *vcpu)
{
return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
}
static int paging32_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->mmu;
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
context->inval_page = paging_inval_page;
context->gva_to_gpa = paging32_gva_to_gpa;
context->free = paging_free;
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
mmu_alloc_roots(vcpu);
ASSERT(VALID_PAGE(context->root_hpa));
kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
(vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
return 0;
}
static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
if (!is_paging(vcpu))
return nonpaging_init_context(vcpu);
else if (is_long_mode(vcpu))
return paging64_init_context(vcpu);
else if (is_pae(vcpu))
return paging32E_init_context(vcpu);
else
return paging32_init_context(vcpu);
}
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
if (VALID_PAGE(vcpu->mmu.root_hpa)) {
vcpu->mmu.free(vcpu);
vcpu->mmu.root_hpa = INVALID_PAGE;
}
}
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
destroy_kvm_mmu(vcpu);
return init_kvm_mmu(vcpu);
}
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
while (!list_empty(&vcpu->free_pages)) {
struct kvm_mmu_page *page;
page = list_entry(vcpu->free_pages.next,
struct kvm_mmu_page, link);
list_del(&page->link);
__free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
page->page_hpa = INVALID_PAGE;
}
free_page((unsigned long)vcpu->mmu.pae_root);
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
struct page *page;
int i;
ASSERT(vcpu);
for (i = 0; i < KVM_NUM_MMU_PAGES; i++) {
struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];
INIT_LIST_HEAD(&page_header->link);
if ((page = alloc_page(GFP_KERNEL)) == NULL)
goto error_1;
page->private = (unsigned long)page_header;
page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
list_add(&page_header->link, &vcpu->free_pages);
}
/*
* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
* Therefore we need to allocate shadow page tables in the first
* 4GB of memory, which happens to fit the DMA32 zone.
*/
page = alloc_page(GFP_KERNEL | __GFP_DMA32);
if (!page)
goto error_1;
vcpu->mmu.pae_root = page_address(page);
for (i = 0; i < 4; ++i)
vcpu->mmu.pae_root[i] = INVALID_PAGE;
return 0;
error_1:
free_mmu_pages(vcpu);
return -ENOMEM;
}
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
ASSERT(list_empty(&vcpu->free_pages));
return alloc_mmu_pages(vcpu);
}
int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
ASSERT(!list_empty(&vcpu->free_pages));
return init_kvm_mmu(vcpu);
}
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
destroy_kvm_mmu(vcpu);
free_mmu_pages(vcpu);
}
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
struct kvm_mmu_page *page;
list_for_each_entry(page, &kvm->active_mmu_pages, link) {
int i;
u64 *pt;
if (!test_bit(slot, &page->slot_bitmap))
continue;
pt = __va(page->page_hpa);
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
if (pt[i] & PT_WRITABLE_MASK) {
rmap_remove(kvm, &pt[i]);
pt[i] &= ~PT_WRITABLE_MASK;
}
}
}