赞
踩
每个逻辑处理器都有自己的local apic,当guest读取local apic寄存器时将返回物理cpu的值,当guest写时将写入物理寄存器。例如可以通过ICR寄存器向其他cpu发送IPI.Local APIC虚拟化有两种方法:
(1) EPT机制管理MMIO 或MSR-bitmap方式
(2) Intel VT 的virtual-APIC方式; 本节将重点讨论第二种方式
4.2.1 Local APIC 模块初始化
(1) 用户空间初始化
pc_new_cpu ==> apic_init(env,env->cpuid_apic_id) ==> qdev_create(NULL, "kvm-apic");
/*
 * Type description for the "kvm-apic" device (the type name passed to
 * qdev_create()). Its parent type is TYPE_APIC_COMMON, instances are sized
 * as APICCommonState, and class setup is done by kvm_apic_class_init.
 */
static TypeInfo kvm_apic_info= {
.name = "kvm-apic",
.parent = TYPE_APIC_COMMON,
.instance_size = sizeof(APICCommonState),
.class_init = kvm_apic_class_init,
};
/*
 * Instance initialisation for the "kvm-apic" device (hw/apic.c per the
 * article).
 *
 * Sets up s->io_memory as an I/O memory region named "kvm-apic-msi" of
 * MSI_SPACE_SIZE bytes, served by kvm_apic_io_ops with s as the opaque
 * pointer. This region handles the MSI interrupt case.
 */
static void kvm_apic_init(APICCommonState *s)
{
	memory_region_init_io(&s->io_memory, &kvm_apic_io_ops, s, "kvm-apic-msi",
			      MSI_SPACE_SIZE);
}
用于处理msi中断的case, 5.2节将讨论msi interrupt.
apic_init_common(hw\apic-common.c) ==> sysbus_create_simple("kvmvapic", -1, NULL);
vapic_init(hw\kvm-vapic.c==> memory_region_init_io(&s->io, &vapic_ops, s,"kvmvapic", 2);)
sysbus_add_io(dev, VAPIC_IO_PORT,&s->io);
sysbus_init_ioports(dev,VAPIC_IO_PORT, 2); //处理port 0x7E的操作
(2) 内核空间初始化
vmx_create_vcpu ==》 kvm_vcpu_init ==》 kvm_arch_vcpu_init==》 kvm_create_lapic
a. 建立一个hrtimer,回调为apic_timer_fn
b. apic_base 默认设为0xfee00000
c. kvm_lapic_reset设置虚拟寄存器的default值
vmx_create_vcpu ==》
if(vm_need_virtualize_apic_accesses(kvm)) { // flexpriority_enabled默认为1
err = alloc_apic_access_page(kvm);
}
alloc_apic_access_page:
kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
kvm_userspace_mem.flags = 0;
kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
kvm_userspace_mem.memory_size = PAGE_SIZE;
r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); //单独用一个slot来管理apic accesspage
page = gfn_to_page(kvm, 0xfee00);
kvm->arch.apic_access_page = page;
(3) virtual-apic相关的VMCS寄存器设置
kvm_lapic_reset ==》
if(kvm_vcpu_is_bsp(vcpu))
kvm_lapic_set_base(vcpu,
vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
kvm_lapic_set_base ==》 kvm_x86_ops->set_virtual_x2apic_mode
/*
 * Select how the guest reaches its local APIC by toggling two mutually
 * exclusive bits in SECONDARY_VM_EXEC_CONTROL:
 *
 *   set == true   enable SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE (MSR access)
 *                 and disable SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 *   set == false  enable SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES (the
 *                 memory-mapped apic-access page) and disable x2APIC mode.
 *
 * NOTE(review): this is an abridged excerpt; any precondition checks present
 * in the full kernel function are not shown here.
 */
static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
{
	u32 sec_exec_control;

	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);

	if (set) {
		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
	} else {
		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
	}

	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
}
local apic两种访问方式的选择memory 和msr; 本节将只分析memory方式
当SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE设为1时,通过msr来访问800h-8ffh范围内的local apic register; SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES为1时将启用apic-access page.
setup_vmcs_config ==> 设置如下bit
SECONDARY_EXEC_APIC_REGISTER_VIRT :启用virtual-page access访问local apic寄存器
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY: 对pending vinterrupt进行评估,允许后通过 guest-idt 提交
CPU_BASED_VM_EXEC_CONTROL寄存器CPU_BASED_TPR_SHADOW设为1,当guest访问apic-access page 80h时将访问到VTPR.
PIN_BASED_VM_EXEC_CONTROL寄存器PIN_BASED_POSTED_INTR设为1,当处理器接收到通知的外部中断时不产生vm-exit,而是将posted-interrupt descriptor复制到virtual-apic page内的VIRR,形成虚拟中断请求。
vmx_vcpu_reset ==>
a. kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
b. if(cpu_has_vmx_tpr_shadow()) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
if (vm_need_tpr_shadow(vmx->vcpu.kvm))
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
__pa(vmx->vcpu.arch.apic->regs));
vmcs_write32(TPR_THRESHOLD,0);
}
c. vmcs_write64(APIC_ACCESS_ADDR,
page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
virtual-apic page是apic-page的影子页面;当"virtual interrupt delivery"为1时80H, B0H, VICR可以进行写访问,但不能读,其他寄存器读不可访问。
引入apic-page让处理器自动监控local apic
d. if(vmx_vm_has_apicv(vcpu->kvm))
memset(&vmx->pi_desc,0, sizeof(struct pi_desc));
当guest以线性地址访问apic-access page时,实际访问的是virtual-apic page;当以guest-physical访问apic-access page时产生vm-exit;
4.2.2 Local APIC access VM-Exit及其处理
(1) APIC_Base 的维护
由于host 的cpu和guest vcpu共享apic_base, 所以当vm-entry时需要重新恢复guest apic_base的值
用户空间:VM-Entry: kvm_arch_put_registers ==> kvm_put_apic ==>
kvm_put_apic_state(apic,&kapic);
return kvm_vcpu_ioctl(env, KVM_SET_LAPIC,&kapic);
内核空间:kvm_arch_vcpu_ioctl case KVM_SET_LAPIC
kvm_vcpu_ioctl_set_lapic ==》 kvm_apic_post_state_restore==> kvm_lapic_set_base
(2) VM-Exit for apic access
读写apic-access page越界或跨寄存器边界时会产生vm-exit;guest 以guest-physical address(GPA)访问时直接发生vm-exit;写入local apic version, isr, tmr, irr等寄存器时,也会发生apic-access vm-exit
[EXIT_REASON_APIC_ACCESS] = handle_apic_access, //访问apic-access page产生
static int handle_apic_access(structkvm_vcpu *vcpu)
{
if (likely(fasteoi)) {
unsigned long exit_qualification =vmcs_readl(EXIT_QUALIFICATION);
int access_type, offset;
access_type = exit_qualification & APIC_ACCESS_TYPE;
offset = exit_qualification & APIC_ACCESS_OFFSET;//apic-page内偏移
if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
(offset ==APIC_EOI)) {
kvm_lapic_set_eoi(vcpu); // call apic_reg_write(vcpu->arch.apic,APIC_EOI, 0);
skip_emulated_instruction(vcpu);
return 1;
}
}
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
emulate_instruction 会调用到read_write_emulator_ops ==》
vcpu_mmio_write ==》kvm_iodevice_write ==》
static const structkvm_io_device_ops apic_mmio_ops = {
.read = apic_mmio_read,
.write = apic_mmio_write,
};
apic_mmio_write ==》apic_reg_write(apic, offset & 0xff0, val);
/*
 * Emulate a guest write of val to the local APIC register at offset reg.
 *
 * Abridged excerpt: only the APIC_ID and APIC_EOI cases are shown. The
 * remaining registers and the return-value handling are omitted.
 */
static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
{
	/* ... (abridged) ... */
	switch (reg) {
	case APIC_ID:		/* Local APIC ID */
		if (!apic_x2apic_mode(apic))
			kvm_apic_set_id(apic, val >> 24);
		break;		/* an ID write must not fall through into EOI */
	case APIC_EOI:
		apic_set_eoi(apic);
		break;
	/* ... (remaining registers abridged) ... */
	}
	/* ... (abridged) ... */
}
/*
 * Store a new local APIC ID: the 8-bit id is shifted into bits 31:24 of the
 * APIC_ID register, then recalculate_apic_map() is called for the owning VM.
 */
static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
{
	apic_set_reg(apic, APIC_ID, id << 24);
	recalculate_apic_map(apic->vcpu->kvm);
}
/*
 * Write the 32-bit value val at byte offset reg_off inside apic->regs.
 */
static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
{
	/* stored at the matching position in the virtual-APIC page */
	*((u32 *) (apic->regs + reg_off)) = val;
}
[EXIT_REASON_APIC_WRITE] = handle_apic_write, //写入apic-access page
handle_apic_write ==> kvm_apic_write_nodecode(vcpu,offset);
/*
 * Handle an APIC-write VM exit without decoding the guest instruction.
 *
 * The offset is masked with 0xff0, the current 32-bit value of that register
 * is read back with apic_reg_read(), and the value is then passed through
 * apic_reg_write() so the normal write emulation runs.
 */
void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
{
	u32 val = 0;

	offset &= 0xff0;

	apic_reg_read(vcpu->arch.apic, offset, 4, &val);
	apic_reg_write(vcpu->arch.apic, offset, val);
}
(3)主要寄存器的虚拟化
1. EOI
/*
 * Emulate an EOI write: retire the highest in-service vector, refresh the
 * PPR, forward the EOI to the IOAPIC and request event re-evaluation.
 *
 * Returns the vector that was found by apic_find_highest_isr().
 * Abridged excerpt: some statements after the lookup are omitted.
 */
static int apic_set_eoi(struct kvm_lapic *apic)
{
	int vector = apic_find_highest_isr(apic);

	/* ... (abridged) ... */

	/*
	 * apic_clear_isr() calls
	 * kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)),
	 * which is vmx_hwapic_isr_update.
	 */
	apic_clear_isr(vector, apic);
	apic_update_ppr(apic);

	kvm_ioapic_send_eoi(apic, vector);
	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
	return vector;
}
/*
 * Update the VMCS guest interrupt status register.
 *
 * The upper 8 bits of GUEST_INTR_STATUS are compared with isr; when they
 * differ, the upper byte is replaced with isr and the lower byte is kept.
 * Abridged excerpt: local declarations and early checks are omitted.
 */
static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
{
	/* ... (abridged) ... */

	status = vmcs_read16(GUEST_INTR_STATUS);
	old = status >> 8;
	if (isr != old) {
		status &= 0xff;
		status |= isr << 8;
		vmcs_write16(GUEST_INTR_STATUS, status);
	}
}
GUEST_INTR_STATUS 该寄存器分为两个8bit, RVI记录最高优先级的 virtual-interrupt向量号,
SVI记录正在执行的virtual-interrupt向量号; EOI命令完成后将返回到之前被中断的服务继续执行, 因此新的SVI 等于之前被中断的服务向量号. apic_find_highest_isr返回之前的最优先的中断向量号
2 TPR
apic_reg_write ==> case APIC_TASKPRI:
report_tpr_access(apic, true);
apic_set_tpr(apic, val & 0xff);
/*
 * Record a guest TPR access so it can be reported to user space.
 *
 * Raises KVM_REQ_REPORT_TPR_ACCESS on the vcpu and fills run->tpr_access
 * with the current guest RIP and whether the access was a write.
 * Abridged excerpt: the setup of the vcpu and run locals is omitted.
 */
static void __report_tpr_access(struct kvm_lapic *apic, bool write)
{
	/* ... (abridged) ... */

	kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
	run->tpr_access.rip = kvm_rip_read(vcpu);
	run->tpr_access.is_write = write;
}
vcpu_enter_guest
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
goto out;
}
回到用户态后会调用kvm_handle_tpr_access来处理vm-exit
/*
 * Store a new task-priority value in APIC_TASKPRI and recompute the
 * processor priority, which depends on it.
 */
static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
{
	apic_set_reg(apic, APIC_TASKPRI, tpr);
	apic_update_ppr(apic);
}
/*
 * Recompute the processor-priority register (APIC_PROCPRI) from the task
 * priority and the highest in-service vector:
 *
 *   if TPR[7:4] >= ISRV[7:4]   PPR = TPR & 0xff
 *   else                       PPR = ISRV & 0xf0
 *
 * ISRV is 0 when no vector is in service. The register is only rewritten
 * when the value changes, and KVM_REQ_EVENT is raised when the priority
 * drops.
 * Abridged excerpt: local variable declarations are omitted.
 */
static void apic_update_ppr(struct kvm_lapic *apic)
{
	/* ... (abridged) ... */

	old_ppr = kvm_apic_get_reg(apic, APIC_PROCPRI);
	tpr = kvm_apic_get_reg(apic, APIC_TASKPRI);
	isr = apic_find_highest_isr(apic);
	isrv = (isr != -1) ? isr : 0;

	if ((tpr & 0xf0) >= (isrv & 0xf0))
		ppr = tpr & 0xff;
	else
		ppr = isrv & 0xf0;

	if (old_ppr != ppr) {
		apic_set_reg(apic, APIC_PROCPRI, ppr);
		if (ppr < old_ppr)
			kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
	}
}
ppr(processor priority register)是只读寄存器,能触发它更新的是TPR, EOI和VM-Entry
其设置如下所示
if (VTPR[7:4] >= SVI[7:4])
VPPR = VTPR & 0xFF;
else
VPPR = SVI & 0XF0;
VPPR[31:8] = 0;
3. Self-IPI
apic_reg_write ==>
apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
apic_send_ipi(apic);
/*
 * Send the IPI described by the ICR.
 *
 * Decodes the low ICR word (APIC_ICR) into a struct kvm_lapic_irq, takes the
 * destination from the high word (APIC_ICR2) and hands the request to
 * kvm_irq_delivery_to_apic(). In x2APIC mode the high word is used as the
 * destination id directly; otherwise it is extracted with
 * GET_APIC_DEST_FIELD().
 */
static void apic_send_ipi(struct kvm_lapic *apic)
{
	u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
	u32 icr_high = kvm_apic_get_reg(apic, APIC_ICR2);
	struct kvm_lapic_irq irq;

	irq.vector = icr_low & APIC_VECTOR_MASK;
	irq.delivery_mode = icr_low & APIC_MODE_MASK;
	irq.dest_mode = icr_low & APIC_DEST_MASK;
	irq.level = icr_low & APIC_INT_ASSERT;
	irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
	irq.shorthand = icr_low & APIC_SHORT_MASK;
	if (apic_x2apic_mode(apic))
		irq.dest_id = icr_high;
	else
		irq.dest_id = GET_APIC_DEST_FIELD(icr_high);

	trace_kvm_apic_ipi(icr_low, irq.dest_id);

	kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
}
调用kvm_irq_delivery_to_apic完成irq的delivery, 下一节将分析该过程。
4.2.3 虚拟中断delivery
/*
 * Deliver an interrupt request to the local APIC(s) it addresses.
 *
 * @kvm:      VM whose vcpus are searched
 * @src:      sending local APIC, or NULL
 * @irq:      interrupt description (destination, mode, vector, ...)
 * @dest_map: optional bitmap passed on to kvm_apic_set_irq()
 *
 * The fast path is tried first. Otherwise every vcpu is examined: for
 * non-lowest-priority delivery each matching vcpu receives the interrupt and
 * the results are summed; for lowest-priority delivery the matching, enabled
 * vcpu that kvm_apic_compare_prio() ranks lowest is chosen and receives it.
 *
 * Returns -1 when no vcpu matched, otherwise the accumulated result of
 * kvm_apic_set_irq().
 */
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
		struct kvm_lapic_irq *irq, unsigned long *dest_map)
{
	int i, r = -1;
	struct kvm_vcpu *vcpu, *lowest = NULL;

	/* a lowest-priority request to dest_id 0xff with dest_mode 0 becomes fixed */
	if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
			kvm_is_dm_lowest_prio(irq)) {
		irq->delivery_mode = APIC_DM_FIXED;
	}

	if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
		return r;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		/* skip vcpus for which kvm_apic_present() is false */
		if (!kvm_apic_present(vcpu))
			continue;

		/* select the vcpus addressed by the destination */
		if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
					irq->dest_id, irq->dest_mode))
			continue;

		if (!kvm_is_dm_lowest_prio(irq)) {
			if (r < 0)
				r = 0;
			r += kvm_apic_set_irq(vcpu, irq, dest_map);
		} else if (kvm_lapic_enabled(vcpu)) {
			if (!lowest)
				lowest = vcpu;
			else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
				lowest = vcpu;
		}
	}

	if (lowest)
		r = kvm_apic_set_irq(lowest, irq, dest_map);

	return r;
}
kvm_irq_delivery_to_apic_fast对于SELF-IPI立即调用kvm_apic_set_irq(src->vcpu, irq, dest_map);而不需要搜索了。否则遍历vcpu, 接着判断virtual interrupt是否允许,允许条件如下: RVI[7:4] > VPPR[7:4]
int kvm_apic_set_irq(structkvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
unsigned long *dest_map)
{
struct kvm_lapic *apic = vcpu->arch.apic;
return __apic_accept_irq(apic, irq->delivery_mode,irq->vector,
irq->level, irq->trig_mode, dest_map);
}
__apic_accept_irq==> 本节分析2个case
case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++;
case APIC_DM_FIXED: // delivery 由vector指定的irq到targeprocess
。。。。。。
if (dest_map)
__set_bit(vcpu->vcpu_id, dest_map);
if (kvm_x86_ops->deliver_posted_interrupt)
kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
else {
apic_set_irr(vector, apic); //设置中断irr_pending = true
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
break;
case APIC_DM_STARTUP: //发送"start-up" IPI
result = 1;
apic->sipi_vector = vector;
/* make sure sipi_vector is visible for the receiver */
smp_wmb();
set_bit(KVM_APIC_SIPI, &apic->pending_events);
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
break;
kvm_vcpu_kick(vcpu);让目标cpu调度执行
.deliver_posted_interrupt =vmx_deliver_posted_interrupt
/*
 * Deliver a vector through the posted-interrupt descriptor.
 *
 * The vector's bit is set with pi_test_and_set_pir(); if it was already set
 * there is nothing more to do. Otherwise the ON bit is set via
 * pi_test_and_set_on() and KVM_REQ_EVENT is raised. If ON was previously
 * clear and the vcpu is currently in guest mode, a POSTED_INTR_VECTOR IPI is
 * sent to the CPU the vcpu is running on.
 */
static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int r;

	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
		return;

	r = pi_test_and_set_on(&vmx->pi_desc);
	kvm_make_request(KVM_REQ_EVENT, vcpu);
	if (!r && (vcpu->mode == IN_GUEST_MODE))
		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
				POSTED_INTR_VECTOR);
}
struct apic __read_mostly*apic = &apic_flat; //arch/x86/kernel/apic_flat_64.c
vcpu_enter_guest==>
if (kvm_check_request(KVM_REQ_EVENT,vcpu) || req_int_win) {
kvm_apic_accept_events(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
r = 1;
goto out;
}
if (inject_pending_event(vcpu, req_int_win) != 0)
req_immediate_exit = true;
/* enable NMI/IRQ window open exits if needed */
else if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
kvm_x86_ops->enable_irq_window(vcpu); //enable_irq_window注入中断
if (kvm_lapic_enabled(vcpu)) {
if (kvm_x86_ops->hwapic_irr_update)
kvm_x86_ops->hwapic_irr_update(vcpu,
kvm_lapic_find_highest_irr(vcpu));
update_cr8_intercept(vcpu);//64bit cpu支持CR8访问TPR
kvm_lapic_sync_to_vapic(vcpu);
}
}
kvm_cpu_has_injectable_intr==>kvm_apic_has_interrupt
判断是否有apic中断irr_pending 为true
/*
 * Process pending INIT and SIPI events for a vcpu.
 *
 * The pending-event word is fetched and cleared with xchg(). On INIT the
 * local APIC and the vcpu are reset; the BSP becomes RUNNABLE while other
 * vcpus enter INIT_RECEIVED. A SIPI is honoured only in the INIT_RECEIVED
 * state: the stored SIPI vector is delivered and the vcpu becomes RUNNABLE.
 * Abridged excerpt: local declarations and early checks are omitted.
 */
void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
{
	/* ... (abridged) ... */

	pe = xchg(&apic->pending_events, 0);

	if (test_bit(KVM_APIC_INIT, &pe)) {
		kvm_lapic_reset(vcpu);
		kvm_vcpu_reset(vcpu);
		if (kvm_vcpu_is_bsp(apic->vcpu))
			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
		else
			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
	}
	if (test_bit(KVM_APIC_SIPI, &pe) &&
	    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
		/* evaluate pending_events before reading the vector */
		smp_rmb();
		sipi_vector = apic->sipi_vector;
		pr_debug("vcpu %d received sipi with vector # %x\n",
			 vcpu->vcpu_id, sipi_vector);
		kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
	}
}
voidkvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector)
{
kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
cs.selector = vector << 8;
cs.base = vector << 12;
kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
kvm_rip_write(vcpu, 0); //更新RIP,到VM-Entry时将在新的rip执行
}
4.2.4 IOAPIC 虚拟化
(1) 初始化
kvm_ioapic_init ==>
a) kvm_ioapic_reset
b) kvm_iodevice_init(&ioapic->dev,&ioapic_mmio_ops);
c) kvm_io_bus_register_dev(kvm,KVM_MMIO_BUS, ioapic->base_address,
IOAPIC_MEM_LENGTH,&ioapic->dev);
/*
 * Put the virtual IOAPIC into its reset state: the mask bit is set on every
 * redirection-table entry, the base address returns to
 * IOAPIC_DEFAULT_BASE_ADDRESS, ioregsel/irr/id are cleared, RTC EOI tracking
 * is reset and the handled-vector set is recomputed.
 */
static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
{
	int i;

	for (i = 0; i < IOAPIC_NUM_PINS; i++)
		ioapic->redirtbl[i].fields.mask = 1;

	ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
	ioapic->ioregsel = 0;
	ioapic->irr = 0;
	ioapic->id = 0;
	rtc_irq_eoi_tracking_reset(ioapic);
	update_handled_vectors(ioapic);
}
ioapic最重要的就是redirection table register, 而redirtbl用于存储该寄存器: union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
unionkvm_ioapic_redirect_entry {
u64 bits;
struct {
u8 vector;
u8 delivery_mode:3;
u8 dest_mode:1;
u8 delivery_status:1;
u8 polarity:1;
u8 remote_irr:1;
u8 trig_mode:1;
u8 mask:1;
u8 reserve:7;
u8 reserved[4];
u8 dest_id;
} fields;
};
ioapic_mmio_ops负责虚拟化mmio操作
static const structkvm_io_device_ops ioapic_mmio_ops = {
.read =ioapic_mmio_read,
.write =ioapic_mmio_write,
};
kvm_vm_ioctl_set_irqchip ==> case KVM_IRQCHIP_IOAPIC ==>kvm_set_ioapic
调用kvm_ioapic_inject_all处理pending irq,
(2) set_irq
kvm_ioapic_set_irq ==> ioapic_set_irq(ioapic,irq, irq_level, line_status);
/*
 * Raise an interrupt on one IOAPIC pin.
 *
 * Looks up the pin's redirection entry, notes whether it is edge-triggered,
 * sets the pin's bit in ioapic->irr and calls ioapic_service() for it.
 * Abridged excerpt: declarations, the level/edge handling between the shown
 * statements and the final return are omitted.
 */
static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
		int irq_level, bool line_status)
{
	union kvm_ioapic_redirect_entry entry;
	u32 mask = 1 << irq;

	/* ... (abridged) ... */

	entry = ioapic->redirtbl[irq];
	edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);

	/* ... (abridged) ... */

	old_irr = ioapic->irr;
	ioapic->irr |= mask;
	ret = ioapic_service(ioapic, irq, line_status);

	/* ... (abridged) ... */
}
static intioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
a. 根据ioapic->redirtbl[irq] 给struct kvm_lapic_irq irqe;初始化
irqe.dest_id= entry->fields.dest_id;
irqe.vector = entry->fields.vector;
irqe.dest_mode = entry->fields.dest_mode;
irqe.trig_mode = entry->fields.trig_mode;
irqe.delivery_mode = entry->fields.delivery_mode << 8;
irqe.level = 1;
irqe.shorthand = 0;
b. 调用kvm_irq_delivery_to_apic(ioapic->kvm,NULL, &irqe, NULL); delivery中断
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。