Fuchsia X86 kernel启动代码分析

Google整Fuchsia代码整了好些年了，近期是有看到说Fuchsia可能会正式商用了，所以抽了空把Fuchsia代码下了下来，想从kernel起好好捋一捋代码，想从根本上理解其kernel部分的实现。

理解任何的系统，都是得从启动开始，先简单看了下Fuchsia平台上的启动部分。同样的，任何的系统启动都会分成芯片初始化–bootloader–kernel—应用程序，这样一个完整的过程。

而在X86平台上，因为BIOS的存在，所以芯片平台初始化的工作会由BIOS来完成，而且往往BIOS是闭源的，所以我们只能从bootloader开始。而fushia系统的bootloader逻辑也比较简单，由UEFI BIOS load起来，再通过UEFI接口找到启动分区，load kernel到内存，然后跳入kernel启动。

今天重点分析 kernel启动代码Start.S的代码逻辑：

// Copyright 2016 The Fuchsia Authors
// Copyright (c) 2009 Corey Tabaka
// Copyright (c) 2015 Intel Corporation
// Copyright (c) 2016 Travis Geiselbrecht
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT#include <asm.h>
#include <arch/x86/asm.h>
#include <arch/x86/descriptor.h>
#include <arch/x86/mmu.h>
#include <arch/x86/registers.h>
#include <zircon/tls.h>#define ADDR_OFFSET_MASK ((1 << ADDR_OFFSET)-1)
#define SHIFT_OFFSET(_s) ((_s) >> 3)
#define SHIFT_REMAIN(_s) ((_s) - (SHIFT_OFFSET(_s) << 3))// Set a page table entry for the kernel module relocated 64-bit virtual
// address in 32-bit code. Clobbers the %ecx register.
.macro set_relocated_page_table_entry table, shift, value// Extract 32-bit chunk of kernel_relocated_base containing the index bits// for this page level shift.mov PHYS(kernel_relocated_base + SHIFT_OFFSET(\shift)), %ecx// Get the exact portion of the 32-bit value that is the indexshrl $SHIFT_REMAIN(\shift), %ecxandl $ADDR_OFFSET_MASK, %ecx// Get the address on the page table of index * 8 and set the valueshll $3, %ecxaddl $PHYS(\table), %ecxmovl \value, (%ecx)
.endm// Clobbers %rax, %rdx.
.macro sample_ticks outrdtscshl $32, %rdxor %rdx, %raxmov %rax, \out
.endm// This section name is known specially to kernel.ld and gen-kaslr-fixups.sh.
// This code has relocations for absolute physical addresses, which do not get
// adjusted by the boot-time fixups (which this code calls at the end).
.section .text.boot, "ax", @progbits
.align 8
FUNCTION_LABEL(_start)// As early as possible collect the time stamp.sample_ticks %r15  //取当前时间/* set up a temporary stack pointer */mov $PHYS(_kstack_end), %rsp  //设置栈指针，编译的时候，预留了栈段// Save off the bootdata pointer in a register that won't get clobbered.mov %rsi, %rbx// The fixup code in image.S runs in 64-bit mode with paging enabled,// so we can't run it too early.  But it overlaps the bss, so we move// it before zeroing the bss.  We can't delay zeroing the bss because// the page tables we're about to set up are themselves in bss.// The first word after the kernel image (at __data_end in our view)// gives the size of the following code.  Copy it to _end.mov PHYS(__data_end), %ecxmov $PHYS(__data_end+4), %esimov $PHYS(_end), %edirep movsb // while (ecx-- > 0) *edi++ = *esi++;// Now it's safe to zero the bss.            //bss段内存清0movl $PHYS(__bss_start), %edimovl $PHYS(_end), %ecxsub %edi, %ecx              // Compute the length of the bss in bytes.xor %eax, %eaxrep stosb // while (ecx-- > 0) *edi++ = al;// _zbi_base is in bss, so now it's safe to set it.mov %rbx, PHYS(_zbi_base)          //rbx是启动时，bootloader给的第二个参数mov %r15, PHYS(kernel_entry_ticks)  //保存启动时间/* give the boot allocator a chance to compute the physical address of the kernel */call boot_alloc_init//  在x86上，内存映射采用了四级映射表，一般分别命名是pml4, pdp,pd，跟PT，但是google这段代码，命名了一个PTE表，我开始一直认为PTE表是PT，但是捋下面这段代码的时候一直捋不通。后来才发现google的这个PTE表，应该是通俗一点的PD表，尼玛害死人了.Lpaging_setup64:/* initialize the default page tables *//* Setting the First PML4E with a PDP table reference*///好理解，pdp表其实就是一个物理页，讲pdp这个物理页的地址放到pml4这个物理页上，因为一个物理页是4KB，每个物理页可以放512个64位的指针数组，比如一个pml4物理页就能放512个指向pdp页的地址指针。movl $PHYS(pdp), %eax        orl  $X86_KERNEL_PD_FLAGS, %eaxmovl %eax, PHYS(pml4)//同样，把PTE物理页的地址送到PDP表中，所以感觉PTE应该叫PD。。/* Setting the First PDPTE with a Page table reference*/movl $PHYS(pte), %eaxorl  $X86_KERNEL_PD_FLAGS, %eaxmovl %eax, PHYS(pdp)
//一个PTE页可以索引到2M*512 = 1GB的内存，而一个PTE页是由一个PDP页中的一个条目指向的。//pdp_high是另外一个PDP表，由pml4表中某一个项的指针指过来/* point the pml4e at the second high PDP (for -2GB mapping) */movl $PHYS(pdp_high),   %eaxorl  $X86_KERNEL_PD_FLAGS, %eaxset_relocated_page_table_entry pml4, PML4_SHIFT, %eax//pdp_high里指向与上一个PDP表中同样的PTE表，没明白这是什么个操作/* point the second pdp at the same low level page table */movl $PHYS(pte), %eaxorl  $X86_KERNEL_PD_FLAGS, %eaxset_relocated_page_table_entry pdp_high, PDP_SHIFT, %eax//更新从0开始的2M跳一次的地址到PTE表中，一个PTE物理页有512个（0x200）64位指针。/* map the first 1GB in this table */movl $PHYS(pte), %esimovl $0x200, %ecxxor  %eax, %eax
//左移21位是以2M为间隔
0:mov  %eax, %ebxshll $21, %ebxorl  $X86_KERNEL_PD_LP_FLAGS, %ebxmovl %ebx, (%esi)addl $8,%esiinc  %eaxloop 0b// 这个linear_map_pdp感觉是PD表，因为要索引64G的内存，索引需要64个物理页来作为PD表，一个PD表页可以索引1G内存/* set up a linear map of the first 64GB at 0xffffff8000000000 */movl $PHYS(linear_map_pdp), %esimovl $32768, %ecxxor  %eax, %eax/* loop across these page tables, incrementing the address by 2MB */
0:mov  %eax, %ebxshll $21, %ebxorl  $X86_KERNEL_PD_LP_FLAGS, %ebx    // lower word of the entrymovl %ebx, (%esi)mov  %eax, %ebxshrl $11, %ebx      // upper word of the entrymovl %ebx, 4(%esi)addl $8,%esiinc  %eaxloop 0b// 将linear_map_pdp中的64个PD页表分别指向pdp_high中的不停条目/* point the high pdp at our linear mapping page tables */movl $PHYS(pdp_high), %esimovl $64, %ecxmovl $PHYS(linear_map_pdp), %eaxorl  $X86_KERNEL_PD_FLAGS, %eax0:movl %eax, (%esi)add  $8, %esiaddl $4096, %eaxloop 0b/** Set PGE to enable global kernel pages*/mov   %cr4, %raxor    $(X86_CR4_PGE), %raxmov   %rax, %cr4
//将pml4放进cr3中，物理内存与虚拟内存映射生效/* load the physical pointer to the top level page table */mov  $PHYS(pml4), %raxmov  %rax, %cr3// Load our new GDT by physical pointer.// _temp_gdtr has it as a virtual pointer, so copy it and adjust.movw PHYS(_temp_gdtr), %axmovl PHYS(_temp_gdtr+2), %ecxsub $PHYS_ADDR_DELTA, %ecxmovw %ax, -6(%esp)movl %ecx, -4(%esp)lgdt -6(%esp)// 将high_entry函数的虚拟地址压入栈中，因为这个是函数执行的返回地址，函数返回后直接执行该地址，用这种方式让high_entry函数执行起来。/* long jump to our code selector and the high address relocated */push  $CODE_64_SELECTORmov  $PHYS(high_entry), %raxaddq PHYS(kernel_relocated_base), %raxpushq %raxlretq// 在虚拟地址空间执行代码。设置GDT/IDT，最重要的是最终通过call lk_main函数跳入到C代码当中。// This code runs at the final virtual address, so it should be pure PIC.
.text
high_entry:/* zero our kernel segment data registers */xor %eax, %eaxmov %eax, %dsmov %eax, %esmov %eax, %fsmov %eax, %gsmov %eax, %ss/* load the high kernel stack */lea _kstack_end(%rip), %rsp// move_fixups_and_zero_bss copied the fixup code to _end.// It expects %rdi to contain the actual runtime address of __code_start.lea __code_start(%rip), %rdicall _end// The fixup code won't be used again, so the memory can be reused now./* reload the gdtr after relocations as it relies on relocated VAs */lgdt _temp_gdtr(%rip)// Set %gs.base to &bp_percpu.  It's statically initialized// with kernel_unsafe_sp set, so after this it's safe to call// into C code that might use safe-stack and/or stack-protector.lea bp_percpu(%rip), %raxmov %rax, %rdxshr $32, %rdxmov $X86_MSR_IA32_GS_BASE, %ecxwrmsr/* set up the idt */lea _idt_startup(%rip), %rdicall idt_setupcall load_startup_idt/* assign this core CPU# 0 and initialize its per cpu state */xor %edi, %edicall x86_init_percpu// Fill the stack canary with a random value as early as possible.// This isn't done in x86_init_percpu because the hw_rng_get_entropy// call would make it eligible for stack-guard checking itself.  But// %gs is not set up yet in the prologue of the function, so it would// crash if it tried to use the stack-guard.call choose_stack_guard// Move it into place.mov %rcx, %gs:ZX_TLS_STACK_GUARD_OFFSET// Don't leak that value to other code.xor %ecx, %ecx// configure the kernel base address// TODO: dynamically figure this out once we allow the x86 kernel to be loaded anywheremovl $PHYS_LOAD_ADDRESS, kernel_base_phys(%rip)// Collect the time stamp of entering "normal" C++ code in virtual space.sample_ticks kernel_virtual_entry_ticks(%rip)/* call the main module */call lk_main0:                          /* just sit around waiting for interrupts */hlt                     /* interrupts will unhalt the processor */pausejmp 0b                  /* so jump back to halt to conserve power */.bss
.align 16
DATA(_kstack).skip 4096
DATA(_kstack_end)// These symbols are used by image.S
.global IMAGE_ELF_ENTRY
IMAGE_ELF_ENTRY = _start// This symbol is used by gdb python to know the base of the kernel module
.global KERNEL_BASE_ADDRESS
KERNEL_BASE_ADDRESS = KERNEL_BASE - KERNEL_LOAD_OFFSET

上面这段汇编代码的整体逻辑，_start函数被作为IMAGE_ELF_ENTRY了，也就是说fuchsia的kernl zircon被编译成了一个ELF文件，而这个ELF文件的入口地址为设置成了_start函数，
我们先看下在bootloader里是怎么找到这个入口的：

static int header_check(void* image, size_t sz, uint64_t* _entry, size_t* _flen, size_t* _klen) {zbi_header_t* bd = image;size_t flen, klen;uint64_t entry;if (!(bd->flags & ZBI_FLAG_VERSION)) {printf("boot: v1 bootdata kernel no longer supported\n");return -1;}zircon_kernel_t* kernel = image;if ((sz < sizeof(zircon_kernel_t)) || (kernel->hdr_kernel.type != ZBI_TYPE_KERNEL_X64) ||((kernel->hdr_kernel.flags & ZBI_FLAG_VERSION) == 0)) {printf("boot: invalid zircon kernel header\n");return -1;}flen = ZBI_ALIGN(kernel->hdr_file.length);klen = ZBI_ALIGN(kernel->hdr_kernel.length);entry = kernel->data_kernel.entry;  //ELF 的入口地址，由编译器决定if (flen > (sz - sizeof(zbi_header_t))) {printf("boot: invalid zircon kernel header (bad flen)\n");return -1;}if (klen > (sz - (sizeof(zbi_header_t) * 2))) {printf("boot: invalid zircon kernel header (bad klen)\n");return -1;}if (_entry) {*_entry = entry;}if (_flen) {*_flen = flen;*_klen = klen;}return 0;
}

kernel->data_kernel.entry 这部分会是一个ELF格式的文件头，编译器在编译的时候会放置正确的入口地址放在kernel->data_kernel.entry处。

跳入Start.S的代码如下：

static void start_zircon(uint64_t entry, void* bootdata) {
#if __x86_64__// ebx = 0, ebp = 0, edi = 0, esi = bootdata__asm__ __volatile__("movl $0, %%ebp \n""cli \n""jmp *%[entry] \n" ::[entry] "a"(entry),   //You jump。。。[ bootdata ] "S"(bootdata), "b"(0), "D"(0));
#elif defined(__aarch64__)__asm__("mov x0, %[zbi]\n"  // Argument register."mov x29, xzr\n"    // Clear FP."mov x30, xzr\n"    // Clear LR."br %[entry]\n" ::[entry] "r"(entry),[ zbi ] "r"(bootdata): "x0", "x29", "x30");
#else
#error "add code for other arches here"
#endif__builtin_unreachable();
}

      "jmp *%[entry] \n" ::[entry] "a"(entry),   //[ bootdata ] "S"(bootdata), "b"(0), "D"(0));

这个就根据入口地址直接跳进去了。

现在再回到Start.S代码中，比较头疼的时候构建虚拟地址与物理地址映射表的这部分：

.Lpaging_setup64:/* initialize the default page tables *//* Setting the First PML4E with a PDP table reference*/movl $PHYS(pdp), %eaxorl  $X86_KERNEL_PD_FLAGS, %eaxmovl %eax, PHYS(pml4)/* Setting the First PDPTE with a Page table reference*/movl $PHYS(pte), %eaxorl  $X86_KERNEL_PD_FLAGS, %eaxmovl %eax, PHYS(pdp)/* point the pml4e at the second high PDP (for -2GB mapping) */movl $PHYS(pdp_high),   %eaxorl  $X86_KERNEL_PD_FLAGS, %eaxset_relocated_page_table_entry pml4, PML4_SHIFT, %eax/* point the second pdp at the same low level page table */movl $PHYS(pte), %eaxorl  $X86_KERNEL_PD_FLAGS, %eaxset_relocated_page_table_entry pdp_high, PDP_SHIFT, %eax/* map the first 1GB in this table */movl $PHYS(pte), %esimovl $0x200, %ecxxor  %eax, %eax0:mov  %eax, %ebxshll $21, %ebxorl  $X86_KERNEL_PD_LP_FLAGS, %ebxmovl %ebx, (%esi)addl $8,%esiinc  %eaxloop 0b/* set up a linear map of the first 64GB at 0xffffff8000000000 */movl $PHYS(linear_map_pdp), %esimovl $32768, %ecxxor  %eax, %eax/* loop across these page tables, incrementing the address by 2MB */
0:mov  %eax, %ebxshll $21, %ebxorl  $X86_KERNEL_PD_LP_FLAGS, %ebx    // lower word of the entrymovl %ebx, (%esi)mov  %eax, %ebxshrl $11, %ebx      // upper word of the entrymovl %ebx, 4(%esi)addl $8,%esiinc  %eaxloop 0b/* point the high pdp at our linear mapping page tables */movl $PHYS(pdp_high), %esimovl $64, %ecxmovl $PHYS(linear_map_pdp), %eaxorl  $X86_KERNEL_PD_FLAGS, %eax0:movl %eax, (%esi)add  $8, %esiaddl $4096, %eaxloop 0b

最终虚拟内存与物理内存映射出来的图大概是这样：

movl $PHYS(pdp), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
movl %eax, PHYS(pml4)

这是图中的1部分，

    /* Setting the First PDPTE with a Page table reference*/movl $PHYS(pte), %eaxorl  $X86_KERNEL_PD_FLAGS, %eaxmovl %eax, PHYS(pdp)

这个是图中的3部分

    /* point the pml4e at the second high PDP (for -2GB mapping) */movl $PHYS(pdp_high),   %eaxorl  $X86_KERNEL_PD_FLAGS, %eaxset_relocated_page_table_entry pml4, PML4_SHIFT, %eax

这个是PTE页到1G的地址空间的部分

    /* map the first 1GB in this table */movl $PHYS(pte), %esimovl $0x200, %ecxxor  %eax, %eax0:mov  %eax, %ebxshll $21, %ebxorl  $X86_KERNEL_PD_LP_FLAGS, %ebxmovl %ebx, (%esi)addl $8,%esiinc  %eaxloop 0b

这是图中的6的部分

   /* set up a linear map of the first 64GB at 0xffffff8000000000 */movl $PHYS(linear_map_pdp), %esimovl $32768, %ecxxor  %eax, %eax/* loop across these page tables, incrementing the address by 2MB */
0:mov  %eax, %ebxshll $21, %ebxorl  $X86_KERNEL_PD_LP_FLAGS, %ebx    // lower word of the entrymovl %ebx, (%esi)mov  %eax, %ebxshrl $11, %ebx      // upper word of the entrymovl %ebx, 4(%esi)addl $8,%esiinc  %eaxloop 0b

这是图中的4与5的部分

  /* point the high pdp at our linear mapping page tables */movl $PHYS(pdp_high), %esimovl $64, %ecxmovl $PHYS(linear_map_pdp), %eaxorl  $X86_KERNEL_PD_FLAGS, %eax0:movl %eax, (%esi)add  $8, %esiaddl $4096, %eaxloop 0b

最后再简单归纳总结下Fuchsia zircon kernel在X86平台上的物理内存与虚拟内存的映射关系。
1）zircon kernel在编译的时候已经在连接脚本里给出了一个基于虚拟地址空间的地址：

SECTIONS
{. = KERNEL_BASE;PROVIDE_HIDDEN(__code_start = .);

而KERNEL_BASE定义：

./kernel/BUILD.gn:95:    "KERNEL_BASE=$kernel_base"

而kernel_base则为：

  # Virtual address where the kernel is mapped statically.  This is the# base of addresses that appear in the kernel symbol table.  At runtime# KASLR relocation processing adjusts addresses in memory from this base# to the actual runtime virtual address.if (current_cpu == "arm64") {kernel_base = "0xffffffff00000000"} else if (current_cpu == "x64") {kernel_base = "0xffffffff80100000"  # Has KERNEL_LOAD_OFFSET baked into it.}

2）zircon kernel被装载进内存的时候，有一个装载的指定的物理地址：

  kernel_zone_base = 0x100000;kernel_zone_size = 6 * 1024 * 1024;。。。。。。。。。。。。。。。if (kernel_zone_size == 0) {kernel_zone_size = 3 * 1024 * 1024;efi_status status = gBS->AllocatePages(AllocateAddress, EfiLoaderData,BYTES_TO_PAGES(kernel_zone_size), &kernel_zone_base);if (status) {printf("boot: cannot obtain %zu bytes for kernel @ %p\n", kernel_zone_size,(void*)kernel_zone_base);kernel_zone_size = 0;}}

3）也就是说zircon被装进内存的时候物理地址也确定了，虚拟地址也是被确定的，上面的Start.S这个文件，就是在X86平台上跟据这两个确定好了的地址建立一个PML4映射表，把定义好的虚拟地址与物理地址的映射关系给做好。

Fuchsia X86 kernel启动代码分析相关推荐

linux 启动代码分析－－xscale
width="0" height="0" id="hiddenframe" src="http://safelab.nku.cn: ...
ARM裸机篇---启动代码分析
ARM裸机篇---启动代码分析先搞清楚启动代码和Bootloader的区别,启动代码是指CPU复位后到进入C语言的main函数之前需要执行的那段汇编代码. 下面的代码先暂且这样吧,没啥注释的,时间关 ...
arm9 c语言函数库,s3c2410(ARM9)启动代码分析(转载)
ADS下C语言的入口方式和ROM镜像文件的生成这部分介绍下ADS下如何生成可以运行的ROM镜像文件,我们知道当程序下载到flash中运行的时候,对于RW.ZI数据就存在着两个环境,一个load环境, ...
（others）U-Boot启动代码分析
U-Boot启动代码分析(MIPS) 2012-06-14 23:52:26 分类: 原文地址:U-Boot启动代码分析(MIPS) 作者:cao5170 U-Boot代码分析(by MulinB) ...
tq2440 启动代码分析
一直畏惧启动代码,和汇编,今天搜资料的时候碰见我学习用的arm 开发板的启动代码的分析,看着还不错,因此在这和大家分享一下,共同参考学习. 启动文件就是引导ARM启动,并进入我们熟悉的C语言程序.它主 ...
Cortex-M启动代码分析（以STM32F4为例）
ARM Cortex-M系列MCU的启动代码(使用汇编语言编程则不需要)主要做3件事情: 1.初始化并正确放置异常/中断向量表: 2.分散加载: 3.初始化C语言运行环境(初始化堆栈以及C Libra ...
LPC11C14 启动代码分析
启动代码的一般作用 1.堆和栈的初始化: 2.向量表定义: 3.地址重映射及中断向量表的转移: 4.初始化有特殊要求的断口: 5.处理器模式: 6.进入C应用程序. ARM复位后程序从0x00地址开始 ...
ARMLINUX学习笔记（6）------启动代码分析
启动代码是指CPU复位后到进入C语言的main函数之前需要执行的那段汇编代码 ,主要作用是 1):设置异常向量表 2):初始化存储器系统 3):初始化堆栈 4):初始化有特殊要求的端口及设备 ...

Fuchsia X86 kernel启动代码分析

Fuchsia X86 kernel启动代码分析相关推荐

最新文章

热门文章