bug-hurd
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH 1/6] add support for booting from grub with x86_64


From: Samuel Thibault
Subject: Re: [PATCH 1/6] add support for booting from grub with x86_64
Date: Sat, 27 Aug 2022 20:55:51 +0200
User-agent: NeoMutt/20170609 (1.8.3)

Luca Dariz, le sam. 05 févr. 2022 18:51:24 +0100, a ecrit:
> * configure: compile for native x86_64 by default instead of xen
> * x86_64/Makefrag.am: introduce KERNEL_MAP_BASE to reuse the constant
>   in both code and linker script
> * x86_64/ldscript: use a .boot section for the very first operations,
>   until we reach long mode. This section is not really allocated, so
>   it doesn't need to be freed later. The vm system is later
>   initialized starting from .text and not including .boot
> * link kernel at 0x40000000 as the xen version, higher values cause
>   linker errors
> * we can't use full segmentation in long mode, so we need to create a
>   temporary mapping during early boot to be able to jump to high
>   addresses
> * build direct map for first 4G in boothdr, it seems required by Linux
>   drivers
> * add INTEL_PTE_PS bit definition to enable 2MB pages during bootstrap
> * ensure write bit is set in PDP entry access rights. This only
>   applies to PAE-enabled kernels, mandatory for x86_64. On xen
>   platform it seems to be handled differently
> 
> Signed-off-by: Luca Dariz <luca@orpolo.org>

Applied, thanks!

> ---
>  configure.ac          |   3 +-
>  i386/configfrag.ac    |   2 +
>  i386/i386/i386asm.sym |   1 +
>  i386/i386/vm_param.h  |   2 +-
>  i386/intel/pmap.c     |   4 +-
>  i386/intel/pmap.h     |   1 +
>  x86_64/Makefrag.am    |  18 +++-
>  x86_64/boothdr.S      | 238 ++++++++++++++++++++++++++++++++++++++++++
>  x86_64/ldscript       |  28 +++--
>  9 files changed, 281 insertions(+), 16 deletions(-)
>  create mode 100644 x86_64/boothdr.S
> 
> diff --git a/configure.ac b/configure.ac
> index 019842db..3aaa935c 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -56,8 +56,7 @@ case $host_platform:$host_cpu in
>    default:i?86)
>      host_platform=at;;
>    default:x86_64)]
> -    AC_MSG_WARN([Platform set to Xen by default, this can not boot on non-Xen systems, you currently need a 32bit build for that.])
> -    [host_platform=xen;;
> +    [host_platform=at;;
>    at:i?86 | xen:i?86 | at:x86_64 | xen:x86_64)
>      :;;
>    *)]
> diff --git a/i386/configfrag.ac b/i386/configfrag.ac
> index f697e277..f07a98ca 100644
> --- a/i386/configfrag.ac
> +++ b/i386/configfrag.ac
> @@ -106,6 +106,8 @@ AC_ARG_ENABLE([apic],
>      enable_pae=${enable_pae-yes};;
>    *:i?86)
>      :;;
> +  *:x86_64)
> +    enable_pae=${enable_pae-yes};;
>    *)
>      if [ x"$enable_pae" = xyes ]; then]
>        AC_MSG_ERROR([can only enable the `PAE' feature on ix86.])
> diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym
> index 0662aea0..9e1d13d7 100644
> --- a/i386/i386/i386asm.sym
> +++ b/i386/i386/i386asm.sym
> @@ -122,6 +122,7 @@ expr      sizeof(pt_entry_t)                              
> PTE_SIZE
>  expr INTEL_PTE_PFN                                   PTE_PFN
>  expr INTEL_PTE_VALID                                 PTE_V
>  expr INTEL_PTE_WRITE                                 PTE_W
> +expr INTEL_PTE_PS                                    PTE_S
>  expr ~INTEL_PTE_VALID                                PTE_INVALID
>  expr NPTES                                           PTES_PER_PAGE
>  expr INTEL_PTE_VALID|INTEL_PTE_WRITE                 INTEL_PTE_KERNEL
> diff --git a/i386/i386/vm_param.h b/i386/i386/vm_param.h
> index edd9522c..314fdb35 100644
> --- a/i386/i386/vm_param.h
> +++ b/i386/i386/vm_param.h
> @@ -36,7 +36,7 @@
>   * for better trace support in kdb; the _START symbol has to be offset by the
>   * same amount. */
>  #ifdef __x86_64__
> -#define VM_MIN_KERNEL_ADDRESS        0x40000000UL
> +#define VM_MIN_KERNEL_ADDRESS        KERNEL_MAP_BASE
>  #else
>  #define VM_MIN_KERNEL_ADDRESS        0xC0000000UL
>  #endif
> diff --git a/i386/intel/pmap.c b/i386/intel/pmap.c
> index 3bf00659..d0bd3b5d 100644
> --- a/i386/intel/pmap.c
> +++ b/i386/intel/pmap.c
> @@ -655,7 +655,7 @@ void pmap_bootstrap(void)
>                                 pa_to_pte(_kvtophys((void *) kernel_page_dir
>                                                     + i * INTEL_PGBYTES))
>                                 | INTEL_PTE_VALID
> -#ifdef       MACH_PV_PAGETABLES
> +#if !defined(MACH_HYP) || defined(MACH_PV_PAGETABLES)
>                                 | INTEL_PTE_WRITE
>  #endif
>                                 );
> @@ -1297,7 +1297,7 @@ pmap_t pmap_create(vm_size_t size)
>                       WRITE_PTE(&p->pdpbase[i],
>                                 pa_to_pte(kvtophys((vm_offset_t) page_dir[i]))
>                                 | INTEL_PTE_VALID
> -#ifdef       MACH_PV_PAGETABLES
> +#if !defined(MACH_HYP) || defined(MACH_PV_PAGETABLES)
>                                 | INTEL_PTE_WRITE
>  #endif
>                                 );
> diff --git a/i386/intel/pmap.h b/i386/intel/pmap.h
> index f24b3a71..b93c4ad4 100644
> --- a/i386/intel/pmap.h
> +++ b/i386/intel/pmap.h
> @@ -148,6 +148,7 @@ typedef phys_addr_t pt_entry_t;
>  #define INTEL_PTE_NCACHE     0x00000010
>  #define INTEL_PTE_REF                0x00000020
>  #define INTEL_PTE_MOD                0x00000040
> +#define INTEL_PTE_PS         0x00000080
>  #ifdef       MACH_PV_PAGETABLES
>  /* Not supported */
>  #define INTEL_PTE_GLOBAL     0x00000000
> diff --git a/x86_64/Makefrag.am b/x86_64/Makefrag.am
> index 40b50bc9..5da734de 100644
> --- a/x86_64/Makefrag.am
> +++ b/x86_64/Makefrag.am
> @@ -207,11 +207,27 @@ nodist_libkernel_a_SOURCES += \
>  
>  EXTRA_DIST += \
>       x86_64/ldscript
> +
>  if PLATFORM_at
> +# This should probably be 0xffffffff80000000 for mcmodel=kernel, but let's try
> +# to stay in the first 8G first, otherwise we have to fix the pmap module to
> +# actually use the l4 page level
> +#KERNEL_MAP_BASE=0x100000000
> +# but for now try with < 4G, otherwise we have linker errors
> +KERNEL_MAP_BASE=0x40000000
>  gnumach_LINKFLAGS += \
>       --defsym _START_MAP=$(_START_MAP) \
> -     --defsym _START=_START_MAP+0x40000000 \
> +     --defsym _START=_START_MAP \
> +     --defsym KERNEL_MAP_BASE=$(KERNEL_MAP_BASE) \
>       -T '$(srcdir)'/x86_64/ldscript
> +
> +AM_CFLAGS += -D_START_MAP=$(_START_MAP) \
> +     -DKERNEL_MAP_BASE=$(KERNEL_MAP_BASE)
> +AM_CCASFLAGS += -D_START_MAP=$(_START_MAP) \
> +     -DKERNEL_MAP_BASE=$(KERNEL_MAP_BASE)
> +
> +AM_CCASFLAGS += \
> +     -Ii386
>  endif
>  
>  AM_CPPFLAGS += \
> diff --git a/x86_64/boothdr.S b/x86_64/boothdr.S
> new file mode 100644
> index 00000000..12fc7ca2
> --- /dev/null
> +++ b/x86_64/boothdr.S
> @@ -0,0 +1,238 @@
> +/*
> + *  Copyright (C) 2022 Free Software Foundation
> + *
> + * This program is free software ; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation ; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY ; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with the program ; if not, write to the Free Software
> + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
> + */
> +
> +#include <mach/machine/asm.h>
> +
> +#include <i386/i386asm.h>
> +#include <i386/i386/proc_reg.h>
> +#include <i386/i386/seg.h>
> +     /*
> +      * This section will be put first into .boot.  See also x86_64/ldscript.
> +      */
> +     .section .boot.text,"ax"
> +     .globl boot_start
> +
> +     /* We should never be entered this way.  */
> +     .code32
> +boot_start:
> +     jmp     boot_entry
> +
> +     /* MultiBoot header - see multiboot.h.  */
> +#define      MULTIBOOT_MAGIC         0x1BADB002
> +#ifdef __ELF__
> +#define MULTIBOOT_FLAGS              0x00000003
> +#else  /* __ELF__ */
> +#define MULTIBOOT_FLAGS              0x00010003
> +#endif /* __ELF__ */
> +     P2ALIGN(2)
> +boot_hdr:
> +     .long   MULTIBOOT_MAGIC
> +     .long   MULTIBOOT_FLAGS
> +     /*
> +     * The next item here is the checksum.
> +     * XX this works OK until we need at least the 30th bit.
> +     */
> +     .long   - (MULTIBOOT_MAGIC+MULTIBOOT_FLAGS)
> +#ifndef __ELF__      /* a.out kludge */
> +     .long   boot_hdr        /* header_addr */
> +     .long   _start          /* load_addr */
> +     .long   _edata          /* load_end_addr */
> +     .long   _end            /* bss_end_addr */
> +     .long   boot_entry      /* entry */
> +#endif /* __ELF__ */
> +
> +boot_entry:
> +     /*
> +      * Prepare minimal page mapping to jump to 64 bit and to C code.
> +      * The first 4GB is identity mapped, and the first 2GB are re-mapped
> +      * to high addresses at KERNEL_MAP_BASE
> +      */
> +
> +     movl    $p3table,%eax
> +     or      $(PTE_V|PTE_W),%eax
> +     movl    %eax,(p4table)
> +     /*
> +      * Fill 4 entries in L3 table to cover the whole 32-bit 4GB address
> +      * space. Part of it might be remapped later if the kernel is mapped
> +      * below 4G.
> +      */
> +     movl    $p2table,%eax
> +     or      $(PTE_V|PTE_W),%eax
> +     movl    %eax,(p3table)
> +     movl    $p2table1,%eax
> +     or      $(PTE_V|PTE_W),%eax
> +     movl    %eax,(p3table + 8)
> +     movl    $p2table2,%eax
> +     or      $(PTE_V|PTE_W),%eax
> +     movl    %eax,(p3table + 16)
> +     movl    $p2table3,%eax
> +     or      $(PTE_V|PTE_W),%eax
> +     movl    %eax,(p3table + 24)
> +     /* point each page table level two entry to a page */
> +     mov     $0,%ecx
> +.map_p2_table:
> +     mov     $0x200000,%eax   // 2MiB page, should be always available
> +     mul     %ecx
> +     or      $(PTE_V|PTE_W|PTE_S),%eax  // enable 2MiB page instead of 4k
> +     mov     %eax,p2table(,%ecx,8)
> +     inc     %ecx
> +     cmp     $2048,%ecx  // 512 entries per table, map 4 L2 tables
> +     jne     .map_p2_table
> +
> +     /*
> +      * KERNEL_MAP_BASE must be aligned to 2GB.
> +      * Depending on kernel starting address, we might need to add another
> +      * entry in the L4 table (controlling 512 GB chunks). In any case, we
> +      * add two entries in L3 table to make sure we map 2GB for the kernel.
> +      * Note that this may override part of the mapping created above.
> +      */
> +.kernel_map:
> +#if KERNEL_MAP_BASE >= (1U << 39)
> +     movl    $p3ktable,%eax
> +     or      $(PTE_V|PTE_W),%eax
> +     movl    %eax,(p4table + (8 * ((KERNEL_MAP_BASE >> 39) & 0x1FF)))  // select 512G block
> +     movl    $p2ktable1,%eax
> +     or      $(PTE_V|PTE_W),%eax
> +     movl    %eax,(p3ktable + (8 * ((KERNEL_MAP_BASE >> 30) & 0x1FF) ))  // select first 1G block
> +     movl    $p2ktable2,%eax
> +     or      $(PTE_V|PTE_W),%eax
> +     movl    %eax,(p3ktable + (8 * (((KERNEL_MAP_BASE >> 30) & 0x1FF) + 1) ))  // select second 1G block
> +#else
> +     movl    $p2ktable1,%eax
> +     or      $(PTE_V|PTE_W),%eax
> +     movl    %eax,(p3table + (8 * ((KERNEL_MAP_BASE >> 30) & 0x1FF) ))  // select first 1G block
> +     movl    $p2ktable2,%eax
> +     or      $(PTE_V|PTE_W),%eax
> +     movl    %eax,(p3table + (8 * (((KERNEL_MAP_BASE >> 30) & 0x1FF) + 1) ))  // select second 1G block
> +#endif
> +
> +     mov     $0,%ecx
> +.map_p2k_table:
> +     mov     $0x200000,%eax   // 2MiB page, should be always available
> +     mul     %ecx
> +     or      $(PTE_V|PTE_W|PTE_S),%eax  // enable 2MiB page instead of 4K
> +     mov     %eax,p2ktable1(,%ecx,8)
> +     inc     %ecx
> +     cmp     $1024,%ecx  // 512 entries per table, map 2 L2 tables
> +     jne     .map_p2k_table
> +
> +switch64:
> +     /*
> +      * Jump to 64 bit mode, we have to
> +      * - enable PAE
> +      * - enable long mode
> +      * - enable paging and load the tables filled above in CR3
> +      * - jump to a 64-bit code segment
> +      */
> +     mov     %cr4,%eax
> +     or      $CR4_PAE,%eax
> +     mov     %eax,%cr4
> +     mov     $0xC0000080,%ecx  // select EFER register
> +     rdmsr
> +     or      $(1 << 8),%eax  // long mode enable bit
> +     wrmsr
> +     mov     $p4table,%eax
> +     mov     %eax,%cr3
> +     mov     %cr0,%eax
> +     or      $CR0_PG,%eax
> +     or      $CR0_WP,%eax
> +     mov     %eax,%cr0
> +
> +     lgdt    gdt64pointer
> +     movw    $0,%ax
> +     movw    %ax,%fs
> +     movw    %ax,%gs
> +     movw    $16,%ax
> +     movw    %ax,%ds
> +     movw    %ax,%es
> +     movw    %ax,%ss
> +     ljmp    $8,$boot_entry64
> +
> +     .code64
> +
> +     /* why do we need this? it seems overwritten by linker */
> +       .globl  _start
> +_start:
> +
> +boot_entry64:
> +     /* Switch to our own interrupt stack.  */
> +     movq    $(_intstack+INTSTACK_SIZE),%rax
> +     andq    $(~15),%rax
> +     movq    %rax,%rsp
> +
> +     /* Reset EFLAGS to a known state.  */
> +     pushq   $0
> +     popf
> +     /* save multiboot info for later */
> +     movq    %rbx,%r8
> +
> +     /* Fix ifunc entries */
> +     movq    $__rela_iplt_start,%rsi
> +     movq    $__rela_iplt_end,%rdi
> +iplt_cont:
> +     cmpq    %rdi,%rsi
> +     jae     iplt_done
> +     movq    (%rsi),%rbx     /* r_offset */
> +     movb    4(%rsi),%al     /* info */
> +     cmpb    $42,%al         /* IRELATIVE */
> +     jnz     iplt_next
> +     call    *(%ebx)         /* call ifunc */
> +     movq    %rax,(%rbx)     /* fixed address */
> +iplt_next:
> +     addq    $8,%rsi
> +     jmp     iplt_cont
> +iplt_done:
> +
> +     /* restore multiboot info */
> +     movq    %r8,%rdi
> +     /* Jump into C code.  */
> +     call    EXT(c_boot_entry)
> +     /* not reached */
> +     nop
> +
> +     .section .boot.data
> +     .comm   _intstack,INTSTACK_SIZE
> +
> +     .code32
> +     .section .boot.data
> +     .align 4096
> +#define  SEG_ACCESS_OFS 40
> +#define  SEG_GRANULARITY_OFS 52
> +gdt64:
> +     .quad   0
> +gdt64code:
> +        .quad        (ACC_P << SEG_ACCESS_OFS) | (ACC_CODE_R << 
> SEG_ACCESS_OFS) | (SZ_64 << SEG_GRANULARITY_OFS)
> +gdt64data:
> +        .quad        (ACC_P << SEG_ACCESS_OFS) | (ACC_DATA_W << 
> SEG_ACCESS_OFS)
> +gdt64end:
> +     .skip   (4096 - (gdt64end - gdt64))
> +gdt64pointer:
> +     .word   gdt64end - gdt64 - 1
> +     .quad   gdt64
> +
> +     .section .boot.data
> +     .align 4096
> +p4table:     .space 4096
> +p3table:     .space 4096
> +p2table:     .space 4096
> +p2table1:    .space 4096
> +p2table2:    .space 4096
> +p2table3:    .space 4096
> +p3ktable:    .space 4096
> +p2ktable1:    .space 4096
> +p2ktable2:    .space 4096
> diff --git a/x86_64/ldscript b/x86_64/ldscript
> index 375e8104..de99795e 100644
> --- a/x86_64/ldscript
> +++ b/x86_64/ldscript
> @@ -2,7 +2,7 @@
>  OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64",
>             "elf64-x86-64")
>  OUTPUT_ARCH(i386:x86-64)
> -ENTRY(_start)
> +ENTRY(boot_start)
>  SECTIONS
>  {
>    /*
> @@ -11,22 +11,30 @@ SECTIONS
>     * be first in there.  See also `i386/i386at/boothdr.S' and
>     * `gnumach_LINKFLAGS' in `i386/Makefrag.am'.
>     */
> -  . = _START;
> -  .text           :
> -  AT (_START_MAP)
> +
> +  . = _START_MAP;
> +  .boot           :
> +  {
> +    *(.boot.text)
> +    *(.boot.data)
> +  } =0x90909090
> +
> +  . += KERNEL_MAP_BASE;
> +  _start = .;
> +  .text           : AT(((ADDR(.text)) - KERNEL_MAP_BASE))
>    {
> -    *(.text.start)
> +    *(.text*)
>      *(.text .stub .text.* .gnu.linkonce.t.*)
>      *(.text.unlikely .text.*_unlikely)
>      KEEP (*(.text.*personality*))
>      /* .gnu.warning sections are handled specially by elf32.em.  */
>      *(.gnu.warning)
>    } =0x90909090
> -  .init           :
> +  .init           : AT(((ADDR(.init)) - KERNEL_MAP_BASE))
>    {
>      KEEP (*(.init))
>    } =0x90909090
> -  .fini           :
> +  .fini           : AT(((ADDR(.fini)) - KERNEL_MAP_BASE))
>    {
>      KEEP (*(.fini))
>    } =0x90909090
> @@ -69,7 +77,7 @@ SECTIONS
>        PROVIDE_HIDDEN (__rela_iplt_end = .);
>      }
>    .plt            : { *(.plt) *(.iplt) }
> -  .rodata         : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
> +  .rodata         :  AT(((ADDR(.rodata)) - KERNEL_MAP_BASE)) { *(.rodata 
> .rodata.* .gnu.linkonce.r.*) }
>    .rodata1        : { *(.rodata1) }
>    .eh_frame_hdr : { *(.eh_frame_hdr) }
>    .eh_frame       : ONLY_IF_RO { KEEP (*(.eh_frame)) }
> @@ -139,7 +147,7 @@ SECTIONS
>    .got            : { *(.got) *(.igot) }
>    . = DATA_SEGMENT_RELRO_END (24, .);
>    .got.plt        : { *(.got.plt)  *(.igot.plt) }
> -  .data           :
> +  .data           : AT(((ADDR(.data)) - KERNEL_MAP_BASE))
>    {
>      *(.data .data.* .gnu.linkonce.d.*)
>      SORT(CONSTRUCTORS)
> @@ -147,7 +155,7 @@ SECTIONS
>    .data1          : { *(.data1) }
>    _edata = .; PROVIDE (edata = .);
>    __bss_start = .;
> -  .bss            :
> +  .bss            : AT(((ADDR(.bss)) - KERNEL_MAP_BASE))
>    {
>     *(.dynbss)
>     *(.bss .bss.* .gnu.linkonce.b.*)
> -- 
> 2.30.2
> 
> 

-- 
Samuel
---
Pour une évaluation indépendante, transparente et rigoureuse !
Je soutiens la Commission d'Évaluation de l'Inria.



reply via email to

[Prev in Thread] Current Thread [Next in Thread]