bug-hurd
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 1/6] add support for booting from grub with x86_64


From: Luca Dariz
Subject: [PATCH 1/6] add support for booting from grub with x86_64
Date: Sat, 5 Feb 2022 18:51:24 +0100

* configure: compile for native x86_64 by default instead of xen
* x86_64/Makefrag.am: introduce KERNEL_MAP_BASE to reuse the constant
  in both code and linker script
* x86_64/ldscript: use a .boot section for the very first operations,
  until we reach long mode. This section is not really allocated, so
  it doesn't need to be freed later. The vm system is later
  initialized starting from .text and not including .boot
* link kernel at 0x40000000 as the xen version, higher values cause
  linker errors
* we can't use full segmentation in long mode, so we need to create a
  temporary mapping during early boot to be able to jump to high
  addresses
* build direct map for first 4G in boothdr, it seems required by Linux
  drivers
* add INTEL_PTE_PS bit definition to enable 2MB pages during bootstrap
* ensure write bit is set in PDP entry access rights. This only
  applies to PAE-enabled kernels, mandatory for x86_64. On xen
  platform it seems to be handled differently

Signed-off-by: Luca Dariz <luca@orpolo.org>
---
 configure.ac          |   3 +-
 i386/configfrag.ac    |   2 +
 i386/i386/i386asm.sym |   1 +
 i386/i386/vm_param.h  |   2 +-
 i386/intel/pmap.c     |   4 +-
 i386/intel/pmap.h     |   1 +
 x86_64/Makefrag.am    |  18 +++-
 x86_64/boothdr.S      | 238 ++++++++++++++++++++++++++++++++++++++++++
 x86_64/ldscript       |  28 +++--
 9 files changed, 281 insertions(+), 16 deletions(-)
 create mode 100644 x86_64/boothdr.S

diff --git a/configure.ac b/configure.ac
index 019842db..3aaa935c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -56,8 +56,7 @@ case $host_platform:$host_cpu in
   default:i?86)
     host_platform=at;;
   default:x86_64)]
-    AC_MSG_WARN([Platform set to Xen by default, this can not boot on non-Xen systems, you currently need a 32bit build for that.])
-    [host_platform=xen;;
+    [host_platform=at;;
   at:i?86 | xen:i?86 | at:x86_64 | xen:x86_64)
     :;;
   *)]
diff --git a/i386/configfrag.ac b/i386/configfrag.ac
index f697e277..f07a98ca 100644
--- a/i386/configfrag.ac
+++ b/i386/configfrag.ac
@@ -106,6 +106,8 @@ AC_ARG_ENABLE([apic],
     enable_pae=${enable_pae-yes};;
   *:i?86)
     :;;
+  *:x86_64)
+    enable_pae=${enable_pae-yes};;
   *)
     if [ x"$enable_pae" = xyes ]; then]
       AC_MSG_ERROR([can only enable the `PAE' feature on ix86.])
diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym
index 0662aea0..9e1d13d7 100644
--- a/i386/i386/i386asm.sym
+++ b/i386/i386/i386asm.sym
@@ -122,6 +122,7 @@ expr        sizeof(pt_entry_t)                              PTE_SIZE
 expr   INTEL_PTE_PFN                                   PTE_PFN
 expr   INTEL_PTE_VALID                                 PTE_V
 expr   INTEL_PTE_WRITE                                 PTE_W
+expr   INTEL_PTE_PS                                    PTE_S
 expr   ~INTEL_PTE_VALID                                PTE_INVALID
 expr   NPTES                                           PTES_PER_PAGE
 expr   INTEL_PTE_VALID|INTEL_PTE_WRITE                 INTEL_PTE_KERNEL
diff --git a/i386/i386/vm_param.h b/i386/i386/vm_param.h
index edd9522c..314fdb35 100644
--- a/i386/i386/vm_param.h
+++ b/i386/i386/vm_param.h
@@ -36,7 +36,7 @@
  * for better trace support in kdb; the _START symbol has to be offset by the
  * same amount. */
 #ifdef __x86_64__
-#define VM_MIN_KERNEL_ADDRESS  0x40000000UL
+#define VM_MIN_KERNEL_ADDRESS  KERNEL_MAP_BASE
 #else
 #define VM_MIN_KERNEL_ADDRESS  0xC0000000UL
 #endif
diff --git a/i386/intel/pmap.c b/i386/intel/pmap.c
index 3bf00659..d0bd3b5d 100644
--- a/i386/intel/pmap.c
+++ b/i386/intel/pmap.c
@@ -655,7 +655,7 @@ void pmap_bootstrap(void)
                                  pa_to_pte(_kvtophys((void *) kernel_page_dir
                                                      + i * INTEL_PGBYTES))
                                  | INTEL_PTE_VALID
-#ifdef MACH_PV_PAGETABLES
+#if !defined(MACH_HYP) || defined(MACH_PV_PAGETABLES)
                                  | INTEL_PTE_WRITE
 #endif
                                  );
@@ -1297,7 +1297,7 @@ pmap_t pmap_create(vm_size_t size)
                        WRITE_PTE(&p->pdpbase[i],
                                  pa_to_pte(kvtophys((vm_offset_t) page_dir[i]))
                                  | INTEL_PTE_VALID
-#ifdef MACH_PV_PAGETABLES
+#if !defined(MACH_HYP) || defined(MACH_PV_PAGETABLES)
                                  | INTEL_PTE_WRITE
 #endif
                                  );
diff --git a/i386/intel/pmap.h b/i386/intel/pmap.h
index f24b3a71..b93c4ad4 100644
--- a/i386/intel/pmap.h
+++ b/i386/intel/pmap.h
@@ -148,6 +148,7 @@ typedef phys_addr_t pt_entry_t;
 #define INTEL_PTE_NCACHE       0x00000010
 #define INTEL_PTE_REF          0x00000020
 #define INTEL_PTE_MOD          0x00000040
+#define INTEL_PTE_PS           0x00000080
 #ifdef MACH_PV_PAGETABLES
 /* Not supported */
 #define INTEL_PTE_GLOBAL       0x00000000
diff --git a/x86_64/Makefrag.am b/x86_64/Makefrag.am
index 40b50bc9..5da734de 100644
--- a/x86_64/Makefrag.am
+++ b/x86_64/Makefrag.am
@@ -207,11 +207,27 @@ nodist_libkernel_a_SOURCES += \
 
 EXTRA_DIST += \
        x86_64/ldscript
+
 if PLATFORM_at
+# This should probably be 0xffffffff80000000 for mcmodel=kernel, but let's try
+# to stay in the first 8G first, otherwise we have to fix the pmap module to
+# actually use the l4 page level
+#KERNEL_MAP_BASE=0x100000000
+# but for now try with < 4G, otherwise we have linker errors
+KERNEL_MAP_BASE=0x40000000
 gnumach_LINKFLAGS += \
        --defsym _START_MAP=$(_START_MAP) \
-       --defsym _START=_START_MAP+0x40000000 \
+       --defsym _START=_START_MAP \
+       --defsym KERNEL_MAP_BASE=$(KERNEL_MAP_BASE) \
        -T '$(srcdir)'/x86_64/ldscript
+
+AM_CFLAGS += -D_START_MAP=$(_START_MAP) \
+       -DKERNEL_MAP_BASE=$(KERNEL_MAP_BASE)
+AM_CCASFLAGS += -D_START_MAP=$(_START_MAP) \
+       -DKERNEL_MAP_BASE=$(KERNEL_MAP_BASE)
+
+AM_CCASFLAGS += \
+       -Ii386
 endif
 
 AM_CPPFLAGS += \
diff --git a/x86_64/boothdr.S b/x86_64/boothdr.S
new file mode 100644
index 00000000..12fc7ca2
--- /dev/null
+++ b/x86_64/boothdr.S
@@ -0,0 +1,238 @@
+/*
+ *  Copyright (C) 2022 Free Software Foundation
+ *
+ * This program is free software ; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation ; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the program ; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <mach/machine/asm.h>
+
+#include <i386/i386asm.h>
+#include <i386/i386/proc_reg.h>
+#include <i386/i386/seg.h>
+       /*
+        * This section will be put first into .boot.  See also x86_64/ldscript.
+        */
+       .section .boot.text,"ax"
+       .globl boot_start
+
+       /* We should never be entered this way.  */
+       .code32
+boot_start:
+       jmp     boot_entry
+
+       /* MultiBoot header - see multiboot.h.  */
+#define        MULTIBOOT_MAGIC         0x1BADB002
+#ifdef __ELF__
+#define MULTIBOOT_FLAGS                0x00000003
+#else  /* __ELF__ */
+#define MULTIBOOT_FLAGS                0x00010003
+#endif /* __ELF__ */
+       P2ALIGN(2)
+boot_hdr:
+       .long   MULTIBOOT_MAGIC
+       .long   MULTIBOOT_FLAGS
+       /*
+       * The next item here is the checksum.
+       * XX this works OK until we need at least the 30th bit.
+       */
+       .long   - (MULTIBOOT_MAGIC+MULTIBOOT_FLAGS)
+#ifndef __ELF__        /* a.out kludge */
+       .long   boot_hdr        /* header_addr */
+       .long   _start          /* load_addr */
+       .long   _edata          /* load_end_addr */
+       .long   _end            /* bss_end_addr */
+       .long   boot_entry      /* entry */
+#endif /* __ELF__ */
+
+boot_entry:
+       /*
+        * Prepare minimal page mapping to jump to 64 bit and to C code.
+        * The first 4GB is identity mapped, and the first 2GB are re-mapped
+        * to high addresses at KERNEL_MAP_BASE
+        */
+
+       movl    $p3table,%eax
+       or      $(PTE_V|PTE_W),%eax
+       movl    %eax,(p4table)
+       /*
+        * Fill 4 entries in L3 table to cover the whole 32-bit 4GB address
+        * space. Part of it might be remapped later if the kernel is mapped
+        * below 4G.
+        */
+       movl    $p2table,%eax
+       or      $(PTE_V|PTE_W),%eax
+       movl    %eax,(p3table)
+       movl    $p2table1,%eax
+       or      $(PTE_V|PTE_W),%eax
+       movl    %eax,(p3table + 8)
+       movl    $p2table2,%eax
+       or      $(PTE_V|PTE_W),%eax
+       movl    %eax,(p3table + 16)
+       movl    $p2table3,%eax
+       or      $(PTE_V|PTE_W),%eax
+       movl    %eax,(p3table + 24)
+       /* point each page table level two entry to a page */
+       mov     $0,%ecx
+.map_p2_table:
+       mov     $0x200000,%eax   // 2MiB page, should be always available
+       mul     %ecx
+       or      $(PTE_V|PTE_W|PTE_S),%eax  // enable 2MiB page instead of 4k
+       mov     %eax,p2table(,%ecx,8)
+       inc     %ecx
+       cmp     $2048,%ecx  // 512 entries per table, map 4 L2 tables
+       jne     .map_p2_table
+
+       /*
+        * KERNEL_MAP_BASE must be aligned to 2GB.
+        * Depending on kernel starting address, we might need to add another
+        * entry in the L4 table (controlling 512 GB chunks). In any case, we
+        * add two entries in L3 table to make sure we map 2GB for the kernel.
+        * Note that this may override part of the mapping created above.
+        */
+.kernel_map:
+#if KERNEL_MAP_BASE >= (1U << 39)
+       movl    $p3ktable,%eax
+       or      $(PTE_V|PTE_W),%eax
+       movl    %eax,(p4table + (8 * ((KERNEL_MAP_BASE >> 39) & 0x1FF)))  // select 512G block
+       movl    $p2ktable1,%eax
+       or      $(PTE_V|PTE_W),%eax
+       movl    %eax,(p3ktable + (8 * ((KERNEL_MAP_BASE >> 30) & 0x1FF) ))  // select first 1G block
+       movl    $p2ktable2,%eax
+       or      $(PTE_V|PTE_W),%eax
+       movl    %eax,(p3ktable + (8 * (((KERNEL_MAP_BASE >> 30) & 0x1FF) + 1) ))  // select second 1G block
+#else
+       movl    $p2ktable1,%eax
+       or      $(PTE_V|PTE_W),%eax
+       movl    %eax,(p3table + (8 * ((KERNEL_MAP_BASE >> 30) & 0x1FF) ))  // select first 1G block
+       movl    $p2ktable2,%eax
+       or      $(PTE_V|PTE_W),%eax
+       movl    %eax,(p3table + (8 * (((KERNEL_MAP_BASE >> 30) & 0x1FF) + 1) ))  // select second 1G block
+#endif
+
+       mov     $0,%ecx
+.map_p2k_table:
+       mov     $0x200000,%eax   // 2MiB page, should be always available
+       mul     %ecx
+       or      $(PTE_V|PTE_W|PTE_S),%eax  // enable 2MiB page instead of 4K
+       mov     %eax,p2ktable1(,%ecx,8)
+       inc     %ecx
+       cmp     $1024,%ecx  // 512 entries per table, map 2 L2 tables
+       jne     .map_p2k_table
+
+switch64:
+       /*
+        * Jump to 64 bit mode, we have to
+        * - enable PAE
+        * - enable long mode
+        * - enable paging and load the tables filled above in CR3
+        * - jump to a 64-bit code segment
+        */
+       mov     %cr4,%eax
+       or      $CR4_PAE,%eax
+       mov     %eax,%cr4
+       mov     $0xC0000080,%ecx  // select EFER register
+       rdmsr
+       or      $(1 << 8),%eax  // long mode enable bit
+       wrmsr
+       mov     $p4table,%eax
+       mov     %eax,%cr3
+       mov     %cr0,%eax
+       or      $CR0_PG,%eax
+       or      $CR0_WP,%eax
+       mov     %eax,%cr0
+
+       lgdt    gdt64pointer
+       movw    $0,%ax
+       movw    %ax,%fs
+       movw    %ax,%gs
+       movw    $16,%ax
+       movw    %ax,%ds
+       movw    %ax,%es
+       movw    %ax,%ss
+       ljmp    $8,$boot_entry64
+
+       .code64
+
+       /* why do we need this? it seems overwritten by linker */
+       .globl  _start
+_start:
+
+boot_entry64:
+       /* Switch to our own interrupt stack.  */
+       movq    $(_intstack+INTSTACK_SIZE),%rax
+       andq    $(~15),%rax
+       movq    %rax,%rsp
+
+       /* Reset EFLAGS to a known state.  */
+       pushq   $0
+       popf
+       /* save multiboot info for later */
+       movq    %rbx,%r8
+
+       /* Fix ifunc entries */
+       movq    $__rela_iplt_start,%rsi
+       movq    $__rela_iplt_end,%rdi
+iplt_cont:
+       cmpq    %rdi,%rsi
+       jae     iplt_done
+       movq    (%rsi),%rbx     /* r_offset */
+       movb    4(%rsi),%al     /* info */
+       cmpb    $42,%al         /* IRELATIVE */
+       jnz     iplt_next
+       call    *(%ebx)         /* call ifunc */
+       movq    %rax,(%rbx)     /* fixed address */
+iplt_next:
+       addq    $8,%rsi
+       jmp     iplt_cont
+iplt_done:
+
+       /* restore multiboot info */
+       movq    %r8,%rdi
+       /* Jump into C code.  */
+       call    EXT(c_boot_entry)
+       /* not reached */
+       nop
+
+       .section .boot.data
+       .comm   _intstack,INTSTACK_SIZE
+
+       .code32
+       .section .boot.data
+       .align 4096
+#define  SEG_ACCESS_OFS 40
+#define  SEG_GRANULARITY_OFS 52
+gdt64:
+       .quad   0
+gdt64code:
+        .quad  (ACC_P << SEG_ACCESS_OFS) | (ACC_CODE_R << SEG_ACCESS_OFS) | (SZ_64 << SEG_GRANULARITY_OFS)
+gdt64data:
+        .quad  (ACC_P << SEG_ACCESS_OFS) | (ACC_DATA_W << SEG_ACCESS_OFS)
+gdt64end:
+       .skip   (4096 - (gdt64end - gdt64))
+gdt64pointer:
+       .word   gdt64end - gdt64 - 1
+       .quad   gdt64
+
+       .section .boot.data
+       .align 4096
+p4table:       .space 4096
+p3table:       .space 4096
+p2table:       .space 4096
+p2table1:      .space 4096
+p2table2:      .space 4096
+p2table3:      .space 4096
+p3ktable:      .space 4096
+p2ktable1:      .space 4096
+p2ktable2:      .space 4096
diff --git a/x86_64/ldscript b/x86_64/ldscript
index 375e8104..de99795e 100644
--- a/x86_64/ldscript
+++ b/x86_64/ldscript
@@ -2,7 +2,7 @@
 OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64",
              "elf64-x86-64")
 OUTPUT_ARCH(i386:x86-64)
-ENTRY(_start)
+ENTRY(boot_start)
 SECTIONS
 {
   /*
@@ -11,22 +11,30 @@ SECTIONS
    * be first in there.  See also `i386/i386at/boothdr.S' and
    * `gnumach_LINKFLAGS' in `i386/Makefrag.am'.
    */
-  . = _START;
-  .text           :
-  AT (_START_MAP)
+
+  . = _START_MAP;
+  .boot           :
+  {
+    *(.boot.text)
+    *(.boot.data)
+  } =0x90909090
+
+  . += KERNEL_MAP_BASE;
+  _start = .;
+  .text           : AT(((ADDR(.text)) - KERNEL_MAP_BASE))
   {
-    *(.text.start)
+    *(.text*)
     *(.text .stub .text.* .gnu.linkonce.t.*)
     *(.text.unlikely .text.*_unlikely)
     KEEP (*(.text.*personality*))
     /* .gnu.warning sections are handled specially by elf32.em.  */
     *(.gnu.warning)
   } =0x90909090
-  .init           :
+  .init           : AT(((ADDR(.init)) - KERNEL_MAP_BASE))
   {
     KEEP (*(.init))
   } =0x90909090
-  .fini           :
+  .fini           : AT(((ADDR(.fini)) - KERNEL_MAP_BASE))
   {
     KEEP (*(.fini))
   } =0x90909090
@@ -69,7 +77,7 @@ SECTIONS
       PROVIDE_HIDDEN (__rela_iplt_end = .);
     }
   .plt            : { *(.plt) *(.iplt) }
-  .rodata         : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+  .rodata         :  AT(((ADDR(.rodata)) - KERNEL_MAP_BASE)) { *(.rodata .rodata.* .gnu.linkonce.r.*) }
   .rodata1        : { *(.rodata1) }
   .eh_frame_hdr : { *(.eh_frame_hdr) }
   .eh_frame       : ONLY_IF_RO { KEEP (*(.eh_frame)) }
@@ -139,7 +147,7 @@ SECTIONS
   .got            : { *(.got) *(.igot) }
   . = DATA_SEGMENT_RELRO_END (24, .);
   .got.plt        : { *(.got.plt)  *(.igot.plt) }
-  .data           :
+  .data           : AT(((ADDR(.data)) - KERNEL_MAP_BASE))
   {
     *(.data .data.* .gnu.linkonce.d.*)
     SORT(CONSTRUCTORS)
@@ -147,7 +155,7 @@ SECTIONS
   .data1          : { *(.data1) }
   _edata = .; PROVIDE (edata = .);
   __bss_start = .;
-  .bss            :
+  .bss            : AT(((ADDR(.bss)) - KERNEL_MAP_BASE))
   {
    *(.dynbss)
    *(.bss .bss.* .gnu.linkonce.b.*)
-- 
2.30.2




reply via email to

[Prev in Thread] Current Thread [Next in Thread]