[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 1/6] add support for booting from grub with x86_64
From: |
Luca Dariz |
Subject: |
[PATCH 1/6] add support for booting from grub with x86_64 |
Date: |
Fri, 28 Jan 2022 19:24:04 +0100 |
* link kernel at 0x4000000 as the xen version, higher values cause
linker errors.
* we can't use full segmentation in long mode, so we need to create a
temporary mapping during early boot to be able to jump to high
addresses
* build direct map for first 4G in boothdr (seems required by Linux
drivers)
* enable also write page access check in kernel mode
Signed-off-by: Luca Dariz <luca@orpolo.org>
---
configure.ac | 3 +-
i386/configfrag.ac | 2 +
i386/i386/vm_param.h | 4 +-
i386/intel/pmap.c | 4 +-
i386/intel/pmap.h | 4 +
x86_64/Makefrag.am | 18 +++-
x86_64/boothdr.S | 214 +++++++++++++++++++++++++++++++++++++++++++
x86_64/interrupt.S | 4 +-
x86_64/ldscript | 28 ++++--
x86_64/locore.S | 4 +-
10 files changed, 264 insertions(+), 21 deletions(-)
create mode 100644 x86_64/boothdr.S
diff --git a/configure.ac b/configure.ac
index 019842db..3aaa935c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -56,8 +56,7 @@ case $host_platform:$host_cpu in
default:i?86)
host_platform=at;;
default:x86_64)]
- AC_MSG_WARN([Platform set to Xen by default, this can not boot on non-Xen
systems, you currently need a 32bit build for that.])
- [host_platform=xen;;
+ [host_platform=at;;
at:i?86 | xen:i?86 | at:x86_64 | xen:x86_64)
:;;
*)]
diff --git a/i386/configfrag.ac b/i386/configfrag.ac
index f697e277..f07a98ca 100644
--- a/i386/configfrag.ac
+++ b/i386/configfrag.ac
@@ -106,6 +106,8 @@ AC_ARG_ENABLE([apic],
enable_pae=${enable_pae-yes};;
*:i?86)
:;;
+ *:x86_64)
+ enable_pae=${enable_pae-yes};;
*)
if [ x"$enable_pae" = xyes ]; then]
AC_MSG_ERROR([can only enable the `PAE' feature on ix86.])
diff --git a/i386/i386/vm_param.h b/i386/i386/vm_param.h
index edd9522c..c00c05b2 100644
--- a/i386/i386/vm_param.h
+++ b/i386/i386/vm_param.h
@@ -36,7 +36,7 @@
* for better trace support in kdb; the _START symbol has to be offset by the
* same amount. */
#ifdef __x86_64__
-#define VM_MIN_KERNEL_ADDRESS 0x40000000UL
+#define VM_MIN_KERNEL_ADDRESS KERNEL_MAP_BASE
#else
#define VM_MIN_KERNEL_ADDRESS 0xC0000000UL
#endif
@@ -73,7 +73,7 @@
/* This is the kernel address range in linear addresses. */
#ifdef __x86_64__
#define LINEAR_MIN_KERNEL_ADDRESS VM_MIN_KERNEL_ADDRESS
-#define LINEAR_MAX_KERNEL_ADDRESS (0x00007fffffffffffUL)
+#define LINEAR_MAX_KERNEL_ADDRESS (0xffffffffffffffffUL)
#else
/* On x86, the kernel virtual address space is actually located
at high linear addresses. */
diff --git a/i386/intel/pmap.c b/i386/intel/pmap.c
index 3bf00659..91835b30 100644
--- a/i386/intel/pmap.c
+++ b/i386/intel/pmap.c
@@ -655,7 +655,7 @@ void pmap_bootstrap(void)
pa_to_pte(_kvtophys((void *) kernel_page_dir
+ i * INTEL_PGBYTES))
| INTEL_PTE_VALID
-#ifdef MACH_PV_PAGETABLES
+#if defined(MACH_PV_PAGETABLES) || defined(__x86_64__)
| INTEL_PTE_WRITE
#endif
);
@@ -1297,7 +1297,7 @@ pmap_t pmap_create(vm_size_t size)
WRITE_PTE(&p->pdpbase[i],
pa_to_pte(kvtophys((vm_offset_t) page_dir[i]))
| INTEL_PTE_VALID
-#ifdef MACH_PV_PAGETABLES
+#if defined(MACH_PV_PAGETABLES) || defined(__x86_64__)
| INTEL_PTE_WRITE
#endif
);
diff --git a/i386/intel/pmap.h b/i386/intel/pmap.h
index f24b3a71..d9222e95 100644
--- a/i386/intel/pmap.h
+++ b/i386/intel/pmap.h
@@ -156,7 +156,11 @@ typedef phys_addr_t pt_entry_t;
#endif /* MACH_PV_PAGETABLES */
#define INTEL_PTE_WIRED 0x00000200
#ifdef PAE
+#ifdef __x86_64__
+#define INTEL_PTE_PFN 0xfffffffffffff000ULL
+#else /* __x86_64__ */
#define INTEL_PTE_PFN 0x00007ffffffff000ULL
+#endif/* __x86_64__ */
#else
#define INTEL_PTE_PFN 0xfffff000
#endif
diff --git a/x86_64/Makefrag.am b/x86_64/Makefrag.am
index 40b50bc9..5da734de 100644
--- a/x86_64/Makefrag.am
+++ b/x86_64/Makefrag.am
@@ -207,11 +207,27 @@ nodist_libkernel_a_SOURCES += \
EXTRA_DIST += \
x86_64/ldscript
+
if PLATFORM_at
+# This should probably be 0xffffffff80000000 for mcmodel=kernel, but let's try
+# to stay in the first 8G first, otherwise we have to fix the pmap module to
+# actually use the l4 page level
+#KERNEL_MAP_BASE=0x100000000
+# but for now try with < 4G, otherwise we have linker errors
+KERNEL_MAP_BASE=0x40000000
gnumach_LINKFLAGS += \
--defsym _START_MAP=$(_START_MAP) \
- --defsym _START=_START_MAP+0x40000000 \
+ --defsym _START=_START_MAP \
+ --defsym KERNEL_MAP_BASE=$(KERNEL_MAP_BASE) \
-T '$(srcdir)'/x86_64/ldscript
+
+AM_CFLAGS += -D_START_MAP=$(_START_MAP) \
+ -DKERNEL_MAP_BASE=$(KERNEL_MAP_BASE)
+AM_CCASFLAGS += -D_START_MAP=$(_START_MAP) \
+ -DKERNEL_MAP_BASE=$(KERNEL_MAP_BASE)
+
+AM_CCASFLAGS += \
+ -Ii386
endif
AM_CPPFLAGS += \
diff --git a/x86_64/boothdr.S b/x86_64/boothdr.S
new file mode 100644
index 00000000..3375c6c9
--- /dev/null
+++ b/x86_64/boothdr.S
@@ -0,0 +1,214 @@
+
+#include <mach/machine/asm.h>
+
+#include <i386/i386asm.h>
+ /*
+ * This section will be put first into .boot. See also x86_64/ldscript.
+ */
+ .section .boot.text,"ax"
+ .globl boot_start
+
+ /* We should never be entered this way. */
+ .code32
+boot_start:
+ jmp boot_entry
+
+ /* MultiBoot header - see multiboot.h. */
+#define MULTIBOOT_MAGIC 0x1BADB002
+#ifdef __ELF__
+#define MULTIBOOT_FLAGS 0x00000003
+#else /* __ELF__ */
+#define MULTIBOOT_FLAGS 0x00010003
+#endif /* __ELF__ */
+ P2ALIGN(2)
+boot_hdr:
+ .long MULTIBOOT_MAGIC
+ .long MULTIBOOT_FLAGS
+ /*
+ * The next item here is the checksum.
+ * XX this works OK until we need at least the 30th bit.
+ */
+ .long - (MULTIBOOT_MAGIC+MULTIBOOT_FLAGS)
+#ifndef __ELF__ /* a.out kludge */
+ .long boot_hdr /* header_addr */
+ .long _start /* load_addr */
+ .long _edata /* load_end_addr */
+ .long _end /* bss_end_addr */
+ .long boot_entry /* entry */
+#endif /* __ELF__ */
+
+boot_entry:
+ /*
+ * Prepare minimal page mapping to jump to 64 bit and to C code.
+ * The first 4GB is identity mapped, and the first 2GB are re-mapped
+ * to high addresses at KERNEL_MAP_BASE
+ */
+
+ movl $p3table,%eax
+ or $0b11,%eax
+ movl %eax,(p4table)
+ /*
+ * Fill 4 entries to cover the whole 32-bit 4GB address space, just to
+ * be sure. Part of it might be remapped later if the kernel is mapped
+ * below 4G.
+ */
+ movl $p2table,%eax
+ or $0b11,%eax
+ movl %eax,(p3table)
+ movl $p2table1,%eax
+ or $0b11,%eax
+ movl %eax,(p3table + 8)
+ movl $p2table2,%eax
+ or $0b11,%eax
+ movl %eax,(p3table + 16)
+ movl $p2table3,%eax
+ or $0b11,%eax
+ movl %eax,(p3table + 24)
+ /* point each page table level two entry to a page */
+ mov $0,%ecx
+.map_p2_table:
+ mov $0x200000,%eax // 2MiB page, should be always available
+ mul %ecx
+ or $0b10000011,%eax
+ mov %eax,p2table(,%ecx,8)
+ inc %ecx
+ cmp $2048,%ecx
+ jne .map_p2_table
+
+ /* KERNEL_MAP_BASE must be aligned to 2GB */
+.kernel_map:
+#if KERNEL_MAP_BASE >= (1U << 39)
+ movl $p3ktable,%eax
+ or $0b11,%eax
+ movl %eax,(p4table + (8 * ((KERNEL_MAP_BASE >> 39) & 0x1FF))) //
entry for 0b111111111 mask
+ movl $p2ktable1,%eax
+ or $0b11,%eax
+ movl %eax,(p3ktable + (8 * ((KERNEL_MAP_BASE >> 30) & 0x1FF) )) //
entry for 0b111111110 mask
+ movl $p2ktable2,%eax
+ or $0b11,%eax
+ movl %eax,(p3ktable + (8 * (((KERNEL_MAP_BASE >> 30) & 0x1FF) + 1)
)) // entry for 0b111111111 mask
+#else
+ movl $p2ktable1,%eax
+ or $0b11,%eax
+ movl %eax,(p3table + (8 * ((KERNEL_MAP_BASE >> 30) & 0x1FF) ))
+ movl $p2ktable2,%eax
+ or $0b11,%eax
+ movl %eax,(p3table + (8 * (((KERNEL_MAP_BASE >> 30) & 0x1FF) + 1) ))
+#endif
+
+ mov $0,%ecx
+.map_p2k_table:
+ mov $0x200000,%eax // 2MiB page
+ mul %ecx
+ or $0b10000011,%eax
+ mov %eax,p2ktable1(,%ecx,8)
+ inc %ecx
+ cmp $1024,%ecx
+ jne .map_p2k_table
+
+switch64:
+ /*
+ * Jump to 64 bit code, we have to
+ * - enable PAE
+ * - enable long mode
+ * - enable paging by loading the tables filled above in $cr3
+ * - jump to a 64-bit code segment
+ */
+ mov %cr4,%eax
+ or $(1 << 5),%eax // PAE bit
+ mov %eax,%cr4
+ mov $0xC0000080,%ecx // select EFER register
+ rdmsr
+ or $(1 << 8),%eax // long mode enable bit
+ wrmsr
+ mov $p4table,%eax
+ mov %eax,%cr3
+ mov %cr0,%eax
+ or $(1 << 31),%eax // Paging bit
+ or $(1 << 16),%eax // Write-protect enabled also in kernel mode
+ mov %eax,%cr0
+
+ lgdt gdt64pointer
+ movw $0,%ax
+ movw %ax,%ds
+ movw %ax,%es
+ movw %ax,%fs
+ movw %ax,%gs
+ movw $16,%ax
+ movw %ax,%ds
+ movw %ax,%es
+ movw %ax,%ss
+ ljmp $8,$boot_entry64
+
+// .section .text.start
+ .code64
+
+ /* why do we need this? it seems overwritten by linker */
+ .globl _start
+_start:
+
+boot_entry64:
+ /* Switch to our own interrupt stack. */
+ movq $(_intstack+INTSTACK_SIZE),%rax
+ andq $(~15),%rax
+ movq %rax,%rsp
+
+ /* Reset EFLAGS to a known state. */
+ pushq $0
+ popf
+ // save multiboot info
+ movq %rbx,%r8
+
+ /* Fix ifunc entries */
+ movq $__rela_iplt_start,%rsi
+ movq $__rela_iplt_end,%rdi
+iplt_cont:
+ cmpq %rdi,%rsi
+ jae iplt_done
+ movq (%rsi),%rbx /* r_offset */
+ movb 4(%rsi),%al /* info */
+ cmpb $42,%al /* IRELATIVE */
+ jnz iplt_next
+ call *(%ebx) /* call ifunc */
+ movq %rax,(%rbx) /* fixed address */
+iplt_next:
+ addq $8,%rsi
+ jmp iplt_cont
+iplt_done:
+
+ movq %r8,%rdi
+ /* Jump into C code. */
+ call EXT(c_boot_entry)
+ /* not reached */
+ nop
+
+ .section .boot.data
+ .comm _intstack,INTSTACK_SIZE
+
+ .code32
+ .section .boot.data
+ .align 4096
+gdt64:
+ .quad 0
+gdt64code:
+ .quad (1<<44) | (1<<47) | (1<<41) | (1<<43) | (1<<53)
+gdt64data:
+ .quad (1<<44) | (1<<47) | (1<<41)
+gdt64end:
+ .skip (4096 - (gdt64end - gdt64))
+gdt64pointer:
+ // .word gdt64pointer - gdt64 - 1
+ .word gdt64end - gdt64 - 1
+ .quad gdt64
+
+ .section .boot.data
+ .align 4096
+p4table: .space 4096
+p3table: .space 4096
+p2table: .space 4096
+p2table1: .space 4096
+p2table2: .space 4096
+p2table3: .space 4096
+p3ktable: .space 4096
+p2ktable1: .space 4096
+p2ktable2: .space 4096
diff --git a/x86_64/interrupt.S b/x86_64/interrupt.S
index fccf6e28..eab643a5 100644
--- a/x86_64/interrupt.S
+++ b/x86_64/interrupt.S
@@ -41,12 +41,12 @@ ENTRY(interrupt)
movl 8(%esp),%edx /* set irq number as 3rd arg */
movl %edx,%ebx /* copy irq number */
shll $2,%ebx /* irq * 4 */
- movl EXT(iunit)(%ebx),%edi /* get device unit number as 1st arg */
+ movl EXT(iunit)(%rbx),%edi /* get device unit number as 1st arg */
movl %eax, %esi /* previous ipl as 2nd arg */
movq 16(%esp), %rcx /* return address as 4th arg */
movq 24(%esp), %r8 /* address of interrupted registers as
5th arg */
shll $1,%ebx /* irq * 8 */
- call *EXT(ivect)(%ebx) /* call interrupt handler */
+ call *EXT(ivect)(%rbx) /* call interrupt handler */
popq %rdi /* restore previous ipl */
call splx_cli /* restore previous ipl */
diff --git a/x86_64/ldscript b/x86_64/ldscript
index 375e8104..de99795e 100644
--- a/x86_64/ldscript
+++ b/x86_64/ldscript
@@ -2,7 +2,7 @@
OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64",
"elf64-x86-64")
OUTPUT_ARCH(i386:x86-64)
-ENTRY(_start)
+ENTRY(boot_start)
SECTIONS
{
/*
@@ -11,22 +11,30 @@ SECTIONS
* be first in there. See also `i386/i386at/boothdr.S' and
* `gnumach_LINKFLAGS' in `i386/Makefrag.am'.
*/
- . = _START;
- .text :
- AT (_START_MAP)
+
+ . = _START_MAP;
+ .boot :
+ {
+ *(.boot.text)
+ *(.boot.data)
+ } =0x90909090
+
+ . += KERNEL_MAP_BASE;
+ _start = .;
+ .text : AT(((ADDR(.text)) - KERNEL_MAP_BASE))
{
- *(.text.start)
+ *(.text*)
*(.text .stub .text.* .gnu.linkonce.t.*)
*(.text.unlikely .text.*_unlikely)
KEEP (*(.text.*personality*))
/* .gnu.warning sections are handled specially by elf32.em. */
*(.gnu.warning)
} =0x90909090
- .init :
+ .init : AT(((ADDR(.init)) - KERNEL_MAP_BASE))
{
KEEP (*(.init))
} =0x90909090
- .fini :
+ .fini : AT(((ADDR(.fini)) - KERNEL_MAP_BASE))
{
KEEP (*(.fini))
} =0x90909090
@@ -69,7 +77,7 @@ SECTIONS
PROVIDE_HIDDEN (__rela_iplt_end = .);
}
.plt : { *(.plt) *(.iplt) }
- .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+ .rodata : AT(((ADDR(.rodata)) - KERNEL_MAP_BASE)) { *(.rodata
.rodata.* .gnu.linkonce.r.*) }
.rodata1 : { *(.rodata1) }
.eh_frame_hdr : { *(.eh_frame_hdr) }
.eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) }
@@ -139,7 +147,7 @@ SECTIONS
.got : { *(.got) *(.igot) }
. = DATA_SEGMENT_RELRO_END (24, .);
.got.plt : { *(.got.plt) *(.igot.plt) }
- .data :
+ .data : AT(((ADDR(.data)) - KERNEL_MAP_BASE))
{
*(.data .data.* .gnu.linkonce.d.*)
SORT(CONSTRUCTORS)
@@ -147,7 +155,7 @@ SECTIONS
.data1 : { *(.data1) }
_edata = .; PROVIDE (edata = .);
__bss_start = .;
- .bss :
+ .bss : AT(((ADDR(.bss)) - KERNEL_MAP_BASE))
{
*(.dynbss)
*(.bss .bss.* .gnu.linkonce.b.*)
diff --git a/x86_64/locore.S b/x86_64/locore.S
index 612fc493..a7266dab 100644
--- a/x86_64/locore.S
+++ b/x86_64/locore.S
@@ -1136,7 +1136,7 @@ syscall_native:
#endif
shll $5,%eax /* manual indexing of mach_trap_t */
xorq %r10,%r10
- movl EXT(mach_trap_table)(%eax),%r10d
+ mov EXT(mach_trap_table)(%rax),%r10
/* get number of arguments */
andq %r10,%r10
jz mach_call_call /* skip argument copy if none */
@@ -1184,7 +1184,7 @@ mach_call_call:
0:
#endif /* DEBUG */
- call *EXT(mach_trap_table)+8(%eax)
+ call *EXT(mach_trap_table)+8(%rax)
/* call procedure */
movq %rsp,%rcx /* get kernel stack */
or $(KERNEL_STACK_SIZE-1),%rcx
--
2.30.2
- [PATCH 0/6] Add initial support for booting x86_64 from grub, Luca Dariz, 2022/01/28
- [PATCH 4/6] fix console setting from cmdline, Luca Dariz, 2022/01/28
- [PATCH 3/6] fix register corruption in irq on qemu, Luca Dariz, 2022/01/28
- [PATCH 2/6] cleanup multiboot, Luca Dariz, 2022/01/28
- [PATCH 1/6] add support for booting from grub with x86_64,
Luca Dariz <=
- [PATCH 6/6] fix Task State Segment layout for 64 bit, Luca Dariz, 2022/01/28
- [PATCH 5/6] enable user access, Luca Dariz, 2022/01/28