For now, I am using the floppy model: I pretend the final executable is a 1.44M floppy image, with the bootloader, the kernel loader and the kernel itself simply appended to each other on the disk image. The loader I am about to provide also has the added benefit of loading the file one sector at a time, no matter how large it is, so it can load a kernel of arbitrary size.
It sets up a generic page table allowing access to the first 16M of memory and loads 16-bit, 32-bit and 64-bit descriptors into the GDT, using selector 0x28 as the final code segment when it launches into 64-bit mode. It also enables the SSE registers, since a lot of the code generated by my C++ compiler of choice (clang) makes heavy use of them. Why not? That way you can clear 128 bits of memory or more in one instruction instead of 64. An able programmer should be able to adapt this for use on a hard drive and make it a little more dynamic. This file is intended to be loaded by the MBR and executed at 0x6000.
;; ---------------------------------------------------------------------------
;; Second-stage kernel loader.  NASM syntax, flat binary, ORG 0x6000.
;;
;; Flow:
;;   1. Read the first sector of the kernel ELF image and validate it
;;      (magic, 64-bit, little endian).
;;   2. Enable A20 and enter "unreal" mode so FS can address memory above 64K.
;;   3. Copy the section header table to LDR_SHT_BUF and place every section
;;      that has a non-zero address and size at its sh_addr.
;;   4. Build page tables that identity-map the first 16MB with 2MB pages.
;;   5. Fetch the INT 0x15 / E820 memory map.
;;   6. Switch 16 -> 32 -> 64 bit, enable SSE and call the kernel entry point.
;;
;; Kernel hand-off (as set up at longMode):
;;   RSP = RBP = address of the first section placed in memory (stackStart)
;;   RDI       = address just past the last section placed (mallocStart)
;;   [0x7000]  = IDTR image written by SIDT while still in 16 bit mode
;;   [0x7010]  = number of E820 entries (0 if the BIOS call failed)
;;   [0x7012]  = E820 entries, 24 bytes each
;;
;; NOTE(review): the absolute addresses used below ([0x7000], ES:BX, ES:DI)
;; only work if DS = ES = 0 and a usable stack exist on entry; presumably the
;; MBR sets those up -- confirm against the MBR source.
;; ---------------------------------------------------------------------------
%include "loaderconstants.inc"          ; supplies kernelLBAAddr, loaderNumSects

[ORG 0x6000]
[BITS 16]

;; Scratch memory layout
LDR_SECTOR_BUF      equ 0x7000          ; every sector read lands here
LDR_SECTOR_SIZE     equ 512
LDR_SHT_BUF         equ 0x8000          ; copy of the ELF section header table
LDR_E820_COUNT      equ 0x7010          ; word: number of memory map entries
LDR_E820_BUF        equ 0x7012          ; memory map entries, 24 bytes each
LDR_PML4            equ 0x10000
LDR_PDPT            equ 0x11000
LDR_PD              equ 0x12000

;; GDT selectors (see the GDT at the end of the file)
LDR_SEL_CODE32      equ 0x18
LDR_SEL_DATA32      equ 0x20
LDR_SEL_CODE64      equ 0x28
LDR_SEL_DATA64      equ 0x30

;; ELF64 file header offsets
LDR_E_ENTRY         equ 0x18
LDR_E_SHOFF         equ 0x28
LDR_E_SHENTSIZE     equ 0x3A
LDR_E_SHNUM         equ 0x3C

;; ELF64 section header offsets
LDR_SH_TYPE         equ 0x04
LDR_SH_ADDR         equ 0x10
LDR_SH_OFFSET       equ 0x18
LDR_SH_SIZE         equ 0x20
LDR_SH_ENTSIZE      equ 0x40            ; the walk below assumes 64 byte entries
LDR_SHT_NOBITS      equ 8               ; section occupies no space in the file

;; ---------------------------------------------------------------------------
;; Entry point (first byte of the image).  In: DL = BIOS boot drive.
;; ---------------------------------------------------------------------------
        MOV     [bootDrive],DL          ; remember the boot drive

        ;; Read the first sector of the ELF image and check its header
        MOV     EAX,kernelLBAAddr
        CALL    ReadSector

        MOV     EAX,0x464c457f          ; 0x7f 'E' 'L' 'F'
        CMP     EAX,[LDR_SECTOR_BUF]
        JNZ     badELF

        MOV     AX,0x0102               ; EI_CLASS = 2 (64 bit), EI_DATA = 1 (LE)
        CMP     AX,[LDR_SECTOR_BUF + 4]
        JNZ     badELF

        ;; Enable A20.  Assumes a machine whose BIOS supports INT 0x15/0x2401.
        MOV     AX,0x2401
        INT     0x15

        ;; Enter unreal mode; must happen before copyData/copyBytes are used,
        ;; because they address memory through FS with 32 bit offsets.
        CLI
        LGDT    [GDTR]
        MOV     EAX,CR0
        OR      AL,1                    ; set PE (OR is safe even if already set)
        MOV     CR0,EAX
        MOV     BX,LDR_SEL_DATA32
        MOV     FS,BX                   ; FS now caches a 4G limit
        AND     AL,0xFE                 ; clear PE again
        MOV     CR0,EAX
        STI

        ;; Back in 16 bit real mode, with the first 4G reachable through FS
        MOV     BX,[LDR_SECTOR_BUF + LDR_E_SHNUM]
        TEST    BX,BX                   ; no sections: nothing we could load,
        JZ      badELF                  ; and a zero count would wrap the loop
        MOV     EAX,[LDR_SECTOR_BUF + LDR_E_ENTRY]
        MOV     [krnlEntry],EAX         ; low dword only; high dword stays 0

        ;; The section header table is used instead of the program headers so
        ;; that an extra section can live in a separate location, e.g. an x86
        ;; real mode interrupt handler at 0x5000 with the kernel itself at 1MB.
        MOV     ESI,[LDR_SECTOR_BUF + LDR_E_SHOFF]
        MOV     AX,[LDR_SECTOR_BUF + LDR_E_SHENTSIZE]
        MUL     BX                      ; AX = low 16 bits of sizeof(SHT); clobbers DX
        MOVZX   ECX,AX
        MOV     EDI,LDR_SHT_BUF
        CALL    copyData                ; SHT -> LDR_SHT_BUF

        ;; Walk the section headers and place each section in memory
        MOV     CX,BX                   ; CX = number of section headers
        MOV     BX,LDR_SHT_BUF          ; BX -> current section header
elfLoop:
        PUSH    CX
        MOV     EDI,[BX + LDR_SH_ADDR]  ; destination address (low dword)
        MOV     ECX,[BX + LDR_SH_SIZE]  ; size in bytes (low dword)
        TEST    EDI,EDI
        JZ      .elSkipSection
        TEST    ECX,ECX
        JZ      .elSkipSection
        CMP     DWORD [BX + LDR_SH_TYPE],LDR_SHT_NOBITS
        JE      .elPlace                ; no file data, so the offset is irrelevant
        MOV     ESI,[BX + LDR_SH_OFFSET] ; offset into the file (low dword)
        TEST    ESI,ESI
        JZ      .elSkipSection
.elPlace:
        CMP     DWORD [stackStart],0
        JNZ     .itsLoaded
        MOV     [stackStart],EDI        ; first section placed marks the stack top
.itsLoaded:
        CMP     DWORD [BX + LDR_SH_TYPE],LDR_SHT_NOBITS
        JNE     .elFromFile
.elZeroFill:                            ; NOBITS (.bss style): zero, do not read
        MOV     BYTE [FS:EDI],0
        INC     EDI
        DEC     ECX
        JNZ     .elZeroFill
        JMP     .elRecordEnd
.elFromFile:
        CALL    copyData                ; advances EDI past the copied bytes
.elRecordEnd:
        MOV     [mallocStart],EDI       ; end of the most recently placed section
.elSkipSection:
        ADD     BX,LDR_SH_ENTSIZE
        POP     CX
        DEC     CX
        JNZ     elfLoop

;; ---------------------------------------------------------------------------
;; Page tables: identity map the first 16MB with eight 2MB pages.  The rest
;; can be set up inside the kernel; we assume at LEAST 16M of RAM.
;; The tables live above 64K, hence the FS:EDI addressing.
;; ---------------------------------------------------------------------------
buildPageTables:
        ;; Zero all three tables first so that unused entries are not-present
        ;; and the upper dword of every used entry is 0.
        MOV     EDI,LDR_PML4
        XOR     EAX,EAX
        MOV     ECX,(3 * 0x1000) / 4
.clear:
        MOV     [FS:EDI],EAX
        ADD     EDI,4
        DEC     ECX
        JNZ     .clear

        MOV     EDI,LDR_PML4
        MOV     DWORD [FS:EDI],LDR_PDPT | 0x03  ; present + writable
        MOV     EDI,LDR_PDPT
        MOV     DWORD [FS:EDI],LDR_PD | 0x03    ; present + writable

        MOV     EDI,LDR_PD
        MOV     EAX,0x83                ; present + writable + 2MB page, phys 0
        MOV     CX,8                    ; 8 * 2MB = 16MB
.mapPage:
        MOV     [FS:EDI],EAX
        ADD     EAX,0x200000
        ADD     EDI,8
        DEC     CX
        JNZ     .mapPage

;; ---------------------------------------------------------------------------
;; Save the current IDTR at 0x7000 (the original note says this is kept "in
;; preparation for bios calls from the kernel"), then fetch the memory map
;; with INT 0x15, EAX = 0xE820.
;;   inputs : ES:DI -> destination buffer for 24 byte entries
;;   outputs: BP = entry count, stored at LDR_E820_COUNT
;;   trashes all registers except ESI
;; ---------------------------------------------------------------------------
queryMemoryMap:
        SIDT    [LDR_SECTOR_BUF]
        MOV     DI,LDR_E820_BUF
        XOR     EBX,EBX                 ; EBX must be 0 to start
        XOR     BP,BP                   ; entry count
        MOV     EDX,0x0534D4150         ; "SMAP"
        MOV     EAX,0xE820
        MOV     DWORD [ES:DI + 20],1    ; force a valid ACPI 3.X entry
        MOV     ECX,24                  ; ask for 24 bytes
        INT     0x15
        JC      SHORT .failed           ; carry on first call: unsupported function
        MOV     EDX,0x0534D4150         ; some BIOSes apparently trash this register
        CMP     EAX,EDX                 ; on success EAX must come back as "SMAP"
        JNE     SHORT .failed
        TEST    EBX,EBX                 ; EBX = 0: list is one entry long (worthless)
        JE      SHORT .failed
        JMP     SHORT .jmpin
.e820lp:
        MOV     EAX,0xE820              ; EAX, ECX get trashed on every call
        MOV     DWORD [ES:DI + 20],1    ; force a valid ACPI 3.X entry
        MOV     ECX,24
        INT     0x15
        JC      SHORT .e820f            ; carry here: end of list already reached
        MOV     EDX,0x0534D4150         ; repair potentially trashed register
.jmpin:
        JCXZ    .skipent                ; skip any 0 length entries
        CMP     CL,20                   ; got a 24 byte ACPI 3.X response?
        JBE     SHORT .notext
        TEST    BYTE [ES:DI + 20],1     ; if so: is the "ignore this data" bit clear?
        JE      SHORT .skipent
.notext:
        MOV     ECX,[ES:DI + 8]         ; lower dword of region length
        OR      ECX,[ES:DI + 12]        ; OR with upper dword to test for zero
        JZ      .skipent                ; zero length region: skip
        INC     BP                      ; good entry: count it, advance the buffer
        ADD     DI,24
.skipent:
        TEST    EBX,EBX                 ; EBX back to 0: list is complete
        JNE     SHORT .e820lp
.e820f:
        MOV     [LDR_E820_COUNT],BP     ; store the entry count
        CLC                             ; nothing later in this file tests CF
        JMP     LetsGo
.failed:
        MOV     WORD [LDR_E820_COUNT],0 ; report an empty map, not stale buffer bytes
        STC                             ; nothing later in this file tests CF

;; ---------------------------------------------------------------------------
;; Jump from 16 bit to 32 bit to 64 bit, then into the kernel.
;; ---------------------------------------------------------------------------
LetsGo:
        CLI                             ; goodbye interrupts until we are in C++ code
        MOV     EAX,CR0
        OR      AL,1                    ; PE
        MOV     CR0,EAX
        JMP     LDR_SEL_CODE32:mode32

[BITS 32]
mode32:
        MOV     AX,LDR_SEL_DATA32
        MOV     DS,AX

        MOV     DX,0x3F2                ; turn the floppy motor off, its annoying!
        MOV     AL,0x0C
        OUT     DX,AL

        MOV     EAX,10100000b           ; CR4.PAE | CR4.PGE
        MOV     CR4,EAX

        MOV     EDI,LDR_PML4
        MOV     CR3,EDI

        MOV     ECX,0xC0000080          ; EFER MSR
        RDMSR
        OR      EAX,0x00000100          ; LME only; bit 10 (LMA) is a status bit
        WRMSR

        MOV     EBX,CR0                 ; activate long mode by enabling paging
        OR      EBX,0x80000001          ; and protection simultaneously
        MOV     CR0,EBX

        ;; Math coprocessor / SSE support
        MOV     EAX,CR0
        AND     AX,0xFFFB               ; clear CR0.EM
        OR      AX,2                    ; set CR0.MP
        MOV     CR0,EAX
        MOV     EAX,CR4
        OR      AX,3 << 9               ; CR4.OSFXSR | CR4.OSXMMEXCPT
        MOV     CR4,EAX

        JMP     LDR_SEL_CODE64:longMode

[BITS 64]
longMode:
        MOV     AX,LDR_SEL_DATA64
        MOV     DS,AX
        MOV     ES,AX
        MOV     FS,AX
        MOV     GS,AX
        MOV     SS,AX

        MOV     ESP,[stackStart]        ; 32 bit write zero-extends into RSP
        MOV     RAX,[krnlEntry]
        MOV     EDI,[mallocStart]       ; 32 bit write zero-extends into RDI
        MOV     RBP,RSP
        CALL    RAX
.hang:                                  ; the kernel returned: stop here for good
        CLI
        HLT
        JMP     .hang

[BITS 16]

;; ===========================================================================
;; Functions (all 16 bit)
;; ===========================================================================

;; ---------------------------------------------------------------------------
;; copyData: copy ECX bytes, starting ESI bytes into the kernel file, to the
;; linear address EDI.  Sectors are loaded one at a time as needed.
;;   In   : ESI = byte offset into the file, EDI = destination, ECX = count
;;   Out  : EDI = destination + count
;;   Keeps: EAX, EBX, EDX, ESI.   Clobbers: ECX, flags.
;; The dword on top of the stack between .doCopy and .cdDone is the number of
;; bytes still to be copied after the chunk currently in flight.
;; ---------------------------------------------------------------------------
copyData:
        PUSH    EBX
        PUSH    ESI
        PUSH    EAX
        PUSH    EDX
        PUSH    ECX                     ; requested count, popped into EBX below

        ;; Locate the starting sector
        MOV     EAX,ESI
        XOR     EDX,EDX
        MOV     EBX,LDR_SECTOR_SIZE
        DIV     EBX                     ; EAX = sector in file, EDX = offset in sector
        ADD     EAX,kernelLBAAddr
        CALL    ReadSector

        ;; Copy from the first sector
        MOV     ECX,LDR_SECTOR_SIZE
        SUB     ECX,EDX                 ; bytes available from offset to sector end
        POP     EBX                     ; requested byte count
        CMP     EBX,ECX                 ; does it all fit in this one sector?
        JB      .onlyOneNeeded
        SUB     EBX,ECX
        PUSH    EBX                     ; remainder for the loop below
        JMP     .doCopy
.onlyOneNeeded:
        MOV     ECX,EBX                 ; copy just the requested bytes
        PUSH    DWORD 0                 ; nothing left afterwards
.doCopy:
        MOV     ESI,EDX
        ADD     ESI,LDR_SECTOR_BUF
        CALL    copyBytes

.cdSectorLoop:
        POP     ECX                     ; bytes still to copy
        TEST    ECX,ECX
        JZ      .cdDone
        INC     EAX                     ; next sector (ReadSector keeps ECX)
        CALL    ReadSector
        MOV     ESI,LDR_SECTOR_BUF
        CMP     ECX,LDR_SECTOR_SIZE
        JB      .cdLastSector           ; less than one sector left
        SUB     ECX,LDR_SECTOR_SIZE
        PUSH    ECX
        MOV     ECX,LDR_SECTOR_SIZE     ; transfer a whole sector
        CALL    copyBytes
        JMP     .cdSectorLoop
.cdLastSector:
        CALL    copyBytes
.cdDone:
        POP     EDX
        POP     EAX
        POP     ESI
        POP     EBX
        RET

;; ---------------------------------------------------------------------------
;; copyBytes: copy CX bytes from FS:ESI to FS:EDI.
;; Done by hand because a 16 bit REP MOVSB only reaches 64K; through FS this
;; can reach the first 4G.
;;   In   : ESI = source, EDI = destination, CX = count (0 copies nothing)
;;   Out  : ESI, EDI advanced by count; CX = 0
;;   Keeps: AX.   Clobbers: flags.
;; ---------------------------------------------------------------------------
copyBytes:
        JCXZ    .cbDone
        PUSH    AX
.cbLoop:
        MOV     AL,[FS:ESI]
        MOV     [FS:EDI],AL
        INC     ESI
        INC     EDI
        DEC     CX
        JNZ     .cbLoop
        POP     AX
.cbDone:
        RET

;; ---------------------------------------------------------------------------
;; ReadSector: read the sector with LBA address EAX into LDR_SECTOR_BUF.
;; Drives below 0x80 are read with CHS (INT 0x13 AH=0x02) using the 1.44M
;; floppy geometry; drives 0x80 and up use the extended read (AH=0x42).
;;   In   : EAX = LBA
;;   Keeps: all general registers (PUSHAD/POPAD).
;;   On a BIOS error control goes to readError and never returns.
;; ---------------------------------------------------------------------------
ReadSector:
        PUSHAD
        MOV     [currSector],EAX
        CALL    incrementSpinner
        MOV     DL,[bootDrive]
        CMP     DL,0x80
        JAE     .readHDD

        ;; Floppy: no dword support needed
        CALL    LBAtoCHS                ; CH/CL/DH = cylinder/sector/head
        MOV     DL,[bootDrive]          ; LBAtoCHS clobbered DL
        MOV     AX,0x0201               ; AH = 0x02 read, AL = 1 sector
        MOV     BX,LDR_SECTOR_BUF       ; ES:BX = buffer
        INT     0x13
        JC      readError
        JMP     .rsDone
.readHDD:
        MOV     [HDDReadPacket.sector],EAX
        MOV     AX,0x4200               ; AH = 0x42 extended read
        MOV     SI,HDDReadPacket        ; DS:SI = disk address packet
        INT     0x13
        JC      readError
.rsDone:
        POPAD
        RET

;; ---------------------------------------------------------------------------
;; LBAtoCHS: convert an LBA sector number to CHS for a 1.44M floppy.
;;   In   : AX = LBA sector
;;   Out  : CH = cylinder, CL = sector (1 based), DH = head
;;   Clobbers: AX, DL, flags.
;; ---------------------------------------------------------------------------
LBAtoCHS:
        XOR     CX,CX
        XOR     DX,DX
        DIV     WORD [flpSecTrk]        ; AX = LBA / sectors per track, DX = remainder
        INC     DX                      ; sectors are numbered from 1
        MOV     CL,DL
        XOR     DX,DX
        DIV     WORD [flpHds]           ; AX = cylinder, DX = head
        MOV     DH,DL
        MOV     CH,AL
        RET

;; ---------------------------------------------------------------------------
;; incrementSpinner: print the next spinner frame so the user can see that
;; something is happening.  Frame n is the 3 byte string at txtSpinner+3*(n+1);
;; txtSpPos cycles 0,1,2,3,4,0,...
;;   Keeps: all registers.   Clobbers: flags.
;; ---------------------------------------------------------------------------
incrementSpinner:
        PUSH    SI
        PUSH    CX
        MOVZX   CX,BYTE [txtSpPos]
        MOV     SI,CX
        SHL     SI,1
        ADD     SI,CX                   ; SI = 3 * position
        ADD     SI,txtSpinner + 3       ; skip the unused leading triple
        CALL    printString
        INC     CL
        CMP     CL,4
        JBE     .incrementSpinnerOut
        XOR     CL,CL                   ; wrap around after the last frame
.incrementSpinnerOut:
        MOV     [txtSpPos],CL
        POP     CX
        POP     SI
        RET

;; ---------------------------------------------------------------------------
;; printString: print the zero terminated string at DS:SI with INT 0x10,
;; AH = 0x0E (teletype output).
;;   In   : SI -> string
;;   Out  : SI -> byte after the terminator
;;   Keeps: AX, BX, CX.   Clears the direction flag.
;; ---------------------------------------------------------------------------
printString:
        PUSH    AX
        PUSH    BX
        PUSH    CX
        CLD                             ; LODSB must walk forwards
        MOV     AH,0x0E
        XOR     BX,BX
        XOR     CX,CX
.printStringLoop:
        LODSB
        TEST    AL,AL
        JZ      .printStringExit
        INT     0x10
        JMP     .printStringLoop
.printStringExit:
        POP     CX
        POP     BX
        POP     AX
        RET

;; ---------------------------------------------------------------------------
;; Error exits: print a message and stop.  Neither returns.
;; ---------------------------------------------------------------------------
readError:
        MOV     SI,readErrorStr
        CALL    printString
        JMP     haltForever

badELF:
        MOV     SI,badELFStr
        CALL    printString
        JMP     haltForever

;; Stop the CPU.  The loop keeps execution from ever running on into the data
;; below should the processor leave HLT.
haltForever:
        CLI
.again:
        HLT
        JMP     .again

;; ===========================================================================
;; Data
;; ===========================================================================
readErrorStr    db 13,10,13,10,"Disk Read error",0
badELFStr       db 13,10,13,10,"Corrupted ELF Image!",0

;; Spinner frames, 3 bytes each; the first triple is never printed
txtSpinner      db 0,0,0,"/",8,0,"-",8,0,"\",8,0,"|",8,0,".",0
txtSpPos        db 0
bootDrive       db 0
currSector      dd 0                    ; last LBA handed to ReadSector
flpSecTrk       dw 18                   ; 1.44M floppy: sectors per track
flpHds          dw 2                    ; 1.44M floppy: heads
krnlEntry       dq 0
mallocStart     dd 0
stackStart      dd 0

;; Disk address packet for INT 0x13 AH=0x42; only .sector changes per read
HDDReadPacket:
        db      0x10                    ; packet size
        db      0
        dw      1                       ; sectors to read
        dw      0x7000                  ; buffer offset
        dw      0                       ; buffer segment
.sector dq      0                       ; starting LBA

ALIGN 8
GDT:
        dq      0                       ; null descriptor
        ;; 16 bit
        dd      0x0000ffff              ; Code 0x8
        dd      0x00009c00
        dd      0x0000ffff              ; Data 0x10
        dd      0x00009200
        ;; 32 bit segments
        dd      0x0000ffff              ; Code 0x18
        dd      0x00cf9c00
        dd      0x0000ffff              ; Data 0x20
        dd      0x00cf9200
        ;; 64 bit
        dq      0x002f98000000ffff      ; Code 0x28
        dq      0x002f92000000ffff      ; Data 0x30
GDTR:
        dw      (GDTR-GDT)-1
        dd      GDT

;; Pad the image to exactly loaderNumSects sectors
TIMES (512 * (loaderNumSects))-($-$$) DB 90