Commit 968de4f0 authored by Eric W. Biederman's avatar Eric W. Biederman Committed by Andi Kleen

[PATCH] i386: Relocatable kernel support

This patch modifies the i386 kernel so that if CONFIG_RELOCATABLE is
selected it will be able to be loaded at any 4K aligned address below
1G.  The technique used is to compile the decompressor with -fPIC and
modify it so the decompressor is fully relocatable.  For the main
kernel relocations are generated.  Resulting in a kernel that is relocatable
with no runtime overhead and no need to modify the source code.

A reserved 32bit word in the parameters has been assigned
to serve as a stack so we figure out where are running.
Signed-off-by: default avatarEric W. Biederman <ebiederm@xmission.com>
Signed-off-by: default avatarVivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: default avatarAndi Kleen <ak@suse.de>
parent fd593d12
......@@ -773,6 +773,18 @@ config CRASH_DUMP
PHYSICAL_START.
For more details see Documentation/kdump/kdump.txt
config RELOCATABLE
bool "Build a relocatable kernel"
help
This build a kernel image that retains relocation information
so it can be loaded someplace besides the default 1MB.
The relocations tend to the kernel binary about 10% larger,
but are discarded at runtime.
One use is for the kexec on panic case where the recovery kernel
must live at a different physical address than the primary
kernel.
config PHYSICAL_START
hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
......
......@@ -26,7 +26,9 @@ endif
LDFLAGS := -m elf_i386
OBJCOPYFLAGS := -O binary -R .note -R .comment -S
LDFLAGS_vmlinux :=
ifdef CONFIG_RELOCATABLE
LDFLAGS_vmlinux := --emit-relocs
endif
CHECKFLAGS += -D__i386__
CFLAGS += -pipe -msoft-float
......
......@@ -4,22 +4,42 @@
# create a compressed vmlinux image from the original vmlinux
#
targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o
targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o \
vmlinux.bin.all vmlinux.relocs
EXTRA_AFLAGS := -traditional
LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32
LDFLAGS_vmlinux := -T
CFLAGS_misc.o += -fPIC
hostprogs-y := relocs
$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
$(call if_changed,ld)
@:
$(obj)/vmlinux.bin: vmlinux FORCE
$(call if_changed,objcopy)
quiet_cmd_relocs = RELOCS $@
cmd_relocs = $(obj)/relocs $< > $@
$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
$(call if_changed,relocs)
vmlinux.bin.all-y := $(obj)/vmlinux.bin
vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
quiet_cmd_relocbin = BUILD $@
cmd_relocbin = cat $(filter-out FORCE,$^) > $@
$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
$(call if_changed,relocbin)
ifdef CONFIG_RELOCATABLE
$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
$(call if_changed,gzip)
else
$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
$(call if_changed,gzip)
endif
LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
$(obj)/piggy.o: $(src)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
$(call if_changed,ld)
......@@ -25,9 +25,11 @@
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page.h>
.section ".text.head"
.globl startup_32
startup_32:
cld
cli
......@@ -36,93 +38,141 @@ startup_32:
movl %eax,%es
movl %eax,%fs
movl %eax,%gs
movl %eax,%ss
lss stack_start,%esp
xorl %eax,%eax
1: incl %eax # check that A20 really IS enabled
movl %eax,0x000000 # loop forever if it isn't
cmpl %eax,0x100000
je 1b
/* Calculate the delta between where we were compiled to run
* at and where we were actually loaded at. This can only be done
* with a short local call on x86. Nothing else will tell us what
* address we are running at. The reserved chunk of the real-mode
* data at 0x34-0x3f are used as the stack for this calculation.
* Only 4 bytes are needed.
*/
leal 0x40(%esi), %esp
call 1f
1: popl %ebp
subl $1b, %ebp
/* Compute the delta between where we were compiled to run at
* and where the code will actually run at.
*/
/* Start with the delta to where the kernel will run at. If we are
* a relocatable kernel this is the delta to our load address otherwise
* this is the delta to CONFIG_PHYSICAL start.
*/
#ifdef CONFIG_RELOCATABLE
movl %ebp, %ebx
#else
movl $(CONFIG_PHYSICAL_START - startup_32), %ebx
#endif
/* Replace the compressed data size with the uncompressed size */
subl input_len(%ebp), %ebx
movl output_len(%ebp), %eax
addl %eax, %ebx
/* Add 8 bytes for every 32K input block */
shrl $12, %eax
addl %eax, %ebx
/* Add 32K + 18 bytes of extra slack */
addl $(32768 + 18), %ebx
/* Align on a 4K boundary */
addl $4095, %ebx
andl $~4095, %ebx
/* Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
*/
pushl %esi
leal _end(%ebp), %esi
leal _end(%ebx), %edi
movl $(_end - startup_32), %ecx
std
rep
movsb
cld
popl %esi
/* Compute the kernel start address.
*/
#ifdef CONFIG_RELOCATABLE
leal startup_32(%ebp), %ebp
#else
movl $CONFIG_PHYSICAL_START, %ebp
#endif
/*
* Initialize eflags. Some BIOS's leave bits like NT set. This would
* confuse the debugger if this code is traced.
* XXX - best to initialize before switching to protected mode.
* Jump to the relocated address.
*/
pushl $0
popfl
leal relocated(%ebx), %eax
jmp *%eax
.section ".text"
relocated:
/*
* Clear BSS
*/
xorl %eax,%eax
movl $_edata,%edi
movl $_end,%ecx
leal _edata(%ebx),%edi
leal _end(%ebx), %ecx
subl %edi,%ecx
cld
rep
stosb
/*
* Setup the stack for the decompressor
*/
leal stack_end(%ebx), %esp
/*
* Do the decompression, and jump to the new kernel..
*/
subl $16,%esp # place for structure on the stack
movl %esp,%eax
movl output_len(%ebx), %eax
pushl %eax
pushl %ebp # output address
movl input_len(%ebx), %eax
pushl %eax # input_len
leal input_data(%ebx), %eax
pushl %eax # input_data
leal _end(%ebx), %eax
pushl %eax # end of the image as third argument
pushl %esi # real mode pointer as second arg
pushl %eax # address of structure as first arg
call decompress_kernel
orl %eax,%eax
jnz 3f
popl %esi # discard address
popl %esi # real mode pointer
xorl %ebx,%ebx
ljmp $(__BOOT_CS), $CONFIG_PHYSICAL_START
addl $20, %esp
popl %ecx
#if CONFIG_RELOCATABLE
/* Find the address of the relocations.
*/
movl %ebp, %edi
addl %ecx, %edi
/* Calculate the delta between where vmlinux was compiled to run
* and where it was actually loaded.
*/
movl %ebp, %ebx
subl $CONFIG_PHYSICAL_START, %ebx
/*
* We come here, if we were loaded high.
* We need to move the move-in-place routine down to 0x1000
* and then start it with the buffer addresses in registers,
* which we got from the stack.
* Process relocations.
*/
3:
movl $move_routine_start,%esi
movl $0x1000,%edi
movl $move_routine_end,%ecx
subl %esi,%ecx
addl $3,%ecx
shrl $2,%ecx
cld
rep
movsl
popl %esi # discard the address
popl %ebx # real mode pointer
popl %esi # low_buffer_start
popl %ecx # lcount
popl %edx # high_buffer_start
popl %eax # hcount
movl $CONFIG_PHYSICAL_START,%edi
cli # make sure we don't get interrupted
ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine
1: subl $4, %edi
movl 0(%edi), %ecx
testl %ecx, %ecx
jz 2f
addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
jmp 1b
2:
#endif
/*
* Routine (template) for moving the decompressed kernel in place,
* if we were high loaded. This _must_ PIC-code !
* Jump to the decompressed kernel.
*/
move_routine_start:
movl %ecx,%ebp
shrl $2,%ecx
rep
movsl
movl %ebp,%ecx
andl $3,%ecx
rep
movsb
movl %edx,%esi
movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0
addl $3,%ecx
shrl $2,%ecx
rep
movsl
movl %ebx,%esi # Restore setup pointer
xorl %ebx,%ebx
ljmp $(__BOOT_CS), $CONFIG_PHYSICAL_START
move_routine_end:
jmp *%ebp
.bss
.balign 4
stack:
.fill 4096, 1, 0
stack_end:
......@@ -13,6 +13,88 @@
#include <linux/vmalloc.h>
#include <linux/screen_info.h>
#include <asm/io.h>
#include <asm/page.h>
/* WARNING!!
* This code is compiled with -fPIC and it is relocated dynamically
* at run time, but no relocation processing is performed.
* This means that it is not safe to place pointers in static structures.
*/
/*
* Getting to provable safe in place decompression is hard.
* Worst case behaviours need to be analized.
* Background information:
*
* The file layout is:
* magic[2]
* method[1]
* flags[1]
* timestamp[4]
* extraflags[1]
* os[1]
* compressed data blocks[N]
* crc[4] orig_len[4]
*
* resulting in 18 bytes of non compressed data overhead.
*
* Files divided into blocks
* 1 bit (last block flag)
* 2 bits (block type)
*
* 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved.
* The smallest block type encoding is always used.
*
* stored:
* 32 bits length in bytes.
*
* fixed:
* magic fixed tree.
* symbols.
*
* dynamic:
* dynamic tree encoding.
* symbols.
*
*
* The buffer for decompression in place is the length of the
* uncompressed data, plus a small amount extra to keep the algorithm safe.
* The compressed data is placed at the end of the buffer. The output
* pointer is placed at the start of the buffer and the input pointer
* is placed where the compressed data starts. Problems will occur
* when the output pointer overruns the input pointer.
*
* The output pointer can only overrun the input pointer if the input
* pointer is moving faster than the output pointer. A condition only
* triggered by data whose compressed form is larger than the uncompressed
* form.
*
* The worst case at the block level is a growth of the compressed data
* of 5 bytes per 32767 bytes.
*
* The worst case internal to a compressed block is very hard to figure.
* The worst case can at least be boundined by having one bit that represents
* 32764 bytes and then all of the rest of the bytes representing the very
* very last byte.
*
* All of which is enough to compute an amount of extra data that is required
* to be safe. To avoid problems at the block level allocating 5 extra bytes
* per 32767 bytes of data is sufficient. To avoind problems internal to a block
* adding an extra 32767 bytes (the worst case uncompressed block size) is
* sufficient, to ensure that in the worst case the decompressed data for
* block will stop the byte before the compressed data for a block begins.
* To avoid problems with the compressed data's meta information an extra 18
* bytes are needed. Leading to the formula:
*
* extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
*
* Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
* Adding 32768 instead of 32767 just makes for round numbers.
* Adding the decompressor_size is necessary as it musht live after all
* of the data as well. Last I measured the decompressor is about 14K.
* 10K of actuall data and 4K of bss.
*
*/
/*
* gzip declarations
......@@ -29,15 +111,20 @@ typedef unsigned char uch;
typedef unsigned short ush;
typedef unsigned long ulg;
#define WSIZE 0x8000 /* Window size must be at least 32k, */
/* and a power of two */
#define WSIZE 0x80000000 /* Window size must be at least 32k,
* and a power of two
* We don't actually have a window just
* a huge output buffer so I report
* a 2G windows size, as that should
* always be larger than our output buffer.
*/
static uch *inbuf; /* input buffer */
static uch window[WSIZE]; /* Sliding window buffer */
static uch *inbuf; /* input buffer */
static uch *window; /* Sliding window buffer, (and final output buffer) */
static unsigned insize = 0; /* valid bytes in inbuf */
static unsigned inptr = 0; /* index of next byte to be processed in inbuf */
static unsigned outcnt = 0; /* bytes in output buffer */
static unsigned insize; /* valid bytes in inbuf */
static unsigned inptr; /* index of next byte to be processed in inbuf */
static unsigned outcnt; /* bytes in output buffer */
/* gzip flag byte */
#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
......@@ -88,8 +175,6 @@ extern unsigned char input_data[];
extern int input_len;
static long bytes_out = 0;
static uch *output_data;
static unsigned long output_ptr = 0;
static void *malloc(int size);
static void free(void *where);
......@@ -99,17 +184,10 @@ static void *memcpy(void *dest, const void *src, unsigned n);
static void putstr(const char *);
extern int end;
static long free_mem_ptr = (long)&end;
static long free_mem_end_ptr;
static unsigned long free_mem_ptr;
static unsigned long free_mem_end_ptr;
#define INPLACE_MOVE_ROUTINE 0x1000
#define LOW_BUFFER_START 0x2000
#define LOW_BUFFER_MAX 0x90000
#define HEAP_SIZE 0x3000
static unsigned int low_buffer_end, low_buffer_size;
static int high_loaded =0;
static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
static char *vidmem = (char *)0xb8000;
static int vidport;
......@@ -150,7 +228,7 @@ static void gzip_mark(void **ptr)
static void gzip_release(void **ptr)
{
free_mem_ptr = (long) *ptr;
free_mem_ptr = (unsigned long) *ptr;
}
static void scroll(void)
......@@ -178,7 +256,7 @@ static void putstr(const char *s)
y--;
}
} else {
vidmem [ ( x + cols * y ) * 2 ] = c;
vidmem [ ( x + cols * y ) * 2 ] = c;
if ( ++x >= cols ) {
x = 0;
if ( ++y >= lines ) {
......@@ -223,58 +301,31 @@ static void* memcpy(void* dest, const void* src, unsigned n)
*/
static int fill_inbuf(void)
{
if (insize != 0) {
error("ran out of input data");
}
inbuf = input_data;
insize = input_len;
inptr = 1;
return inbuf[0];
error("ran out of input data");
return 0;
}
/* ===========================================================================
* Write the output window window[0..outcnt-1] and update crc and bytes_out.
* (Used for the decompressed data only.)
*/
static void flush_window_low(void)
{
ulg c = crc; /* temporary variable */
unsigned n;
uch *in, *out, ch;
in = window;
out = &output_data[output_ptr];
for (n = 0; n < outcnt; n++) {
ch = *out++ = *in++;
c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
}
crc = c;
bytes_out += (ulg)outcnt;
output_ptr += (ulg)outcnt;
outcnt = 0;
}
static void flush_window_high(void)
{
ulg c = crc; /* temporary variable */
unsigned n;
uch *in, ch;
in = window;
for (n = 0; n < outcnt; n++) {
ch = *output_data++ = *in++;
if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start;
c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
}
crc = c;
bytes_out += (ulg)outcnt;
outcnt = 0;
}
static void flush_window(void)
{
if (high_loaded) flush_window_high();
else flush_window_low();
/* With my window equal to my output buffer
* I only need to compute the crc here.
*/
ulg c = crc; /* temporary variable */
unsigned n;
uch *in, ch;
in = window;
for (n = 0; n < outcnt; n++) {
ch = *in++;
c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
}
crc = c;
bytes_out += (ulg)outcnt;
outcnt = 0;
}
static void error(char *x)
......@@ -286,66 +337,8 @@ static void error(char *x)
while(1); /* Halt */
}
#define STACK_SIZE (4096)
long user_stack [STACK_SIZE];
struct {
long * a;
short b;
} stack_start = { & user_stack [STACK_SIZE] , __BOOT_DS };
static void setup_normal_output_buffer(void)
{
#ifdef STANDARD_MEMORY_BIOS_CALL
if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory");
#else
if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory");
#endif
output_data = (unsigned char *)CONFIG_PHYSICAL_START; /* Normally Points to 1M */
free_mem_end_ptr = (long)real_mode;
}
struct moveparams {
uch *low_buffer_start; int lcount;
uch *high_buffer_start; int hcount;
};
static void setup_output_buffer_if_we_run_high(struct moveparams *mv)
{
high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE);
#ifdef STANDARD_MEMORY_BIOS_CALL
if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");
#else
if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");
#endif
mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START;
low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX
? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff;
low_buffer_size = low_buffer_end - LOW_BUFFER_START;
high_loaded = 1;
free_mem_end_ptr = (long)high_buffer_start;
if ( (CONFIG_PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
high_buffer_start = (uch *)(CONFIG_PHYSICAL_START + low_buffer_size);
mv->hcount = 0; /* say: we need not to move high_buffer */
}
else mv->hcount = -1;
mv->high_buffer_start = high_buffer_start;
}
static void close_output_buffer_if_we_run_high(struct moveparams *mv)
{
if (bytes_out > low_buffer_size) {
mv->lcount = low_buffer_size;
if (mv->hcount)
mv->hcount = bytes_out - low_buffer_size;
} else {
mv->lcount = bytes_out;
mv->hcount = 0;
}
}
asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode)
asmlinkage void decompress_kernel(void *rmode, unsigned long end,
uch *input_data, unsigned long input_len, uch *output)
{
real_mode = rmode;
......@@ -360,13 +353,25 @@ asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode)
lines = RM_SCREEN_INFO.orig_video_lines;
cols = RM_SCREEN_INFO.orig_video_cols;
if (free_mem_ptr < 0x100000) setup_normal_output_buffer();
else setup_output_buffer_if_we_run_high(mv);
window = output; /* Output buffer (Normally at 1M) */
free_mem_ptr = end; /* Heap */
free_mem_end_ptr = end + HEAP_SIZE;
inbuf = input_data; /* Input buffer */
insize = input_len;
inptr = 0;