| | | |
|---|---|---|
| author | Oliver Schinagl <oliver@schinagl.nl> | 2011-04-27 13:13:05 (GMT) |
| committer | Oliver Schinagl <oliver@schinagl.nl> | 2011-04-27 13:13:05 (GMT) |
| commit | cb589e64ddfbc502e8b1189ec7253c43b42cd183 (patch) | |
| tree | a45aa4df23db84c279f39bd2c894ecf6bada0289 /uClinux-2.4.31-uc0/mm | |
| parent | d53ae4b2067e5e7c4f5a0b9a234a89e0582c2e84 (diff) | |
| download | openipcam-cb589e64ddfbc502e8b1189ec7253c43b42cd183.zip openipcam-cb589e64ddfbc502e8b1189ec7253c43b42cd183.tar.gz openipcam-cb589e64ddfbc502e8b1189ec7253c43b42cd183.tar.bz2 | |
linux-2.4.31 with uClinux uc0 pre-patched
Diffstat (limited to 'uClinux-2.4.31-uc0/mm')
| Mode | Path | Lines added |
|---|---|---|
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/Makefile | 21 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/bootmem.c | 359 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/filemap.c | 3406 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/highmem.c | 454 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/memory.c | 1504 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/mlock.c | 301 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/mmap.c | 1256 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/mprotect.c | 337 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/mremap.c | 383 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/numa.c | 130 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/oom_kill.c | 298 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/page_alloc.c | 969 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/page_io.c | 120 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/shmem.c | 1753 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/slab.c | 2078 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/swap.c | 185 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/swap_state.c | 231 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/swapfile.c | 1268 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/vmalloc.c | 384 |
| -rw-r--r-- | uClinux-2.4.31-uc0/mm/vmscan.c | 858 |
20 files changed, 16295 insertions, 0 deletions
diff --git a/uClinux-2.4.31-uc0/mm/Makefile b/uClinux-2.4.31-uc0/mm/Makefile new file mode 100644 index 0000000..c379af2 --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/Makefile @@ -0,0 +1,21 @@ +# +# Makefile for the linux memory manager. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +O_TARGET := mm.o + +export-objs := shmem.o filemap.o memory.o page_alloc.o + +obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ + vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ + page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ + shmem.o + +obj-$(CONFIG_HIGHMEM) += highmem.o + +include $(TOPDIR)/Rules.make diff --git a/uClinux-2.4.31-uc0/mm/bootmem.c b/uClinux-2.4.31-uc0/mm/bootmem.c new file mode 100644 index 0000000..a473e25 --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/bootmem.c @@ -0,0 +1,359 @@ +/* + * linux/mm/bootmem.c + * + * Copyright (C) 1999 Ingo Molnar + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * + * simple boot-time physical memory area allocator and + * free memory collector. It's used to deal with reserved + * system memory and memory holes as well. + */ + +#include <linux/mm.h> +#include <linux/kernel_stat.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/mmzone.h> +#include <asm/dma.h> +#include <asm/io.h> + +/* + * Access to this subsystem has to be serialized externally. (this is + * true for the boot process anyway) + */ +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; + +/* return the number of _pages_ that will be allocated for the boot bitmap */ +unsigned long __init bootmem_bootmap_pages (unsigned long pages) +{ + unsigned long mapsize; + + mapsize = (pages+7)/8; + mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; + mapsize >>= PAGE_SHIFT; + + return mapsize; +} + +/* + * Called once to set up the allocator itself. + */ +static unsigned long __init init_bootmem_core (pg_data_t *pgdat, + unsigned long mapstart, unsigned long start, unsigned long end) +{ + bootmem_data_t *bdata = pgdat->bdata; + unsigned long mapsize = ((end - start)+7)/8; + + pgdat->node_next = pgdat_list; + pgdat_list = pgdat; + + mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); + bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); + bdata->node_boot_start = (start << PAGE_SHIFT); + bdata->node_low_pfn = end; + + /* + * Initially all pages are reserved - setup_arch() has to + * register free RAM areas explicitly. + */ + memset(bdata->node_bootmem_map, 0xff, mapsize); + + return mapsize; +} + +/* + * Marks a particular physical memory range as unallocatable. Usable RAM + * might be used for boot-time allocations - or it might get added + * to the free page pool later on. + */ +static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) +{ + unsigned long i; + /* + * round up, partially reserved pages are considered + * fully reserved. 
+ */ + unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE; + unsigned long eidx = (addr + size - bdata->node_boot_start + + PAGE_SIZE-1)/PAGE_SIZE; + unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE; + + if (!size) BUG(); + + if (sidx < 0) + BUG(); + if (eidx < 0) + BUG(); + if (sidx >= eidx) + BUG(); + if ((addr >> PAGE_SHIFT) >= bdata->node_low_pfn) + BUG(); + if (end > bdata->node_low_pfn) + BUG(); + for (i = sidx; i < eidx; i++) + if (test_and_set_bit(i, bdata->node_bootmem_map)) + printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); +} + +static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) +{ + unsigned long i; + unsigned long start; + /* + * round down end of usable mem, partially free pages are + * considered reserved. + */ + unsigned long sidx; + unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE; + unsigned long end = (addr + size)/PAGE_SIZE; + + if (!size) BUG(); + if (end > bdata->node_low_pfn) + BUG(); + + /* + * Round up the beginning of the address. + */ + start = (addr + PAGE_SIZE-1) / PAGE_SIZE; + sidx = start - (bdata->node_boot_start/PAGE_SIZE); + + for (i = sidx; i < eidx; i++) { + if (!test_and_clear_bit(i, bdata->node_bootmem_map)) + BUG(); + } +} + +/* + * We 'merge' subsequent allocations to save space. We might 'lose' + * some fraction of a page if allocations cannot be satisfied due to + * size constraints on boxes where there is physical RAM space + * fragmentation - in these cases * (mostly large memory boxes) this + * is not a problem. + * + * On low memory boxes we get it right in 100% of the cases. + */ + +/* + * alignment has to be a power of 2 value. + */ +static void * __init __alloc_bootmem_core (bootmem_data_t *bdata, + unsigned long size, unsigned long align, unsigned long goal) +{ + unsigned long i, start = 0; + void *ret; + unsigned long offset, remaining_size; + unsigned long areasize, preferred, incr; + unsigned long eidx = bdata->node_low_pfn - (bdata->node_boot_start >> + PAGE_SHIFT); + + if (!size) BUG(); + + if (align & (align-1)) + BUG(); + + offset = 0; + if (align && + (bdata->node_boot_start & (align - 1UL)) != 0) + offset = (align - (bdata->node_boot_start & (align - 1UL))); + offset >>= PAGE_SHIFT; + + /* + * We try to allocate bootmem pages above 'goal' + * first, then we try to allocate lower pages. + */ + if (goal && (goal >= bdata->node_boot_start) && + ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) { + preferred = goal - bdata->node_boot_start; + } else + preferred = 0; + + preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT; + preferred += offset; + areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; + incr = align >> PAGE_SHIFT ? : 1; + +restart_scan: + for (i = preferred; i < eidx; i += incr) { + unsigned long j; + if (test_bit(i, bdata->node_bootmem_map)) + continue; + for (j = i + 1; j < i + areasize; ++j) { + if (j >= eidx) + goto fail_block; + if (test_bit (j, bdata->node_bootmem_map)) + goto fail_block; + } + start = i; + goto found; + fail_block:; + } + if (preferred) { + preferred = offset; + goto restart_scan; + } + return NULL; +found: + if (start >= eidx) + BUG(); + + /* + * Is the next page of the previous allocation-end the start + * of this allocation's buffer? If yes then we can 'merge' + * the previous partial page with this allocation. 
+ */ + if (align <= PAGE_SIZE + && bdata->last_offset && bdata->last_pos+1 == start) { + offset = (bdata->last_offset+align-1) & ~(align-1); + if (offset > PAGE_SIZE) + BUG(); + remaining_size = PAGE_SIZE-offset; + if (size < remaining_size) { + areasize = 0; + // last_pos unchanged + bdata->last_offset = offset+size; + ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + + bdata->node_boot_start); + } else { + remaining_size = size - remaining_size; + areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; + ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + + bdata->node_boot_start); + bdata->last_pos = start+areasize-1; + bdata->last_offset = remaining_size; + } + bdata->last_offset &= ~PAGE_MASK; + } else { + bdata->last_pos = start + areasize - 1; + bdata->last_offset = size & ~PAGE_MASK; + ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); + } + /* + * Reserve the area now: + */ + for (i = start; i < start+areasize; i++) + if (test_and_set_bit(i, bdata->node_bootmem_map)) + BUG(); + memset(ret, 0, size); + return ret; +} + +static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) +{ + struct page *page = pgdat->node_mem_map; + bootmem_data_t *bdata = pgdat->bdata; + unsigned long i, count, total = 0; + unsigned long idx; + + if (!bdata->node_bootmem_map) BUG(); + + count = 0; + idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); + for (i = 0; i < idx; i++, page++) { + if (!test_bit(i, bdata->node_bootmem_map)) { + count++; + ClearPageReserved(page); + set_page_count(page, 1); + __free_page(page); + } + } + total += count; + + /* + * Now free the allocator bitmap itself, it's not + * needed anymore: + */ + page = virt_to_page(bdata->node_bootmem_map); + count = 0; + for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { + count++; + ClearPageReserved(page); + set_page_count(page, 1); + __free_page(page); + } + total += count; + bdata->node_bootmem_map = NULL; + + return total; +} + +unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) +{ + return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn)); +} + +void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) +{ + reserve_bootmem_core(pgdat->bdata, physaddr, size); +} + +void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) +{ + return(free_bootmem_core(pgdat->bdata, physaddr, size)); +} + +unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) +{ + return(free_all_bootmem_core(pgdat)); +} + +unsigned long __init init_bootmem (unsigned long start, unsigned long pages) +{ + max_low_pfn = pages; + min_low_pfn = start; + return(init_bootmem_core(&contig_page_data, start, 0, pages)); +} + +void __init reserve_bootmem (unsigned long addr, unsigned long size) +{ + reserve_bootmem_core(contig_page_data.bdata, addr, size); +} + +void __init free_bootmem (unsigned long addr, unsigned long size) +{ + return(free_bootmem_core(contig_page_data.bdata, addr, size)); +} + +unsigned long __init free_all_bootmem (void) +{ + return(free_all_bootmem_core(&contig_page_data)); +} + +void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal) +{ + pg_data_t *pgdat; + void *ptr; + + for_each_pgdat(pgdat) + if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, + align, goal))) + return(ptr); + + /* + * Whoops, we cannot satisfy the allocation request. 
+ */ + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) +{ + void *ptr; + + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); + if (ptr) + return (ptr); + + /* + * Whoops, we cannot satisfy the allocation request. + */ + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + diff --git a/uClinux-2.4.31-uc0/mm/filemap.c b/uClinux-2.4.31-uc0/mm/filemap.c new file mode 100644 index 0000000..a31c553 --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/filemap.c @@ -0,0 +1,3406 @@ +/* + * linux/mm/filemap.c + * + * Copyright (C) 1994-1999 Linus Torvalds + */ + +/* + * This file handles the generic file mmap semantics used by + * most "normal" filesystems (but you don't /have/ to use this: + * the NFS filesystem used to do this differently, for example) + */ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/shm.h> +#include <linux/mman.h> +#include <linux/locks.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/smp_lock.h> +#include <linux/blkdev.h> +#include <linux/file.h> +#include <linux/swapctl.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/iobuf.h> + +#include <asm/pgalloc.h> +#include <asm/uaccess.h> +#include <asm/mman.h> + +#include <linux/highmem.h> + +/* + * Shared mappings implemented 30.11.1994. It's not fully working yet, + * though. + * + * Shared mappings now work. 15.8.1995 Bruno. + * + * finished 'unifying' the page and buffer cache and SMP-threaded the + * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> + * + * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> + */ + +unsigned long page_cache_size; +unsigned int page_hash_bits; +struct page **page_hash_table; + +int vm_max_readahead = 31; +int vm_min_readahead = 3; +EXPORT_SYMBOL(vm_max_readahead); +EXPORT_SYMBOL(vm_min_readahead); + + +spinlock_cacheline_t pagecache_lock_cacheline = {SPIN_LOCK_UNLOCKED}; +/* + * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock + * with the pagecache_lock held. 
+ * + * Ordering: + * swap_lock -> + * pagemap_lru_lock -> + * pagecache_lock + */ +spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED}; + +#define CLUSTER_PAGES (1 << page_cluster) +#define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) + +static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p)); +static void fastcall add_page_to_hash_queue(struct page * page, struct page **p) +{ + struct page *next = *p; + + *p = page; + page->next_hash = next; + page->pprev_hash = p; + if (next) + next->pprev_hash = &page->next_hash; + if (page->buffers) + PAGE_BUG(page); + inc_nr_cache_pages(page); +} + +static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page) +{ + struct list_head *head = &mapping->clean_pages; + + mapping->nrpages++; + list_add(&page->list, head); + page->mapping = mapping; +} + +static inline void remove_page_from_inode_queue(struct page * page) +{ + struct address_space * mapping = page->mapping; + + if (mapping->a_ops->removepage) + mapping->a_ops->removepage(page); + + list_del(&page->list); + page->mapping = NULL; + wmb(); + mapping->nrpages--; + if (!mapping->nrpages) + refile_inode(mapping->host); +} + +static inline void remove_page_from_hash_queue(struct page * page) +{ + struct page *next = page->next_hash; + struct page **pprev = page->pprev_hash; + + if (next) + next->pprev_hash = pprev; + *pprev = next; + page->pprev_hash = NULL; + dec_nr_cache_pages(page); +} + +/* + * Remove a page from the page cache and free it. Caller has to make + * sure the page is locked and that nobody else uses it - or that usage + * is safe. + */ +void __remove_inode_page(struct page *page) +{ + remove_page_from_inode_queue(page); + remove_page_from_hash_queue(page); +} + +void remove_inode_page(struct page *page) +{ + if (!PageLocked(page)) + PAGE_BUG(page); + + spin_lock(&pagecache_lock); + __remove_inode_page(page); + spin_unlock(&pagecache_lock); +} + +static inline int sync_page(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping && mapping->a_ops && mapping->a_ops->sync_page) + return mapping->a_ops->sync_page(page); + return 0; +} + +/* + * Add a page to the dirty page list. + */ +void fastcall set_page_dirty(struct page *page) +{ + if (!test_and_set_bit(PG_dirty, &page->flags)) { + struct address_space *mapping = page->mapping; + + if (mapping) { + spin_lock(&pagecache_lock); + mapping = page->mapping; + if (mapping) { /* may have been truncated */ + list_del(&page->list); + list_add(&page->list, &mapping->dirty_pages); + } + spin_unlock(&pagecache_lock); + + if (mapping && mapping->host) + mark_inode_dirty_pages(mapping->host); + if (block_dump) + printk(KERN_DEBUG "%s: dirtied page\n", current->comm); + } + } +} + +/** + * invalidate_inode_pages - Invalidate all the unlocked pages of one inode + * @inode: the inode which pages we want to invalidate + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + */ + +void invalidate_inode_pages(struct inode * inode) +{ + struct list_head *head, *curr; + struct page * page; + + head = &inode->i_mapping->clean_pages; + + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + curr = head->next; + + while (curr != head) { + page = list_entry(curr, struct page, list); + curr = curr->next; + + /* We cannot invalidate something in dirty.. 
*/ + if (PageDirty(page)) + continue; + + /* ..or locked */ + if (TryLockPage(page)) + continue; + + if (page->buffers && !try_to_free_buffers(page, 0)) + goto unlock; + + if (page_count(page) != 1) + goto unlock; + + __lru_cache_del(page); + __remove_inode_page(page); + UnlockPage(page); + page_cache_release(page); + continue; +unlock: + UnlockPage(page); + continue; + } + + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); +} + +static int do_flushpage(struct page *page, unsigned long offset) +{ + int (*flushpage) (struct page *, unsigned long); + flushpage = page->mapping->a_ops->flushpage; + if (flushpage) + return (*flushpage)(page, offset); + return block_flushpage(page, offset); +} + +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + if (page->buffers) + do_flushpage(page, partial); +} + +static void truncate_complete_page(struct page *page) +{ + /* Leave it on the LRU if it gets converted into anonymous buffers */ + if (!page->buffers || do_flushpage(page, 0)) + lru_cache_del(page); + + /* + * We remove the page from the page cache _after_ we have + * destroyed all buffer-cache references to it. Otherwise some + * other process might think this inode page is not in the + * page cache and creates a buffer-cache alias to it causing + * all sorts of fun problems ... + */ + ClearPageDirty(page); + ClearPageUptodate(page); + remove_inode_page(page); + page_cache_release(page); +} + +static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *)); +static int fastcall truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial) +{ + struct list_head *curr; + struct page * page; + int unlocked = 0; + + restart: + curr = head->prev; + while (curr != head) { + unsigned long offset; + + page = list_entry(curr, struct page, list); + offset = page->index; + + /* Is one of the pages to truncate? */ + if ((offset >= start) || (*partial && (offset + 1) == start)) { + int failed; + + page_cache_get(page); + failed = TryLockPage(page); + + list_del(head); + if (!failed) + /* Restart after this page */ + list_add_tail(head, curr); + else + /* Restart on this page */ + list_add(head, curr); + + spin_unlock(&pagecache_lock); + unlocked = 1; + + if (!failed) { + if (*partial && (offset + 1) == start) { + truncate_partial_page(page, *partial); + *partial = 0; + } else + truncate_complete_page(page); + + UnlockPage(page); + } else + wait_on_page(page); + + page_cache_release(page); + + if (current->need_resched) { + __set_current_state(TASK_RUNNING); + schedule(); + } + + spin_lock(&pagecache_lock); + goto restart; + } + curr = curr->prev; + } + return unlocked; +} + + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from with to truncate + * + * Truncate the page cache at a set offset, removing the pages + * that are beyond that offset (and zeroing out partial pages). + * If any page is locked we wait for it to become unlocked. 
+ */ +void truncate_inode_pages(struct address_space * mapping, loff_t lstart) +{ + unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); + int unlocked; + + spin_lock(&pagecache_lock); + do { + unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial); + unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial); + unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial); + } while (unlocked); + /* Traversed all three lists without dropping the lock */ + spin_unlock(&pagecache_lock); +} + +static inline int invalidate_this_page2(struct page * page, + struct list_head * curr, + struct list_head * head) +{ + int unlocked = 1; + + /* + * The page is locked and we hold the pagecache_lock as well + * so both page_count(page) and page->buffers stays constant here. + */ + if (page_count(page) == 1 + !!page->buffers) { + /* Restart after this page */ + list_del(head); + list_add_tail(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + truncate_complete_page(page); + } else { + if (page->buffers) { + /* Restart after this page */ + list_del(head); + list_add_tail(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + block_invalidate_page(page); + } else + unlocked = 0; + + ClearPageDirty(page); + ClearPageUptodate(page); + } + + return unlocked; +} + +static int FASTCALL(invalidate_list_pages2(struct list_head *)); +static int fastcall invalidate_list_pages2(struct list_head *head) +{ + struct list_head *curr; + struct page * page; + int unlocked = 0; + + restart: + curr = head->prev; + while (curr != head) { + page = list_entry(curr, struct page, list); + + if (!TryLockPage(page)) { + int __unlocked; + + __unlocked = invalidate_this_page2(page, curr, head); + UnlockPage(page); + unlocked |= __unlocked; + if (!__unlocked) { + curr = curr->prev; + continue; + } + } else { + /* Restart on this page */ + list_del(head); + list_add(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + unlocked = 1; + wait_on_page(page); + } + + page_cache_release(page); + if (current->need_resched) { + __set_current_state(TASK_RUNNING); + schedule(); + } + + spin_lock(&pagecache_lock); + goto restart; + } + return unlocked; +} + +/** + * invalidate_inode_pages2 - Clear all the dirty bits around if it can't + * free the pages because they're mapped. 
+ * @mapping: the address_space which pages we want to invalidate + */ +void invalidate_inode_pages2(struct address_space * mapping) +{ + int unlocked; + + spin_lock(&pagecache_lock); + do { + unlocked = invalidate_list_pages2(&mapping->clean_pages); + unlocked |= invalidate_list_pages2(&mapping->dirty_pages); + unlocked |= invalidate_list_pages2(&mapping->locked_pages); + } while (unlocked); + spin_unlock(&pagecache_lock); +} + +static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page) +{ + goto inside; + + for (;;) { + page = page->next_hash; +inside: + if (!page) + goto not_found; + if (page->mapping != mapping) + continue; + if (page->index == offset) + break; + } + +not_found: + return page; +} + +static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *)) +{ + struct list_head *curr; + struct page *page; + int retval = 0; + + spin_lock(&pagecache_lock); + curr = head->next; + while (curr != head) { + page = list_entry(curr, struct page, list); + curr = curr->next; + if (!page->buffers) + continue; + if (page->index >= end) + continue; + if (page->index < start) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + lock_page(page); + + /* The buffers could have been free'd while we waited for the page lock */ + if (page->buffers) + retval |= fn(page); + + UnlockPage(page); + spin_lock(&pagecache_lock); + curr = page->list.next; + page_cache_release(page); + } + spin_unlock(&pagecache_lock); + + return retval; +} + +/* + * Two-stage data sync: first start the IO, then go back and + * collect the information.. + */ +int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx) +{ + int retval; + + /* writeout dirty buffers on pages from both clean and dirty lists */ + retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page); + + /* now wait for locked buffers on pages from both clean and dirty lists */ + retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page); + + return retval; +} + +/* + * In-memory filesystems have to fail their + * writepage function - and this has to be + * worked around in the VM layer.. + * + * We + * - mark the page dirty again (but do NOT + * add it back to the inode dirty list, as + * that would livelock in fdatasync) + * - activate the page so that the page stealer + * doesn't try to write it out over and over + * again. + */ +int fail_writepage(struct page *page) +{ + /* Only activate on memory-pressure, not fsync.. */ + if (PageLaunder(page)) { + activate_page(page); + SetPageReferenced(page); + } + + /* Set the page dirty again, unlock */ + SetPageDirty(page); + UnlockPage(page); + return 0; +} + +EXPORT_SYMBOL(fail_writepage); + +/** + * filemap_fdatawrite - walk the list of dirty pages of the given address space + * and writepage() each unlocked page (does not wait on locked pages). 
+ * + * @mapping: address space structure to write + * + */ +int filemap_fdatawrite(struct address_space * mapping) +{ + int ret = 0; + int (*writepage)(struct page *) = mapping->a_ops->writepage; + + spin_lock(&pagecache_lock); + + while (!list_empty(&mapping->dirty_pages)) { + struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list); + + list_del(&page->list); + list_add(&page->list, &mapping->locked_pages); + + if (!PageDirty(page)) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + + if (!TryLockPage(page)) { + if (PageDirty(page)) { + int err; + ClearPageDirty(page); + err = writepage(page); + if (err && !ret) + ret = err; + } else + UnlockPage(page); + } + page_cache_release(page); + spin_lock(&pagecache_lock); + } + spin_unlock(&pagecache_lock); + return ret; +} + +/** + * filemap_fdatasync - walk the list of dirty pages of the given address space + * and writepage() all of them. + * + * @mapping: address space structure to write + * + */ +int filemap_fdatasync(struct address_space * mapping) +{ + int ret = 0; + int (*writepage)(struct page *) = mapping->a_ops->writepage; + + spin_lock(&pagecache_lock); + + while (!list_empty(&mapping->dirty_pages)) { + struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list); + + list_del(&page->list); + list_add(&page->list, &mapping->locked_pages); + + if (!PageDirty(page)) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + + lock_page(page); + + if (PageDirty(page)) { + int err; + ClearPageDirty(page); + err = writepage(page); + if (err && !ret) + ret = err; + } else + UnlockPage(page); + + page_cache_release(page); + spin_lock(&pagecache_lock); + } + spin_unlock(&pagecache_lock); + return ret; +} + +/** + * filemap_fdatawait - walk the list of locked pages of the given address space + * and wait for all of them. + * + * @mapping: address space structure to wait for + * + */ +int filemap_fdatawait(struct address_space * mapping) +{ + int ret = 0; + + spin_lock(&pagecache_lock); + + while (!list_empty(&mapping->locked_pages)) { + struct page *page = list_entry(mapping->locked_pages.next, struct page, list); + + list_del(&page->list); + list_add(&page->list, &mapping->clean_pages); + + if (!PageLocked(page)) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + + ___wait_on_page(page); + if (PageError(page)) + ret = -EIO; + + page_cache_release(page); + spin_lock(&pagecache_lock); + } + spin_unlock(&pagecache_lock); + return ret; +} + +/* + * Add a page to the inode page cache. + * + * The caller must have locked the page and + * set all the page flags correctly.. + */ +void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index) +{ + if (!PageLocked(page)) + BUG(); + + page->index = index; + page_cache_get(page); + spin_lock(&pagecache_lock); + add_page_to_inode_queue(mapping, page); + add_page_to_hash_queue(page, page_hash(mapping, index)); + spin_unlock(&pagecache_lock); + + lru_cache_add(page); +} + +/* + * This adds a page to the page cache, starting out as locked, + * owned by us, but unreferenced, not uptodate and with no errors. + */ +static inline void __add_to_page_cache(struct page * page, + struct address_space *mapping, unsigned long offset, + struct page **hash) +{ + /* + * Yes this is inefficient, however it is needed. The problem + * is that we could be adding a page to the swap cache while + * another CPU is also modifying page->flags, so the updates + * really do need to be atomic. 
-- Rik + */ + ClearPageUptodate(page); + ClearPageError(page); + ClearPageDirty(page); + ClearPageReferenced(page); + ClearPageArch1(page); + ClearPageChecked(page); + LockPage(page); + page_cache_get(page); + page->index = offset; + add_page_to_inode_queue(mapping, page); + add_page_to_hash_queue(page, hash); +} + +void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset) +{ + spin_lock(&pagecache_lock); + __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset)); + spin_unlock(&pagecache_lock); + lru_cache_add(page); +} + +int add_to_page_cache_unique(struct page * page, + struct address_space *mapping, unsigned long offset, + struct page **hash) +{ + int err; + struct page *alias; + + spin_lock(&pagecache_lock); + alias = __find_page_nolock(mapping, offset, *hash); + + err = 1; + if (!alias) { + __add_to_page_cache(page,mapping,offset,hash); + err = 0; + } + + spin_unlock(&pagecache_lock); + if (!err) + lru_cache_add(page); + return err; +} + +/* + * This adds the requested page to the page cache if it isn't already there, + * and schedules an I/O to read in its contents from disk. + */ +static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); +static int fastcall page_cache_read(struct file * file, unsigned long offset) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct page **hash = page_hash(mapping, offset); + struct page *page; + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + spin_unlock(&pagecache_lock); + if (page) + return 0; + + page = page_cache_alloc(mapping); + if (!page) + return -ENOMEM; + + if (!add_to_page_cache_unique(page, mapping, offset, hash)) { + int error = mapping->a_ops->readpage(file, page); + page_cache_release(page); + return error; + } + /* + * We arrive here in the unlikely event that someone + * raced with us and added our page to the cache first. + */ + page_cache_release(page); + return 0; +} + +/* + * Read in an entire cluster at once. A cluster is usually a 64k- + * aligned block that includes the page requested in "offset." + */ +static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset, + unsigned long filesize)); +static int fastcall read_cluster_nonblocking(struct file * file, unsigned long offset, + unsigned long filesize) +{ + unsigned long pages = CLUSTER_PAGES; + + offset = CLUSTER_OFFSET(offset); + while ((pages-- > 0) && (offset < filesize)) { + int error = page_cache_read(file, offset); + if (error < 0) + return error; + offset ++; + } + + return 0; +} + +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ +#if BITS_PER_LONG == 32 +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define GOLDEN_RATIO_PRIME 0x9e370001UL +#elif BITS_PER_LONG == 64 +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL +#else +#error Define GOLDEN_RATIO_PRIME for your wordsize. +#endif + +/* + * In order to wait for pages to become available there must be + * waitqueues associated with pages. 
By using a hash table of + * waitqueues where the bucket discipline is to maintain all + * waiters on the same queue and wake all when any of the pages + * become available, and for the woken contexts to check to be + * sure the appropriate page became available, this saves space + * at a cost of "thundering herd" phenomena during rare hash + * collisions. + */ +static inline wait_queue_head_t *page_waitqueue(struct page *page) +{ + const zone_t *zone = page_zone(page); + wait_queue_head_t *wait = zone->wait_table; + unsigned long hash = (unsigned long)page; + +#if BITS_PER_LONG == 64 + /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ + unsigned long n = hash; + n <<= 18; + hash -= n; + n <<= 33; + hash -= n; + n <<= 3; + hash += n; + n <<= 3; + hash -= n; + n <<= 4; + hash += n; + n <<= 2; + hash += n; +#else + /* On some cpus multiply is faster, on others gcc will do shifts */ + hash *= GOLDEN_RATIO_PRIME; +#endif + hash >>= zone->wait_table_shift; + + return &wait[hash]; +} + +/* + * This must be called after every submit_bh with end_io + * callbacks that would result into the blkdev layer waking + * up the page after a queue unplug. + */ +void fastcall wakeup_page_waiters(struct page * page) +{ + wait_queue_head_t * head; + + head = page_waitqueue(page); + if (waitqueue_active(head)) + wake_up(head); +} + +/* + * Wait for a page to get unlocked. + * + * This must be called with the caller "holding" the page, + * ie with increased "page->count" so that the page won't + * go away during the wait.. + * + * The waiting strategy is to get on a waitqueue determined + * by hashing. Waiters will then collide, and the newly woken + * task must then determine whether it was woken for the page + * it really wanted, and go back to sleep on the waitqueue if + * that wasn't it. With the waitqueue semantics, it never leaves + * the waitqueue unless it calls, so the loop moves forward one + * iteration every time there is + * (1) a collision + * and + * (2) one of the colliding pages is woken + * + * This is the thundering herd problem, but it is expected to + * be very rare due to the few pages that are actually being + * waited on at any given time and the quality of the hash function. + */ +void ___wait_on_page(struct page *page) +{ + wait_queue_head_t *waitqueue = page_waitqueue(page); + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(waitqueue, &wait); + do { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!PageLocked(page)) + break; + sync_page(page); + schedule(); + } while (PageLocked(page)); + __set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(waitqueue, &wait); +} + +/* + * unlock_page() is the other half of the story just above + * __wait_on_page(). Here a couple of quick checks are done + * and a couple of flags are set on the page, and then all + * of the waiters for all of the pages in the appropriate + * wait queue are woken. + */ +void fastcall unlock_page(struct page *page) +{ + wait_queue_head_t *waitqueue = page_waitqueue(page); + ClearPageLaunder(page); + smp_mb__before_clear_bit(); + if (!test_and_clear_bit(PG_locked, &(page)->flags)) + BUG(); + smp_mb__after_clear_bit(); + + /* + * Although the default semantics of wake_up() are + * to wake all, here the specific function is used + * to make it even more explicit that a number of + * pages are being waited on here. + */ + if (waitqueue_active(waitqueue)) + wake_up_all(waitqueue); +} + +/* + * Get a lock on the page, assuming we need to sleep + * to get it.. 
+ */ +static void __lock_page(struct page *page) +{ + wait_queue_head_t *waitqueue = page_waitqueue(page); + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue_exclusive(waitqueue, &wait); + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (PageLocked(page)) { + sync_page(page); + schedule(); + } + if (!TryLockPage(page)) + break; + } + __set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(waitqueue, &wait); +} + +/* + * Get an exclusive lock on the page, optimistically + * assuming it's not locked.. + */ +void fastcall lock_page(struct page *page) +{ + if (TryLockPage(page)) + __lock_page(page); +} + +/* + * a rather lightweight function, finding and getting a reference to a + * hashed page atomically. + */ +struct page * __find_get_page(struct address_space *mapping, + unsigned long offset, struct page **hash) +{ + struct page *page; + + /* + * We scan the hash list read-only. Addition to and removal from + * the hash-list needs a held write-lock. + */ + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + if (page) + page_cache_get(page); + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Same as above, but trylock it instead of incrementing the count. + */ +struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) +{ + struct page *page; + struct page **hash = page_hash(mapping, offset); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + if (page) { + if (TryLockPage(page)) + page = NULL; + } + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Must be called with the pagecache lock held, + * will return with it held (but it may be dropped + * during blocking operations.. + */ +static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *)); +static struct page * fastcall __find_lock_page_helper(struct address_space *mapping, + unsigned long offset, struct page *hash) +{ + struct page *page; + + /* + * We scan the hash list read-only. Addition to and removal from + * the hash-list needs a held write-lock. + */ +repeat: + page = __find_page_nolock(mapping, offset, hash); + if (page) { + page_cache_get(page); + if (TryLockPage(page)) { + spin_unlock(&pagecache_lock); + lock_page(page); + spin_lock(&pagecache_lock); + + /* Has the page been re-allocated while we slept? */ + if (page->mapping != mapping || page->index != offset) { + UnlockPage(page); + page_cache_release(page); + goto repeat; + } + } + } + return page; +} + +/* + * Same as the above, but lock the page too, verifying that + * it's still valid once we own it. + */ +struct page * __find_lock_page (struct address_space *mapping, + unsigned long offset, struct page **hash) +{ + struct page *page; + + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, offset, *hash); + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Same as above, but create the page if required.. 
+ */ +struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask) +{ + struct page *page; + struct page **hash = page_hash(mapping, index); + + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, index, *hash); + spin_unlock(&pagecache_lock); + if (!page) { + struct page *newpage = alloc_page(gfp_mask); + if (newpage) { + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, index, *hash); + if (likely(!page)) { + page = newpage; + __add_to_page_cache(page, mapping, index, hash); + newpage = NULL; + } + spin_unlock(&pagecache_lock); + if (newpage == NULL) + lru_cache_add(page); + else + page_cache_release(newpage); + } + } + return page; +} + +/* + * Same as grab_cache_page, but do not wait if the page is unavailable. + * This is intended for speculative data generators, where the data can + * be regenerated if the page couldn't be grabbed. This routine should + * be safe to call while holding the lock for another page. + */ +struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index) +{ + struct page *page, **hash; + + hash = page_hash(mapping, index); + page = __find_get_page(mapping, index, hash); + + if ( page ) { + if ( !TryLockPage(page) ) { + /* Page found and locked */ + /* This test is overly paranoid, but what the heck... */ + if ( unlikely(page->mapping != mapping || page->index != index) ) { + /* Someone reallocated this page under us. */ + UnlockPage(page); + page_cache_release(page); + return NULL; + } else { + return page; + } + } else { + /* Page locked by someone else */ + page_cache_release(page); + return NULL; + } + } + + page = page_cache_alloc(mapping); + if ( unlikely(!page) ) + return NULL; /* Failed to allocate a page */ + + if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) { + /* Someone else grabbed the page already. */ + page_cache_release(page); + return NULL; + } + + return page; +} + +#if 0 +#define PROFILE_READAHEAD +#define DEBUG_READAHEAD +#endif + +/* + * Read-ahead profiling information + * -------------------------------- + * Every PROFILE_MAXREADCOUNT, the following information is written + * to the syslog: + * Percentage of asynchronous read-ahead. + * Average of read-ahead fields context value. + * If DEBUG_READAHEAD is defined, a snapshot of these fields is written + * to the syslog. 
+ */ + +#ifdef PROFILE_READAHEAD + +#define PROFILE_MAXREADCOUNT 1000 + +static unsigned long total_reada; +static unsigned long total_async; +static unsigned long total_ramax; +static unsigned long total_ralen; +static unsigned long total_rawin; + +static void profile_readahead(int async, struct file *filp) +{ + unsigned long flags; + + ++total_reada; + if (async) + ++total_async; + + total_ramax += filp->f_ramax; + total_ralen += filp->f_ralen; + total_rawin += filp->f_rawin; + + if (total_reada > PROFILE_MAXREADCOUNT) { + save_flags(flags); + cli(); + if (!(total_reada > PROFILE_MAXREADCOUNT)) { + restore_flags(flags); + return; + } + + printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n", + total_ramax/total_reada, + total_ralen/total_reada, + total_rawin/total_reada, + (total_async*100)/total_reada); +#ifdef DEBUG_READAHEAD + printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n", + filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend); +#endif + + total_reada = 0; + total_async = 0; + total_ramax = 0; + total_ralen = 0; + total_rawin = 0; + + restore_flags(flags); + } +} +#endif /* defined PROFILE_READAHEAD */ + +/* + * Read-ahead context: + * ------------------- + * The read ahead context fields of the "struct file" are the following: + * - f_raend : position of the first byte after the last page we tried to + * read ahead. + * - f_ramax : current read-ahead maximum size. + * - f_ralen : length of the current IO read block we tried to read-ahead. + * - f_rawin : length of the current read-ahead window. + * if last read-ahead was synchronous then + * f_rawin = f_ralen + * otherwise (was asynchronous) + * f_rawin = previous value of f_ralen + f_ralen + * + * Read-ahead limits: + * ------------------ + * MIN_READAHEAD : minimum read-ahead size when read-ahead. + * MAX_READAHEAD : maximum read-ahead size when read-ahead. + * + * Synchronous read-ahead benefits: + * -------------------------------- + * Using reasonable IO xfer length from peripheral devices increase system + * performances. + * Reasonable means, in this context, not too large but not too small. + * The actual maximum value is: + * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k is CONFIG_READA_SMALL is undefined + * and 32K if defined (4K page size assumed). + * + * Asynchronous read-ahead benefits: + * --------------------------------- + * Overlapping next read request and user process execution increase system + * performance. + * + * Read-ahead risks: + * ----------------- + * We have to guess which further data are needed by the user process. + * If these data are often not really needed, it's bad for system + * performances. + * However, we know that files are often accessed sequentially by + * application programs and it seems that it is possible to have some good + * strategy in that guessing. + * We only try to read-ahead files that seems to be read sequentially. + * + * Asynchronous read-ahead risks: + * ------------------------------ + * In order to maximize overlapping, we must start some asynchronous read + * request from the device, as soon as possible. + * We must be very careful about: + * - The number of effective pending IO read requests. + * ONE seems to be the only reasonable value. + * - The total memory pool usage for the file access stream. + * This maximum memory usage is implicitly 2 IO read chunks: + * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined, + * 64k if defined (4K page size assumed). 
+ */ + +static inline int get_max_readahead(struct inode * inode) +{ + if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)]) + return vm_max_readahead; + return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)]; +} + +static void generic_file_readahead(int reada_ok, + struct file * filp, struct inode * inode, + struct page * page) +{ + unsigned long end_index; + unsigned long index = page->index; + unsigned long max_ahead, ahead; + unsigned long raend; + int max_readahead = get_max_readahead(inode); + + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + + raend = filp->f_raend; + max_ahead = 0; + +/* + * The current page is locked. + * If the current position is inside the previous read IO request, do not + * try to reread previously read ahead pages. + * Otherwise decide or not to read ahead some pages synchronously. + * If we are not going to read ahead, set the read ahead context for this + * page only. + */ + if (PageLocked(page)) { + if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) { + raend = index; + if (raend < end_index) + max_ahead = filp->f_ramax; + filp->f_rawin = 0; + filp->f_ralen = 1; + if (!max_ahead) { + filp->f_raend = index + filp->f_ralen; + filp->f_rawin += filp->f_ralen; + } + } + } +/* + * The current page is not locked. + * If we were reading ahead and, + * if the current max read ahead size is not zero and, + * if the current position is inside the last read-ahead IO request, + * it is the moment to try to read ahead asynchronously. + * We will later force unplug device in order to force asynchronous read IO. + */ + else if (reada_ok && filp->f_ramax && raend >= 1 && + index <= raend && index + filp->f_ralen >= raend) { +/* + * Add ONE page to max_ahead in order to try to have about the same IO max size + * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE. + * Compute the position of the last page we have tried to read in order to + * begin to read ahead just at the next page. + */ + raend -= 1; + if (raend < end_index) + max_ahead = filp->f_ramax + 1; + + if (max_ahead) { + filp->f_rawin = filp->f_ralen; + filp->f_ralen = 0; + reada_ok = 2; + } + } +/* + * Try to read ahead pages. + * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the + * scheduler, will work enough for us to avoid too bad actuals IO requests. + */ + ahead = 0; + while (ahead < max_ahead) { + unsigned long ra_index = raend + ahead + 1; + + if (ra_index >= end_index) + break; + if (page_cache_read(filp, ra_index) < 0) + break; + + ahead++; + } +/* + * If we tried to read ahead some pages, + * If we tried to read ahead asynchronously, + * Try to force unplug of the device in order to start an asynchronous + * read IO request. + * Update the read-ahead context. + * Store the length of the current read-ahead window. + * Double the current max read ahead size. + * That heuristic avoid to do some large IO for files that are not really + * accessed sequentially. + */ + if (ahead) { + filp->f_ralen += ahead; + filp->f_rawin += filp->f_ralen; + filp->f_raend = raend + ahead + 1; + + filp->f_ramax += filp->f_ramax; + + if (filp->f_ramax > max_readahead) + filp->f_ramax = max_readahead; + +#ifdef PROFILE_READAHEAD + profile_readahead((reada_ok == 2), filp); +#endif + } + + return; +} + +/* + * Mark a page as having seen activity. + * + * If it was already so marked, move it to the active queue and drop + * the referenced bit. Otherwise, just mark it for future action.. 
+ */ +void fastcall mark_page_accessed(struct page *page) +{ + if (!PageActive(page) && PageReferenced(page)) { + activate_page(page); + ClearPageReferenced(page); + } else + SetPageReferenced(page); +} + +/* + * This is a generic file read routine, and uses the + * inode->i_op->readpage() function for the actual low-level + * stuff. + * + * This is really ugly. But the goto's actually try to clarify some + * of the logic when it comes to error handling etc. + */ +void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor) +{ + struct address_space *mapping = filp->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + unsigned long index, offset; + struct page *cached_page; + int reada_ok; + int error; + int max_readahead = get_max_readahead(inode); + + cached_page = NULL; + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + +/* + * If the current position is outside the previous read-ahead window, + * we reset the current read-ahead context and set read ahead max to zero + * (will be set to just needed value later), + * otherwise, we assume that the file accesses are sequential enough to + * continue read-ahead. + */ + if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) { + reada_ok = 0; + filp->f_raend = 0; + filp->f_ralen = 0; + filp->f_ramax = 0; + filp->f_rawin = 0; + } else { + reada_ok = 1; + } +/* + * Adjust the current value of read-ahead max. + * If the read operation stay in the first half page, force no readahead. + * Otherwise try to increase read ahead max just enough to do the read request. + * Then, at least MIN_READAHEAD if read ahead is ok, + * and at most MAX_READAHEAD in all cases. + */ + if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) { + filp->f_ramax = 0; + } else { + unsigned long needed; + + needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1; + + if (filp->f_ramax < needed) + filp->f_ramax = needed; + + if (reada_ok && filp->f_ramax < vm_min_readahead) + filp->f_ramax = vm_min_readahead; + if (filp->f_ramax > max_readahead) + filp->f_ramax = max_readahead; + } + + for (;;) { + struct page *page, **hash; + unsigned long end_index, nr, ret; + + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + + if (index > end_index) + break; + nr = PAGE_CACHE_SIZE; + if (index == end_index) { + nr = inode->i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) + break; + } + + nr = nr - offset; + + /* + * Try to find the data in the page cache.. + */ + hash = page_hash(mapping, index); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, index, *hash); + if (!page) + goto no_cached_page; +found_page: + page_cache_get(page); + spin_unlock(&pagecache_lock); + + if (!Page_Uptodate(page)) + goto page_not_up_to_date; + generic_file_readahead(reada_ok, filp, inode, page); +page_ok: + /* If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping->i_mmap_shared != NULL) + flush_dcache_page(page); + + /* + * Mark the page accessed if we read the + * beginning or we just did an lseek. + */ + if (!offset || !filp->f_reada) + mark_page_accessed(page); + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + * + * The actor routine returns how many bytes were actually used.. + * NOTE! 
This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). + */ + ret = actor(desc, page, offset, nr); + offset += ret; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + + page_cache_release(page); + if (ret == nr && desc->count) + continue; + break; + +/* + * Ok, the page was not immediately readable, so let's try to read ahead while we're at it.. + */ +page_not_up_to_date: + generic_file_readahead(reada_ok, filp, inode, page); + + if (Page_Uptodate(page)) + goto page_ok; + + /* Get exclusive access to the page ... */ + lock_page(page); + + /* Did it get unhashed before we got the lock? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + continue; + } + + /* Did somebody else fill it already? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto page_ok; + } + +readpage: + /* ... and start the actual read. The read will unlock the page. */ + error = mapping->a_ops->readpage(filp, page); + + if (!error) { + if (Page_Uptodate(page)) + goto page_ok; + + /* Again, try some read-ahead while waiting for the page to finish.. */ + generic_file_readahead(reada_ok, filp, inode, page); + wait_on_page(page); + if (Page_Uptodate(page)) + goto page_ok; + error = -EIO; + } + + /* UHHUH! A synchronous read error occurred. Report it */ + desc->error = error; + page_cache_release(page); + break; + +no_cached_page: + /* + * Ok, it wasn't cached, so we need to create a new + * page.. + * + * We get here with the page cache lock held. + */ + if (!cached_page) { + spin_unlock(&pagecache_lock); + cached_page = page_cache_alloc(mapping); + if (!cached_page) { + desc->error = -ENOMEM; + break; + } + + /* + * Somebody may have added the page while we + * dropped the page cache lock. Check for that. + */ + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, index, *hash); + if (page) + goto found_page; + } + + /* + * Ok, add the new page to the hash-queues... + */ + page = cached_page; + __add_to_page_cache(page, mapping, index, hash); + spin_unlock(&pagecache_lock); + lru_cache_add(page); + cached_page = NULL; + + goto readpage; + } + + *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + filp->f_reada = 1; + if (cached_page) + page_cache_release(cached_page); + UPDATE_ATIME(inode); +} + +static inline int have_mapping_directIO(struct address_space * mapping) +{ + return mapping->a_ops->direct_IO || mapping->a_ops->direct_fileIO; +} + +/* Switch between old and new directIO formats */ +static inline int do_call_directIO(int rw, struct file *filp, struct kiobuf *iobuf, unsigned long offset, int blocksize) +{ + struct address_space * mapping = filp->f_dentry->d_inode->i_mapping; + + if (mapping->a_ops->direct_fileIO) + return mapping->a_ops->direct_fileIO(rw, filp, iobuf, offset, blocksize); + return mapping->a_ops->direct_IO(rw, mapping->host, iobuf, offset, blocksize); +} + +/* + * i_sem and i_alloc_sem should be held already. i_sem may be dropped + * later once we've mapped the new IO. i_alloc_sem is kept until the IO + * completes. 
+ */ + +static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset) +{ + ssize_t retval, progress; + int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits; + ssize_t iosize; + struct kiobuf * iobuf; + struct address_space * mapping = filp->f_dentry->d_inode->i_mapping; + struct inode * inode = mapping->host; + loff_t size = inode->i_size; + + new_iobuf = 0; + iobuf = filp->f_iobuf; + if (test_and_set_bit(0, &filp->f_iobuf_lock)) { + /* + * A parallel read/write is using the preallocated iobuf + * so just run slow and allocate a new one. + */ + retval = alloc_kiovec(1, &iobuf); + if (retval) + goto out; + new_iobuf = 1; + } + + blocksize = 1 << inode->i_blkbits; + blocksize_bits = inode->i_blkbits; + blocksize_mask = blocksize - 1; + chunk_size = KIO_MAX_ATOMIC_IO << 10; + + retval = -EINVAL; + if ((offset & blocksize_mask) || (count & blocksize_mask) || ((unsigned long) buf & blocksize_mask)) + goto out_free; + if (!have_mapping_directIO(mapping)) + goto out_free; + + if ((rw == READ) && (offset + count > size)) + count = size - offset; + + /* + * Flush to disk exclusively the _data_, metadata must remain + * completly asynchronous or performance will go to /dev/null. + */ + retval = filemap_fdatasync(mapping); + if (retval == 0) + retval = fsync_inode_data_buffers(inode); + if (retval == 0) + retval = filemap_fdatawait(mapping); + if (retval < 0) + goto out_free; + + progress = retval = 0; + while (count > 0) { + iosize = count; + if (iosize > chunk_size) + iosize = chunk_size; + + retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize); + if (retval) + break; + + retval = do_call_directIO(rw, filp, iobuf, (offset+progress) >> blocksize_bits, blocksize); + + if (rw == READ && retval > 0) + mark_dirty_kiobuf(iobuf, retval); + + if (retval >= 0) { + count -= retval; + buf += retval; + /* warning: weird semantics here, we're reporting a read behind the end of the file */ + progress += retval; + } + + unmap_kiobuf(iobuf); + + if (retval != iosize) + break; + } + + if (progress) + retval = progress; + + out_free: + if (!new_iobuf) + clear_bit(0, &filp->f_iobuf_lock); + else + free_kiovec(1, &iobuf); + out: + return retval; +} + +int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) +{ + char *kaddr; + unsigned long left, count = desc->count; + + if (size > count) + size = count; + + kaddr = kmap(page); + left = __copy_to_user(desc->buf, kaddr + offset, size); + kunmap(page); + + if (left) { + size -= left; + desc->error = -EFAULT; + } + desc->count = count - size; + desc->written += size; + desc->buf += size; + return size; +} + +inline ssize_t do_generic_direct_read(struct file * filp, char * buf, size_t count, loff_t *ppos) +{ + ssize_t retval; + loff_t pos = *ppos; + + retval = generic_file_direct_IO(READ, filp, buf, count, pos); + if (retval > 0) + *ppos = pos + retval; + return retval; +} + +/* + * This is the "read()" routine for all filesystems + * that can use the page cache directly. 
+ */ +ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos) +{ + ssize_t retval; + + if ((ssize_t) count < 0) + return -EINVAL; + + if (filp->f_flags & O_DIRECT) + goto o_direct; + + retval = -EFAULT; + if (access_ok(VERIFY_WRITE, buf, count)) { + retval = 0; + + if (count) { + read_descriptor_t desc; + + desc.written = 0; + desc.count = count; + desc.buf = buf; + desc.error = 0; + do_generic_file_read(filp, ppos, &desc, file_read_actor); + + retval = desc.written; + if (!retval) + retval = desc.error; + } + } + out: + return retval; + + o_direct: + { + loff_t size; + struct address_space *mapping = filp->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + + retval = 0; + if (!count) + goto out; /* skip atime */ + down_read(&inode->i_alloc_sem); + down(&inode->i_sem); + size = inode->i_size; + if (*ppos < size) + retval = do_generic_direct_read(filp, buf, count, ppos); + up(&inode->i_sem); + up_read(&inode->i_alloc_sem); + UPDATE_ATIME(filp->f_dentry->d_inode); + goto out; + } +} + +static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size) +{ + ssize_t written; + unsigned long count = desc->count; + struct file *file = (struct file *) desc->buf; + + if (size > count) + size = count; + + if (file->f_op->sendpage) { + written = file->f_op->sendpage(file, page, offset, + size, &file->f_pos, size<count); + } else { + char *kaddr; + mm_segment_t old_fs; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + + kaddr = kmap(page); + written = file->f_op->write(file, kaddr + offset, size, &file->f_pos); + kunmap(page); + + set_fs(old_fs); + } + if (written < 0) { + desc->error = written; + written = 0; + } + desc->count = count - written; + desc->written += written; + return written; +} + +static ssize_t common_sendfile(int out_fd, int in_fd, loff_t *offset, size_t count) +{ + ssize_t retval; + struct file * in_file, * out_file; + struct inode * in_inode, * out_inode; + + /* + * Get input file, and verify that it is ok.. + */ + retval = -EBADF; + in_file = fget(in_fd); + if (!in_file) + goto out; + if (!(in_file->f_mode & FMODE_READ)) + goto fput_in; + retval = -EINVAL; + in_inode = in_file->f_dentry->d_inode; + if (!in_inode) + goto fput_in; + if (!in_inode->i_mapping->a_ops->readpage) + goto fput_in; + retval = rw_verify_area(READ, in_file, &in_file->f_pos, count); + if (retval) + goto fput_in; + + /* + * Get output file, and verify that it is ok.. 
+ */ + retval = -EBADF; + out_file = fget(out_fd); + if (!out_file) + goto fput_in; + if (!(out_file->f_mode & FMODE_WRITE)) + goto fput_out; + retval = -EINVAL; + if (!out_file->f_op || !out_file->f_op->write) + goto fput_out; + out_inode = out_file->f_dentry->d_inode; + retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); + if (retval) + goto fput_out; + + retval = 0; + if (count) { + read_descriptor_t desc; + + if (!offset) + offset = &in_file->f_pos; + + desc.written = 0; + desc.count = count; + desc.buf = (char *) out_file; + desc.error = 0; + do_generic_file_read(in_file, offset, &desc, file_send_actor); + + retval = desc.written; + if (!retval) + retval = desc.error; + } + +fput_out: + fput(out_file); +fput_in: + fput(in_file); +out: + return retval; +} + +asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count) +{ + loff_t pos, *ppos = NULL; + ssize_t ret; + if (offset) { + off_t off; + if (unlikely(get_user(off, offset))) + return -EFAULT; + pos = off; + ppos = &pos; + } + ret = common_sendfile(out_fd, in_fd, ppos, count); + if (offset) + put_user((off_t)pos, offset); + return ret; +} + +asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t *offset, size_t count) +{ + loff_t pos, *ppos = NULL; + ssize_t ret; + if (offset) { + if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) + return -EFAULT; + ppos = &pos; + } + ret = common_sendfile(out_fd, in_fd, ppos, count); + if (offset) + put_user(pos, offset); + return ret; +} + +static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + unsigned long max; + + if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) + return -EINVAL; + + /* Limit it to the size of the file.. */ + max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT; + if (index > max) + return 0; + max -= index; + if (nr > max) + nr = max; + + /* And limit it to a sane percentage of the inactive list.. */ + max = (nr_free_pages() + nr_inactive_pages) / 2; + if (nr > max) + nr = max; + + while (nr) { + page_cache_read(file, index); + index++; + nr--; + } + return 0; +} + +asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +{ + ssize_t ret; + struct file *file; + + ret = -EBADF; + file = fget(fd); + if (file) { + if (file->f_mode & FMODE_READ) { + unsigned long start = offset >> PAGE_CACHE_SHIFT; + unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT; + ret = do_readahead(file, start, len); + } + fput(file); + } + return ret; +} + +/* + * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are + * sure this is sequential access, we don't need a flexible read-ahead + * window size -- we can always use a large fixed size window. + */ +static void nopage_sequential_readahead(struct vm_area_struct * vma, + unsigned long pgoff, unsigned long filesize) +{ + unsigned long ra_window; + + ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode); + ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1); + + /* vm_raend is zero if we haven't read ahead in this area yet. */ + if (vma->vm_raend == 0) + vma->vm_raend = vma->vm_pgoff + ra_window; + + /* + * If we've just faulted the page half-way through our window, + * then schedule reads for the next window, and release the + * pages in the previous window. 
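Editorial note: sys_sendfile()/sys_sendfile64() above reuse do_generic_file_read() with file_send_actor() as the actor, so the data never passes through a userspace buffer. A small sketch of calling it from userspace; the helper name is hypothetical, and under this 2.4 code the output descriptor may be a socket or any file whose f_op provides a write method.

#include <sys/sendfile.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/* copy the whole of in_path to out_fd using sendfile(2) */
static int send_whole_file(int out_fd, const char *in_path)
{
	struct stat st;
	off_t off = 0;
	int in_fd = open(in_path, O_RDONLY);

	if (in_fd < 0)
		return -1;
	if (fstat(in_fd, &st) < 0) {
		close(in_fd);
		return -1;
	}

	while (off < st.st_size) {
		ssize_t n = sendfile(out_fd, in_fd, &off, st.st_size - off);
		if (n <= 0)
			break;          /* error, or nothing left to send */
	}

	close(in_fd);
	return off == st.st_size ? 0 : -1;
}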
+ */ + if ((pgoff + (ra_window >> 1)) == vma->vm_raend) { + unsigned long start = vma->vm_pgoff + vma->vm_raend; + unsigned long end = start + ra_window; + + if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff)) + end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff; + if (start > end) + return; + + while ((start < end) && (start < filesize)) { + if (read_cluster_nonblocking(vma->vm_file, + start, filesize) < 0) + break; + start += CLUSTER_PAGES; + } + run_task_queue(&tq_disk); + + /* if we're far enough past the beginning of this area, + recycle pages that are in the previous window. */ + if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) { + unsigned long window = ra_window << PAGE_SHIFT; + + end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT); + end -= window + window; + filemap_sync(vma, end - window, window, MS_INVALIDATE); + } + + vma->vm_raend += ra_window; + } + + return; +} + +/* + * filemap_nopage() is invoked via the vma operations vector for a + * mapped memory region to read in file data during a page fault. + * + * The goto's are kind of ugly, but this streamlines the normal case of having + * it in the page cache, and handles the special cases reasonably without + * having a lot of duplicated code. + */ +struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused) +{ + int error; + struct file *file = area->vm_file; + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + struct page *page, **hash; + unsigned long size, pgoff, endoff; + + pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + +retry_all: + /* + * An external ptracer can access pages that normally aren't + * accessible.. + */ + size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if ((pgoff >= size) && (area->vm_mm == current->mm)) + return NULL; + + /* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */ + if (size > endoff) + size = endoff; + + /* + * Do we have something in the page cache already? + */ + hash = page_hash(mapping, pgoff); +retry_find: + page = __find_get_page(mapping, pgoff, hash); + if (!page) + goto no_cached_page; + + /* + * Ok, found a page in the page cache, now we need to check + * that it's up-to-date. + */ + if (!Page_Uptodate(page)) + goto page_not_uptodate; + +success: + /* + * Try read-ahead for sequential areas. + */ + if (VM_SequentialReadHint(area)) + nopage_sequential_readahead(area, pgoff, size); + + /* + * Found the page and have a reference on it, need to check sharing + * and possibly copy it over to another page.. + */ + mark_page_accessed(page); + flush_page_to_ram(page); + return page; + +no_cached_page: + /* + * If the requested offset is within our file, try to read a whole + * cluster of pages at once. + * + * Otherwise, we're off the end of a privately mapped file, + * so we need to map a zero page. + */ + if ((pgoff < size) && !VM_RandomReadHint(area)) + error = read_cluster_nonblocking(file, pgoff, size); + else + error = page_cache_read(file, pgoff); + + /* + * The page we want has now been added to the page cache. + * In the unlikely event that someone removed it in the + * meantime, we'll just come back here and read it again. + */ + if (error >= 0) + goto retry_find; + + /* + * An error return from page_cache_read can result if the + * system is low on memory, or a problem occurs while trying + * to schedule I/O. 
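Editorial note: filemap_nopage() together with nopage_sequential_readahead() services page faults on file mappings; advising MADV_SEQUENTIAL sets VM_SEQ_READ on the area, which enables the fixed-window read-ahead above. A hedged userspace sketch of that access pattern; the function name is hypothetical and the file is assumed non-empty.

#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

static long sum_file(const char *path)
{
	struct stat st;
	unsigned char *p;
	long sum = 0;
	off_t i;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	if (fstat(fd, &st) < 0 || st.st_size == 0) {
		close(fd);
		return -1;
	}

	p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		close(fd);
		return -1;
	}

	/* sets VM_SEQ_READ, so each fault schedules the next read-ahead window */
	madvise(p, st.st_size, MADV_SEQUENTIAL);

	for (i = 0; i < st.st_size; i++)        /* faults serviced by filemap_nopage() */
		sum += p[i];

	munmap(p, st.st_size);
	close(fd);
	return sum;
}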
+ */ + if (error == -ENOMEM) + return NOPAGE_OOM; + return NULL; + +page_not_uptodate: + lock_page(page); + + /* Did it get unhashed while we waited for it? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry_all; + } + + /* Did somebody else get it up-to-date? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto success; + } + + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page(page); + if (Page_Uptodate(page)) + goto success; + } + + /* + * Umm, take care of errors if the page isn't up-to-date. + * Try to re-read it _once_. We do this synchronously, + * because there really aren't any performance issues here + * and we need to check for errors. + */ + lock_page(page); + + /* Somebody truncated the page on us? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry_all; + } + + /* Somebody else successfully read it in? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto success; + } + ClearPageError(page); + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page(page); + if (Page_Uptodate(page)) + goto success; + } + + /* + * Things didn't work out. Return zero to tell the + * mm layer so, possibly freeing the page cache page first. + */ + page_cache_release(page); + return NULL; +} + +/* Called with mm->page_table_lock held to protect against other + * threads/the swapper from ripping pte's out from under us. + */ +static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + pte_t pte = *ptep; + + if (pte_present(pte)) { + struct page *page = pte_page(pte); + if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) { + flush_tlb_page(vma, address); + set_page_dirty(page); + } + } + return 0; +} + +static inline int filemap_sync_pte_range(pmd_t * pmd, + unsigned long address, unsigned long size, + struct vm_area_struct *vma, unsigned long offset, unsigned int flags) +{ + pte_t * pte; + unsigned long end; + int error; + + if (pmd_none(*pmd)) + return 0; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return 0; + } + pte = pte_offset(pmd, address); + offset += address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + error = 0; + do { + error |= filemap_sync_pte(pte, vma, address + offset, flags); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); + return error; +} + +static inline int filemap_sync_pmd_range(pgd_t * pgd, + unsigned long address, unsigned long size, + struct vm_area_struct *vma, unsigned int flags) +{ + pmd_t * pmd; + unsigned long offset, end; + int error; + + if (pgd_none(*pgd)) + return 0; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + return 0; + } + pmd = pmd_offset(pgd, address); + offset = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + error = 0; + do { + error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return error; +} + +int filemap_sync(struct vm_area_struct * vma, unsigned long address, + size_t size, unsigned int flags) +{ + pgd_t * dir; + unsigned long end = address + size; + int error = 0; + + /* Aquire the lock early; it may be possible to avoid dropping + * and reaquiring it repeatedly. 
+ */ + spin_lock(&vma->vm_mm->page_table_lock); + + dir = pgd_offset(vma->vm_mm, address); + flush_cache_range(vma->vm_mm, end - size, end); + if (address >= end) + BUG(); + do { + error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + flush_tlb_range(vma->vm_mm, end - size, end); + + spin_unlock(&vma->vm_mm->page_table_lock); + + return error; +} + +static struct vm_operations_struct generic_file_vm_ops = { + nopage: filemap_nopage, +}; + +/* This is used for a general mmap of a disk file */ + +int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { + if (!mapping->a_ops->writepage) + return -EINVAL; + } + if (!mapping->a_ops->readpage) + return -ENOEXEC; + UPDATE_ATIME(inode); + vma->vm_ops = &generic_file_vm_ops; + return 0; +} + +/* + * The msync() system call. + */ + +/* + * MS_SYNC syncs the entire file - including mappings. + * + * MS_ASYNC initiates writeout of just the dirty mapped data. + * This provides no guarantee of file integrity - things like indirect + * blocks may not have started writeout. MS_ASYNC is primarily useful + * where the application knows that it has finished with the data and + * wishes to intelligently schedule its own I/O traffic. + */ +static int msync_interval(struct vm_area_struct * vma, + unsigned long start, unsigned long end, int flags) +{ + int ret = 0; + struct file * file = vma->vm_file; + + if ( (flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED) ) + return -EBUSY; + + if (file && (vma->vm_flags & VM_SHARED)) { + ret = filemap_sync(vma, start, end-start, flags); + + if (!ret && (flags & (MS_SYNC|MS_ASYNC))) { + struct inode * inode = file->f_dentry->d_inode; + + down(&inode->i_sem); + ret = filemap_fdatasync(inode->i_mapping); + if (flags & MS_SYNC) { + int err; + + if (file->f_op && file->f_op->fsync) { + err = file->f_op->fsync(file, file->f_dentry, 1); + if (err && !ret) + ret = err; + } + err = filemap_fdatawait(inode->i_mapping); + if (err && !ret) + ret = err; + } + up(&inode->i_sem); + } + } + return ret; +} + +asmlinkage long sys_msync(unsigned long start, size_t len, int flags) +{ + unsigned long end; + struct vm_area_struct * vma; + int unmapped_error, error = -EINVAL; + + down_read(¤t->mm->mmap_sem); + if (start & ~PAGE_MASK) + goto out; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + goto out; + if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) + goto out; + if ((flags & MS_ASYNC) && (flags & MS_SYNC)) + goto out; + + error = 0; + if (end == start) + goto out; + /* + * If the interval [start,end) covers some unmapped address ranges, + * just ignore them, but return -ENOMEM at the end. + */ + vma = find_vma(current->mm, start); + unmapped_error = 0; + for (;;) { + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + goto out; + /* Here start < vma->vm_end. */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + } + /* Here vma->vm_start <= start < vma->vm_end. */ + if (end <= vma->vm_end) { + if (start < end) { + error = msync_interval(vma, start, end, flags); + if (error) + goto out; + } + error = unmapped_error; + goto out; + } + /* Here vma->vm_start <= start < vma->vm_end < end. 
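Editorial note: msync_interval() above walks the mapping with filemap_sync() to transfer pte dirty bits onto the pages, then uses filemap_fdatasync()/filemap_fdatawait(), plus the file's fsync method for MS_SYNC, to push them out. A minimal userspace sketch of updating a file through a shared mapping and forcing it to disk; the helper name is hypothetical and the file is assumed to be at least one page long.

#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int update_header(const char *path, const char *tag)
{
	int fd = open(path, O_RDWR);
	char *map;

	if (fd < 0)
		return -1;

	map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		close(fd);
		return -1;
	}

	memcpy(map, tag, strlen(tag));          /* dirties the mapped page */

	/* MS_SYNC: filemap_sync(), writeout, and wait before returning */
	if (msync(map, 4096, MS_SYNC) < 0) {
		munmap(map, 4096);
		close(fd);
		return -1;
	}

	munmap(map, 4096);
	close(fd);
	return 0;
}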
*/ + error = msync_interval(vma, start, vma->vm_end, flags); + if (error) + goto out; + start = vma->vm_end; + vma = vma->vm_next; + } +out: + up_read(¤t->mm->mmap_sem); + return error; +} + +static inline void setup_read_behavior(struct vm_area_struct * vma, + int behavior) +{ + VM_ClearReadHint(vma); + switch(behavior) { + case MADV_SEQUENTIAL: + vma->vm_flags |= VM_SEQ_READ; + break; + case MADV_RANDOM: + vma->vm_flags |= VM_RAND_READ; + break; + default: + break; + } + return; +} + +static long madvise_fixup_start(struct vm_area_struct * vma, + unsigned long end, int behavior) +{ + struct vm_area_struct * n; + struct mm_struct * mm = vma->vm_mm; + + n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!n) + return -EAGAIN; + *n = *vma; + n->vm_end = end; + setup_read_behavior(n, behavior); + n->vm_raend = 0; + if (n->vm_file) + get_file(n->vm_file); + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT; + lock_vma_mappings(vma); + spin_lock(&mm->page_table_lock); + vma->vm_start = end; + __insert_vm_struct(mm, n); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(vma); + return 0; +} + +static long madvise_fixup_end(struct vm_area_struct * vma, + unsigned long start, int behavior) +{ + struct vm_area_struct * n; + struct mm_struct * mm = vma->vm_mm; + + n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!n) + return -EAGAIN; + *n = *vma; + n->vm_start = start; + n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT; + setup_read_behavior(n, behavior); + n->vm_raend = 0; + if (n->vm_file) + get_file(n->vm_file); + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + lock_vma_mappings(vma); + spin_lock(&mm->page_table_lock); + vma->vm_end = start; + __insert_vm_struct(mm, n); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(vma); + return 0; +} + +static long madvise_fixup_middle(struct vm_area_struct * vma, + unsigned long start, unsigned long end, int behavior) +{ + struct vm_area_struct * left, * right; + struct mm_struct * mm = vma->vm_mm; + + left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!left) + return -EAGAIN; + right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!right) { + kmem_cache_free(vm_area_cachep, left); + return -EAGAIN; + } + *left = *vma; + *right = *vma; + left->vm_end = start; + right->vm_start = end; + right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT; + left->vm_raend = 0; + right->vm_raend = 0; + if (vma->vm_file) + atomic_add(2, &vma->vm_file->f_count); + + if (vma->vm_ops && vma->vm_ops->open) { + vma->vm_ops->open(left); + vma->vm_ops->open(right); + } + vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT; + vma->vm_raend = 0; + lock_vma_mappings(vma); + spin_lock(&mm->page_table_lock); + vma->vm_start = start; + vma->vm_end = end; + setup_read_behavior(vma, behavior); + __insert_vm_struct(mm, left); + __insert_vm_struct(mm, right); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(vma); + return 0; +} + +/* + * We can potentially split a vm area into separate + * areas, each area with its own behavior. 
+ */ +static long madvise_behavior(struct vm_area_struct * vma, + unsigned long start, unsigned long end, int behavior) +{ + int error = 0; + + /* This caps the number of vma's this process can own */ + if (vma->vm_mm->map_count > max_map_count) + return -ENOMEM; + + if (start == vma->vm_start) { + if (end == vma->vm_end) { + setup_read_behavior(vma, behavior); + vma->vm_raend = 0; + } else + error = madvise_fixup_start(vma, end, behavior); + } else { + if (end == vma->vm_end) + error = madvise_fixup_end(vma, start, behavior); + else + error = madvise_fixup_middle(vma, start, end, behavior); + } + + return error; +} + +/* + * Schedule all required I/O operations, then run the disk queue + * to make sure they are started. Do not wait for completion. + */ +static long madvise_willneed(struct vm_area_struct * vma, + unsigned long start, unsigned long end) +{ + long error = -EBADF; + struct file * file; + struct inode * inode; + unsigned long size; + + /* Doesn't work if there's no mapped file. */ + if (!vma->vm_file) + return error; + file = vma->vm_file; + inode = file->f_dentry->d_inode; + if (!inode->i_mapping->a_ops->readpage) + return error; + size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (end > vma->vm_end) + end = vma->vm_end; + end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + error = -EIO; + + /* round to cluster boundaries if this isn't a "random" area. */ + if (!VM_RandomReadHint(vma)) { + start = CLUSTER_OFFSET(start); + end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1); + + while ((start < end) && (start < size)) { + error = read_cluster_nonblocking(file, start, size); + start += CLUSTER_PAGES; + if (error < 0) + break; + } + } else { + while ((start < end) && (start < size)) { + error = page_cache_read(file, start); + start++; + if (error < 0) + break; + } + } + + /* Don't wait for someone else to push these requests. */ + run_task_queue(&tq_disk); + + return error; +} + +/* + * Application no longer needs these pages. If the pages are dirty, + * it's OK to just throw them away. The app will be more careful about + * data it wants to keep. Be sure to free swap resources too. The + * zap_page_range call sets things up for refill_inactive to actually free + * these pages later if no one else has touched them in the meantime, + * although we could add these pages to a global reuse list for + * refill_inactive to pick up before reclaiming other pages. + * + * NB: This interface discards data rather than pushes it out to swap, + * as some implementations do. This has performance implications for + * applications like large transactional databases which want to discard + * pages in anonymous maps after committing to backing store the data + * that was kept in them. There is no reason to write this data out to + * the swap area if the application is discarding it. + * + * An interface that causes the system to free clean pages and flush + * dirty pages is already available as msync(MS_INVALIDATE). 
+ */ +static long madvise_dontneed(struct vm_area_struct * vma, + unsigned long start, unsigned long end) +{ + if (vma->vm_flags & VM_LOCKED) + return -EINVAL; + + zap_page_range(vma->vm_mm, start, end - start); + return 0; +} + +static long madvise_vma(struct vm_area_struct * vma, unsigned long start, + unsigned long end, int behavior) +{ + long error = -EBADF; + + switch (behavior) { + case MADV_NORMAL: + case MADV_SEQUENTIAL: + case MADV_RANDOM: + error = madvise_behavior(vma, start, end, behavior); + break; + + case MADV_WILLNEED: + error = madvise_willneed(vma, start, end); + break; + + case MADV_DONTNEED: + error = madvise_dontneed(vma, start, end); + break; + + default: + error = -EINVAL; + break; + } + + return error; +} + +/* + * The madvise(2) system call. + * + * Applications can use madvise() to advise the kernel how it should + * handle paging I/O in this VM area. The idea is to help the kernel + * use appropriate read-ahead and caching techniques. The information + * provided is advisory only, and can be safely disregarded by the + * kernel without affecting the correct operation of the application. + * + * behavior values: + * MADV_NORMAL - the default behavior is to read clusters. This + * results in some read-ahead and read-behind. + * MADV_RANDOM - the system should read the minimum amount of data + * on any access, since it is unlikely that the appli- + * cation will need more than what it asks for. + * MADV_SEQUENTIAL - pages in the given range will probably be accessed + * once, so they can be aggressively read ahead, and + * can be freed soon after they are accessed. + * MADV_WILLNEED - the application is notifying the system to read + * some pages ahead. + * MADV_DONTNEED - the application is finished with the given range, + * so the kernel can free resources associated with it. + * + * return values: + * zero - success + * -EINVAL - start + len < 0, start is not page-aligned, + * "behavior" is not a valid value, or application + * is attempting to release locked or shared pages. + * -ENOMEM - addresses in the specified range are not currently + * mapped, or are outside the AS of the process. + * -EIO - an I/O error occurred while paging in data. + * -EBADF - map exists, but area maps something that isn't a file. + * -EAGAIN - a kernel resource was temporarily unavailable. + */ +asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior) +{ + unsigned long end; + struct vm_area_struct * vma; + int unmapped_error = 0; + int error = -EINVAL; + + down_write(¤t->mm->mmap_sem); + + if (start & ~PAGE_MASK) + goto out; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + goto out; + + error = 0; + if (end == start) + goto out; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + */ + vma = find_vma(current->mm, start); + for (;;) { + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + goto out; + + /* Here start < vma->vm_end. */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + } + + /* Here vma->vm_start <= start < vma->vm_end. */ + if (end <= vma->vm_end) { + if (start < end) { + error = madvise_vma(vma, start, end, + behavior); + if (error) + goto out; + } + error = unmapped_error; + goto out; + } + + /* Here vma->vm_start <= start < vma->vm_end < end. 
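Editorial note: under this implementation MADV_WILLNEED starts non-blocking cluster reads through madvise_willneed(), and MADV_DONTNEED drops the pages immediately with zap_page_range(). A short userspace sketch of that pairing; the mapping is assumed to be page-aligned and file-backed, since madvise_willneed() returns -EBADF for anonymous areas, and the helper name is hypothetical.

#include <sys/mman.h>
#include <unistd.h>

/* "map" is a page-aligned, file-backed mapping of at least "len" bytes */
static void touch_region(unsigned char *map, size_t len)
{
	size_t i;
	volatile unsigned char sink = 0;

	/* queue read-ahead for the whole region, then start using it */
	madvise(map, len, MADV_WILLNEED);

	for (i = 0; i < len; i += getpagesize())
		sink += map[i];

	/* finished with it: let the kernel reclaim these pages right away */
	madvise(map, len, MADV_DONTNEED);
}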
*/ + error = madvise_vma(vma, start, vma->vm_end, behavior); + if (error) + goto out; + start = vma->vm_end; + vma = vma->vm_next; + } + +out: + up_write(¤t->mm->mmap_sem); + return error; +} + +/* + * Later we can get more picky about what "in core" means precisely. + * For now, simply check to see if the page is in the page cache, + * and is up to date; i.e. that no page-in operation would be required + * at this time if an application were to map and access this page. + */ +static unsigned char mincore_page(struct vm_area_struct * vma, + unsigned long pgoff) +{ + unsigned char present = 0; + struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping; + struct page * page, ** hash = page_hash(as, pgoff); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(as, pgoff, *hash); + if ((page) && (Page_Uptodate(page))) + present = 1; + spin_unlock(&pagecache_lock); + + return present; +} + +static long mincore_vma(struct vm_area_struct * vma, + unsigned long start, unsigned long end, unsigned char * vec) +{ + long error, i, remaining; + unsigned char * tmp; + + error = -ENOMEM; + if (!vma->vm_file) + return error; + + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (end > vma->vm_end) + end = vma->vm_end; + end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + error = -EAGAIN; + tmp = (unsigned char *) __get_free_page(GFP_KERNEL); + if (!tmp) + return error; + + /* (end - start) is # of pages, and also # of bytes in "vec */ + remaining = (end - start), + + error = 0; + for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) { + int j = 0; + long thispiece = (remaining < PAGE_SIZE) ? + remaining : PAGE_SIZE; + + while (j < thispiece) + tmp[j++] = mincore_page(vma, start++); + + if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) { + error = -EFAULT; + break; + } + } + + free_page((unsigned long) tmp); + return error; +} + +/* + * The mincore(2) system call. + * + * mincore() returns the memory residency status of the pages in the + * current process's address space specified by [addr, addr + len). + * The status is returned in a vector of bytes. The least significant + * bit of each byte is 1 if the referenced page is in memory, otherwise + * it is zero. + * + * Because the status of a page can change after mincore() checks it + * but before it returns to the application, the returned vector may + * contain stale information. Only locked pages are guaranteed to + * remain in memory. + * + * return values: + * zero - success + * -EFAULT - vec points to an illegal address + * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE, + * or len has a nonpositive value + * -ENOMEM - Addresses in the range [addr, addr + len] are + * invalid for the address space of this process, or + * specify one or more pages which are not currently + * mapped + * -EAGAIN - A kernel resource was temporarily unavailable. + */ +asmlinkage long sys_mincore(unsigned long start, size_t len, + unsigned char * vec) +{ + int index = 0; + unsigned long end; + struct vm_area_struct * vma; + int unmapped_error = 0; + long error = -EINVAL; + + down_read(¤t->mm->mmap_sem); + + if (start & ~PAGE_CACHE_MASK) + goto out; + len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK; + end = start + len; + if (end < start) + goto out; + + error = 0; + if (end == start) + goto out; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + */ + vma = find_vma(current->mm, start); + for (;;) { + /* Still start < end. 
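Editorial note: mincore_page() above only consults the page cache, so the residency report is advisory and can be stale by the time it reaches userspace. A small userspace sketch that counts how many pages of a mapping are currently resident; the helper name is hypothetical and "map" is assumed to be a page-aligned address returned by mmap.

#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>

/* return the number of resident pages in [map, map + len), or -1 */
static long count_resident(void *map, size_t len)
{
	long page = getpagesize();
	size_t pages = (len + page - 1) / page;
	unsigned char *vec = malloc(pages);
	long resident = 0;
	size_t i;

	if (!vec)
		return -1;

	if (mincore(map, len, vec) < 0) {
		free(vec);
		return -1;
	}

	for (i = 0; i < pages; i++)
		if (vec[i] & 1)         /* low bit set: page is in core */
			resident++;

	free(vec);
	return resident;
}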
*/ + error = -ENOMEM; + if (!vma) + goto out; + + /* Here start < vma->vm_end. */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + } + + /* Here vma->vm_start <= start < vma->vm_end. */ + if (end <= vma->vm_end) { + if (start < end) { + error = mincore_vma(vma, start, end, + &vec[index]); + if (error) + goto out; + } + error = unmapped_error; + goto out; + } + + /* Here vma->vm_start <= start < vma->vm_end < end. */ + error = mincore_vma(vma, start, vma->vm_end, &vec[index]); + if (error) + goto out; + index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT; + start = vma->vm_end; + vma = vma->vm_next; + } + +out: + up_read(¤t->mm->mmap_sem); + return error; +} + +static inline +struct page *__read_cache_page(struct address_space *mapping, + unsigned long index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page **hash = page_hash(mapping, index); + struct page *page, *cached_page = NULL; + int err; +repeat: + page = __find_get_page(mapping, index, hash); + if (!page) { + if (!cached_page) { + cached_page = page_cache_alloc(mapping); + if (!cached_page) + return ERR_PTR(-ENOMEM); + } + page = cached_page; + if (add_to_page_cache_unique(page, mapping, index, hash)) + goto repeat; + cached_page = NULL; + err = filler(data, page); + if (err < 0) { + page_cache_release(page); + page = ERR_PTR(err); + } + } + if (cached_page) + page_cache_release(cached_page); + return page; +} + +/* + * Read into the page cache. If a page already exists, + * and Page_Uptodate() is not set, try to fill the page. + */ +struct page *read_cache_page(struct address_space *mapping, + unsigned long index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page *page; + int err; + +retry: + page = __read_cache_page(mapping, index, filler, data); + if (IS_ERR(page)) + goto out; + mark_page_accessed(page); + if (Page_Uptodate(page)) + goto out; + + lock_page(page); + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry; + } + if (Page_Uptodate(page)) { + UnlockPage(page); + goto out; + } + err = filler(data, page); + if (err < 0) { + page_cache_release(page); + page = ERR_PTR(err); + } + out: + return page; +} + +static inline struct page * __grab_cache_page(struct address_space *mapping, + unsigned long index, struct page **cached_page) +{ + struct page *page, **hash = page_hash(mapping, index); +repeat: + page = __find_lock_page(mapping, index, hash); + if (!page) { + if (!*cached_page) { + *cached_page = page_cache_alloc(mapping); + if (!*cached_page) + return NULL; + } + page = *cached_page; + if (add_to_page_cache_unique(page, mapping, index, hash)) + goto repeat; + *cached_page = NULL; + } + return page; +} + +inline void remove_suid(struct inode *inode) +{ + unsigned int mode; + + /* set S_IGID if S_IXGRP is set, and always set S_ISUID */ + mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID; + + /* was any of the uid bits set? */ + mode &= inode->i_mode; + if (mode && !capable(CAP_FSETID)) { + inode->i_mode &= ~mode; + mark_inode_dirty(inode); + } +} + +/* + * precheck_file_write(): + * Check the conditions on a file descriptor prior to beginning a write + * on it. Contains the common precheck code for both buffered and direct + * IO. 
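Editorial note: read_cache_page() above is the usual way for kernel code outside the read(2) path to pull one page of a file into the page cache; callers commonly pass the mapping's own readpage method as the filler. A hedged in-kernel sketch of that calling pattern, with a hypothetical myfs_ helper name and only the essential error handling.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* read page "index" of "file" and return it referenced, or an ERR_PTR value */
static struct page *myfs_get_file_page(struct file *file, unsigned long index)
{
	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
	struct page *page;

	page = read_cache_page(mapping, index,
			(int (*)(void *, struct page *))mapping->a_ops->readpage,
			file);
	if (IS_ERR(page))
		return page;

	wait_on_page(page);             /* readpage may still be in flight */
	if (!Page_Uptodate(page)) {
		page_cache_release(page);
		return ERR_PTR(-EIO);
	}
	return page;                    /* caller does page_cache_release() */
}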
+ */ +int precheck_file_write(struct file *file, struct inode *inode, + size_t *count, loff_t *ppos) +{ + ssize_t err; + unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + loff_t pos = *ppos; + + err = -EINVAL; + if (pos < 0) + goto out; + + err = file->f_error; + if (err) { + file->f_error = 0; + goto out; + } + + /* FIXME: this is for backwards compatibility with 2.4 */ + if (!S_ISBLK(inode->i_mode) && (file->f_flags & O_APPEND)) + *ppos = pos = inode->i_size; + + /* + * Check whether we've reached the file size limit. + */ + err = -EFBIG; + + if (!S_ISBLK(inode->i_mode) && limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (pos > 0xFFFFFFFFULL || *count > limit - (u32)pos) { + /* send_sig(SIGXFSZ, current, 0); */ + *count = limit - (u32)pos; + } + } + + /* + * LFS rule + */ + if ( pos + *count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { + if (pos >= MAX_NON_LFS) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (*count > MAX_NON_LFS - (u32)pos) { + /* send_sig(SIGXFSZ, current, 0); */ + *count = MAX_NON_LFS - (u32)pos; + } + } + + /* + * Are we about to exceed the fs block limit ? + * + * If we have written data it becomes a short write + * If we have exceeded without writing data we send + * a signal and give them an EFBIG. + * + * Linus frestrict idea will clean these up nicely.. + */ + + if (!S_ISBLK(inode->i_mode)) { + if (pos >= inode->i_sb->s_maxbytes) + { + if (*count || pos > inode->i_sb->s_maxbytes) { + send_sig(SIGXFSZ, current, 0); + err = -EFBIG; + goto out; + } + /* zero-length writes at ->s_maxbytes are OK */ + } + + if (pos + *count > inode->i_sb->s_maxbytes) + *count = inode->i_sb->s_maxbytes - pos; + } else { + if (is_read_only(inode->i_rdev)) { + err = -EPERM; + goto out; + } + if (pos >= inode->i_size) { + if (*count || pos > inode->i_size) { + err = -ENOSPC; + goto out; + } + } + + if (pos + *count > inode->i_size) + *count = inode->i_size - pos; + } + + err = 0; +out: + return err; +} + +/* + * Write to a file through the page cache. + * + * We currently put everything into the page cache prior to writing it. + * This is not a problem when writing full pages. With partial pages, + * however, we first have to read the data into the cache, then + * dirty the page, and finally schedule it for writing. Alternatively, we + * could write-through just the portion of data that would go into that + * page, but that would kill performance for applications that write data + * line by line, and it's prone to race conditions. + * + * Note that this routine doesn't try to keep track of dirty pages. Each + * file system has to do this all by itself, unfortunately. + * okir@monad.swb.de + */ +ssize_t +do_generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + loff_t pos; + struct page *page, *cached_page; + ssize_t written; + long status = 0; + ssize_t err; + unsigned bytes; + + cached_page = NULL; + pos = *ppos; + written = 0; + + err = precheck_file_write(file, inode, &count, &pos); + if (err != 0 || count == 0) + goto out; + + remove_suid(inode); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + + do { + unsigned long index, offset; + long page_fault; + char *kaddr; + + /* + * Try to find the page in the cache. If it isn't there, + * allocate a free page. 
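Editorial note: the buffered write loop below hands the real block work to the filesystem through the mapping's prepare_write and commit_write operations. As a hedged sketch, not taken from this tree, of how a simple block-based 2.4 filesystem typically wires those up to the fs/buffer.c helpers: all myfs_ names are hypothetical, and the get_block shown is a toy identity mapping purely for illustration.

#include <linux/fs.h>
#include <linux/mm.h>

static int myfs_get_block(struct inode *inode, long iblock,
			  struct buffer_head *bh_result, int create)
{
	/* toy 1:1 file-block to device-block mapping, illustration only */
	bh_result->b_dev = inode->i_dev;
	bh_result->b_blocknr = iblock;
	bh_result->b_state |= (1UL << BH_Mapped);
	return 0;
}

static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}

static int myfs_writepage(struct page *page)
{
	return block_write_full_page(page, myfs_get_block);
}

static int myfs_prepare_write(struct file *file, struct page *page,
			      unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, myfs_get_block);
}

static struct address_space_operations myfs_aops = {
	readpage:	myfs_readpage,
	writepage:	myfs_writepage,
	sync_page:	block_sync_page,
	prepare_write:	myfs_prepare_write,
	commit_write:	generic_commit_write,
};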
+ */ + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + */ + { volatile unsigned char dummy; + __get_user(dummy, buf); + __get_user(dummy, buf+bytes-1); + } + + status = -ENOMEM; /* we'll assign it later anyway */ + page = __grab_cache_page(mapping, index, &cached_page); + if (!page) + break; + + /* We have exclusive IO access to the page.. */ + if (!PageLocked(page)) { + PAGE_BUG(page); + } + + kaddr = kmap(page); + status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes); + if (status) + goto sync_failure; + page_fault = __copy_from_user(kaddr+offset, buf, bytes); + flush_dcache_page(page); + status = mapping->a_ops->commit_write(file, page, offset, offset+bytes); + if (page_fault) + goto fail_write; + if (!status) + status = bytes; + + if (status >= 0) { + written += status; + count -= status; + pos += status; + buf += status; + } +unlock: + kunmap(page); + /* Mark it unlocked again and drop the page.. */ + SetPageReferenced(page); + UnlockPage(page); + page_cache_release(page); + + if (status < 0) + break; + } while (count); +done: + *ppos = pos; + + if (cached_page) + page_cache_release(cached_page); + + /* For now, when the user asks for O_SYNC, we'll actually + * provide O_DSYNC. */ + if (status >= 0) { + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) + status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA); + } + + err = written ? written : status; +out: + + return err; +fail_write: + status = -EFAULT; + goto unlock; + +sync_failure: + /* + * If blocksize < pagesize, prepare_write() may have instantiated a + * few blocks outside i_size. Trim these off again. + */ + kunmap(page); + UnlockPage(page); + page_cache_release(page); + if (pos + bytes > inode->i_size) + vmtruncate(inode, inode->i_size); + goto done; +} + +ssize_t +do_generic_direct_write(struct file *file,const char *buf,size_t count, loff_t *ppos) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + loff_t pos; + ssize_t written; + long status = 0; + ssize_t err; + + pos = *ppos; + written = 0; + + err = precheck_file_write(file, inode, &count, &pos); + if (err != 0 || count == 0) + goto out; + + if (!(file->f_flags & O_DIRECT)) + BUG(); + + remove_suid(inode); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + + written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos); + if (written > 0) { + loff_t end = pos + written; + if (end > inode->i_size && !S_ISBLK(inode->i_mode)) { + inode->i_size = end; + mark_inode_dirty(inode); + } + *ppos = end; + invalidate_inode_pages2(mapping); + } + /* + * Sync the fs metadata but not the minor inode changes and + * of course not the data as we did direct DMA for the IO. + */ + if (written >= 0 && (file->f_flags & O_SYNC)) + status = generic_osync_inode(inode, OSYNC_METADATA); + + err = written ? 
written : status; +out: + return err; +} + +static int do_odirect_fallback(struct file *file, struct inode *inode, + const char *buf, size_t count, loff_t *ppos) +{ + ssize_t ret; + int err; + + down(&inode->i_sem); + ret = do_generic_file_write(file, buf, count, ppos); + if (ret > 0) { + err = do_fdatasync(file); + if (err) + ret = err; + } + up(&inode->i_sem); + return ret; +} + +ssize_t +generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos) +{ + struct inode *inode = file->f_dentry->d_inode->i_mapping->host; + ssize_t err; + + if ((ssize_t) count < 0) + return -EINVAL; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + if (file->f_flags & O_DIRECT) { + /* do_generic_direct_write may drop i_sem during the + actual IO */ + down_read(&inode->i_alloc_sem); + down(&inode->i_sem); + err = do_generic_direct_write(file, buf, count, ppos); + up(&inode->i_sem); + up_read(&inode->i_alloc_sem); + if (unlikely(err == -ENOTBLK)) + err = do_odirect_fallback(file, inode, buf, count, ppos); + } else { + down(&inode->i_sem); + err = do_generic_file_write(file, buf, count, ppos); + up(&inode->i_sem); + } + + return err; +} + +void __init page_cache_init(unsigned long mempages) +{ + unsigned long htable_size, order; + + htable_size = mempages; + htable_size *= sizeof(struct page *); + for(order = 0; (PAGE_SIZE << order) < htable_size; order++) + ; + + do { + unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *); + + page_hash_bits = 0; + while((tmp >>= 1UL) != 0UL) + page_hash_bits++; + + page_hash_table = (struct page **) + __get_free_pages(GFP_ATOMIC, order); + } while(page_hash_table == NULL && --order > 0); + + printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n", + (1 << page_hash_bits), order, (PAGE_SIZE << order)); + if (!page_hash_table) + panic("Failed to allocate page hash table\n"); + memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); +} diff --git a/uClinux-2.4.31-uc0/mm/highmem.c b/uClinux-2.4.31-uc0/mm/highmem.c new file mode 100644 index 0000000..e739c06 --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/highmem.c @@ -0,0 +1,454 @@ +/* + * High memory handling common code and variables. + * + * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de + * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * 64-bit physical space. With current x86 CPUs this + * means up to 64 Gigabytes physical RAM. + * + * Rewrote high memory support to move the page cache into + * high memory. Implemented permanent (schedulable) kmaps + * based on Linus' idea. + * + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + */ + +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/swap.h> +#include <linux/slab.h> + +/* + * Virtual_count is not a pure "count". + * 0 means that it is not mapped, and has not been mapped + * since a TLB flush - it is usable. + * 1 means that there are no users, but it has been mapped + * since the last TLB flush - so we can't use it. + * n means that there are (n-1) current users of it. 
+ */ +static int pkmap_count[LAST_PKMAP]; +static unsigned int last_pkmap_nr; +static spinlock_cacheline_t kmap_lock_cacheline = {SPIN_LOCK_UNLOCKED}; +#define kmap_lock kmap_lock_cacheline.lock + +pte_t * pkmap_page_table; + +static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); + +static void flush_all_zero_pkmaps(void) +{ + int i; + + flush_cache_all(); + + for (i = 0; i < LAST_PKMAP; i++) { + struct page *page; + + /* + * zero means we don't have anything to do, + * >1 means that it is still in use. Only + * a count of 1 means that it is free but + * needs to be unmapped + */ + if (pkmap_count[i] != 1) + continue; + pkmap_count[i] = 0; + + /* sanity check */ + if (pte_none(pkmap_page_table[i])) + BUG(); + + /* + * Don't need an atomic fetch-and-clear op here; + * no-one has the page mapped, and cannot get at + * its virtual address (and hence PTE) without first + * getting the kmap_lock (which is held here). + * So no dangers, even with speculative execution. + */ + page = pte_page(pkmap_page_table[i]); + pte_clear(&pkmap_page_table[i]); + + page->virtual = NULL; + } + flush_tlb_all(); +} + +static inline unsigned long map_new_virtual(struct page *page, int nonblocking) +{ + unsigned long vaddr; + int count; + +start: + count = LAST_PKMAP; + /* Find an empty entry */ + for (;;) { + last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; + if (!last_pkmap_nr) { + flush_all_zero_pkmaps(); + count = LAST_PKMAP; + } + if (!pkmap_count[last_pkmap_nr]) + break; /* Found a usable entry */ + if (--count) + continue; + + if (nonblocking) + return 0; + + /* + * Sleep for somebody else to unmap their entries + */ + { + DECLARE_WAITQUEUE(wait, current); + + current->state = TASK_UNINTERRUPTIBLE; + add_wait_queue(&pkmap_map_wait, &wait); + spin_unlock(&kmap_lock); + schedule(); + remove_wait_queue(&pkmap_map_wait, &wait); + spin_lock(&kmap_lock); + + /* Somebody else might have mapped it while we slept */ + if (page->virtual) + return (unsigned long) page->virtual; + + /* Re-start */ + goto start; + } + } + vaddr = PKMAP_ADDR(last_pkmap_nr); + set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); + + pkmap_count[last_pkmap_nr] = 1; + page->virtual = (void *) vaddr; + + return vaddr; +} + +void fastcall *kmap_high(struct page *page, int nonblocking) +{ + unsigned long vaddr; + + /* + * For highmem pages, we can't trust "virtual" until + * after we have the lock. + * + * We cannot call this from interrupts, as it may block + */ + spin_lock(&kmap_lock); + vaddr = (unsigned long) page->virtual; + if (!vaddr) { + vaddr = map_new_virtual(page, nonblocking); + if (!vaddr) + goto out; + } + pkmap_count[PKMAP_NR(vaddr)]++; + if (pkmap_count[PKMAP_NR(vaddr)] < 2) + BUG(); + out: + spin_unlock(&kmap_lock); + return (void*) vaddr; +} + +void fastcall kunmap_high(struct page *page) +{ + unsigned long vaddr; + unsigned long nr; + int need_wakeup; + + spin_lock(&kmap_lock); + vaddr = (unsigned long) page->virtual; + if (!vaddr) + BUG(); + nr = PKMAP_NR(vaddr); + + /* + * A count must never go down to zero + * without a TLB flush! + */ + need_wakeup = 0; + switch (--pkmap_count[nr]) { + case 0: + BUG(); + case 1: + /* + * Avoid an unnecessary wake_up() function call. + * The common case is pkmap_count[] == 1, but + * no waiters. + * The tasks queued in the wait-queue are guarded + * by both the lock in the wait-queue-head and by + * the kmap_lock. As the kmap_lock is held here, + * no need for the wait-queue-head's lock. Simply + * test if the queue is empty. 
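Editorial note: kmap()/kunmap(), which call into kmap_high()/kunmap_high() above for highmem pages, hand out a temporary kernel virtual address for a page that may have no permanent mapping. A minimal sketch of the usual pattern, here zero-filling a page; the helper name is hypothetical. For short accesses that must not sleep, kmap_atomic()/kunmap_atomic() (used by the bounce-buffer code below) is the alternative.

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/string.h>

/* clear one page that may live in high memory */
static void myfs_zero_page(struct page *page)
{
	char *kaddr;

	kaddr = kmap(page);             /* may sleep; not for interrupt context */
	memset(kaddr, 0, PAGE_SIZE);
	kunmap(page);                   /* drops the pkmap reference */
}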
+ */ + need_wakeup = waitqueue_active(&pkmap_map_wait); + } + spin_unlock(&kmap_lock); + + /* do wake-up, if needed, race-free outside of the spin lock */ + if (need_wakeup) + wake_up(&pkmap_map_wait); +} + +#define POOL_SIZE 32 + +/* + * This lock gets no contention at all, normally. + */ +static spinlock_t emergency_lock = SPIN_LOCK_UNLOCKED; + +int nr_emergency_pages; +static LIST_HEAD(emergency_pages); + +int nr_emergency_bhs; +static LIST_HEAD(emergency_bhs); + +/* + * Simple bounce buffer support for highmem pages. + * This will be moved to the block layer in 2.5. + */ + +static inline void copy_from_high_bh (struct buffer_head *to, + struct buffer_head *from) +{ + struct page *p_from; + char *vfrom; + + p_from = from->b_page; + + vfrom = kmap_atomic(p_from, KM_USER0); + memcpy(to->b_data, vfrom + bh_offset(from), to->b_size); + kunmap_atomic(vfrom, KM_USER0); +} + +static inline void copy_to_high_bh_irq (struct buffer_head *to, + struct buffer_head *from) +{ + struct page *p_to; + char *vto; + unsigned long flags; + + p_to = to->b_page; + __save_flags(flags); + __cli(); + vto = kmap_atomic(p_to, KM_BOUNCE_READ); + memcpy(vto + bh_offset(to), from->b_data, to->b_size); + kunmap_atomic(vto, KM_BOUNCE_READ); + __restore_flags(flags); +} + +static inline void bounce_end_io (struct buffer_head *bh, int uptodate) +{ + struct page *page; + struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private); + unsigned long flags; + + bh_orig->b_end_io(bh_orig, uptodate); + + page = bh->b_page; + + spin_lock_irqsave(&emergency_lock, flags); + if (nr_emergency_pages >= POOL_SIZE) + __free_page(page); + else { + /* + * We are abusing page->list to manage + * the highmem emergency pool: + */ + list_add(&page->list, &emergency_pages); + nr_emergency_pages++; + } + + if (nr_emergency_bhs >= POOL_SIZE) { +#ifdef HIGHMEM_DEBUG + /* Don't clobber the constructed slab cache */ + init_waitqueue_head(&bh->b_wait); +#endif + kmem_cache_free(bh_cachep, bh); + } else { + /* + * Ditto in the bh case, here we abuse b_inode_buffers: + */ + list_add(&bh->b_inode_buffers, &emergency_bhs); + nr_emergency_bhs++; + } + spin_unlock_irqrestore(&emergency_lock, flags); +} + +static __init int init_emergency_pool(void) +{ + struct sysinfo i; + si_meminfo(&i); + si_swapinfo(&i); + + if (!i.totalhigh) + return 0; + + spin_lock_irq(&emergency_lock); + while (nr_emergency_pages < POOL_SIZE) { + struct page * page = alloc_page(GFP_ATOMIC); + if (!page) { + printk("couldn't refill highmem emergency pages"); + break; + } + list_add(&page->list, &emergency_pages); + nr_emergency_pages++; + } + while (nr_emergency_bhs < POOL_SIZE) { + struct buffer_head * bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC); + if (!bh) { + printk("couldn't refill highmem emergency bhs"); + break; + } + list_add(&bh->b_inode_buffers, &emergency_bhs); + nr_emergency_bhs++; + } + spin_unlock_irq(&emergency_lock); + printk("allocated %d pages and %d bhs reserved for the highmem bounces\n", + nr_emergency_pages, nr_emergency_bhs); + + return 0; +} + +__initcall(init_emergency_pool); + +static void bounce_end_io_write (struct buffer_head *bh, int uptodate) +{ + bounce_end_io(bh, uptodate); +} + +static void bounce_end_io_read (struct buffer_head *bh, int uptodate) +{ + struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private); + + if (uptodate) + copy_to_high_bh_irq(bh_orig, bh); + bounce_end_io(bh, uptodate); +} + +struct page *alloc_bounce_page (void) +{ + struct list_head *tmp; + struct page *page; + + page = alloc_page(GFP_NOHIGHIO); + if 
(page) + return page; + /* + * No luck. First, kick the VM so it doesn't idle around while + * we are using up our emergency rations. + */ + wakeup_bdflush(); + +repeat_alloc: + /* + * Try to allocate from the emergency pool. + */ + tmp = &emergency_pages; + spin_lock_irq(&emergency_lock); + if (!list_empty(tmp)) { + page = list_entry(tmp->next, struct page, list); + list_del(tmp->next); + nr_emergency_pages--; + } + spin_unlock_irq(&emergency_lock); + if (page) + return page; + + /* we need to wait I/O completion */ + run_task_queue(&tq_disk); + + yield(); + goto repeat_alloc; +} + +struct buffer_head *alloc_bounce_bh (void) +{ + struct list_head *tmp; + struct buffer_head *bh; + + bh = kmem_cache_alloc(bh_cachep, SLAB_NOHIGHIO); + if (bh) + return bh; + /* + * No luck. First, kick the VM so it doesn't idle around while + * we are using up our emergency rations. + */ + wakeup_bdflush(); + +repeat_alloc: + /* + * Try to allocate from the emergency pool. + */ + tmp = &emergency_bhs; + spin_lock_irq(&emergency_lock); + if (!list_empty(tmp)) { + bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers); + list_del(tmp->next); + nr_emergency_bhs--; + } + spin_unlock_irq(&emergency_lock); + if (bh) + return bh; + + /* we need to wait I/O completion */ + run_task_queue(&tq_disk); + + yield(); + goto repeat_alloc; +} + +struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig) +{ + struct page *page; + struct buffer_head *bh; + + if (!PageHighMem(bh_orig->b_page)) + return bh_orig; + + bh = alloc_bounce_bh(); + /* + * This is wasteful for 1k buffers, but this is a stopgap measure + * and we are being ineffective anyway. This approach simplifies + * things immensly. On boxes with more than 4GB RAM this should + * not be an issue anyway. + */ + page = alloc_bounce_page(); + + set_bh_page(bh, page, 0); + + bh->b_next = NULL; + bh->b_blocknr = bh_orig->b_blocknr; + bh->b_size = bh_orig->b_size; + bh->b_list = -1; + bh->b_dev = bh_orig->b_dev; + bh->b_count = bh_orig->b_count; + bh->b_rdev = bh_orig->b_rdev; + bh->b_state = bh_orig->b_state; +#ifdef HIGHMEM_DEBUG + bh->b_flushtime = jiffies; + bh->b_next_free = NULL; + bh->b_prev_free = NULL; + /* bh->b_this_page */ + bh->b_reqnext = NULL; + bh->b_pprev = NULL; +#endif + /* bh->b_page */ + if (rw == WRITE) { + bh->b_end_io = bounce_end_io_write; + copy_from_high_bh(bh, bh_orig); + } else + bh->b_end_io = bounce_end_io_read; + bh->b_private = (void *)bh_orig; + bh->b_rsector = bh_orig->b_rsector; +#ifdef HIGHMEM_DEBUG + memset(&bh->b_wait, -1, sizeof(bh->b_wait)); +#endif + + return bh; +} + diff --git a/uClinux-2.4.31-uc0/mm/memory.c b/uClinux-2.4.31-uc0/mm/memory.c new file mode 100644 index 0000000..ed6f491 --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/memory.c @@ -0,0 +1,1504 @@ +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. + * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. 
+ * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. + * Idea by Alex Bligh (alex@cconcepts.co.uk) + * + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG + * (Gerhard.Wichert@pdb.siemens.de) + */ + +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/smp_lock.h> +#include <linux/swapctl.h> +#include <linux/iobuf.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/module.h> + +#include <asm/pgalloc.h> +#include <asm/uaccess.h> +#include <asm/tlb.h> + +unsigned long max_mapnr; +unsigned long num_physpages; +unsigned long num_mappedpages; +void * high_memory; +struct page *highmem_start_page; + +/* + * We special-case the C-O-W ZERO_PAGE, because it's such + * a common occurrence (no need to read the page to know + * that it's zero - better for the cache and memory subsystem). + */ +static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address) +{ + if (from == ZERO_PAGE(address)) { + clear_user_highpage(to, address); + return; + } + copy_user_highpage(to, from, address); +} + +mem_map_t * mem_map; + +/* + * Called by TLB shootdown + */ +void __free_pte(pte_t pte) +{ + struct page *page = pte_page(pte); + if ((!VALID_PAGE(page)) || PageReserved(page)) + return; + if (pte_dirty(pte)) + set_page_dirty(page); + free_page_and_swap_cache(page); +} + + +/* + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. + */ +static inline void free_one_pmd(pmd_t * dir) +{ + pte_t * pte; + + if (pmd_none(*dir)) + return; + if (pmd_bad(*dir)) { + pmd_ERROR(*dir); + pmd_clear(dir); + return; + } + pte = pte_offset(dir, 0); + pmd_clear(dir); + pte_free(pte); +} + +static inline void free_one_pgd(pgd_t * dir) +{ + int j; + pmd_t * pmd; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, 0); + pgd_clear(dir); + for (j = 0; j < PTRS_PER_PMD ; j++) { + prefetchw(pmd+j+(PREFETCH_STRIDE/16)); + free_one_pmd(pmd+j); + } + pmd_free(pmd); +} + +/* Low and high watermarks for page table cache. + The system should try to have pgt_water[0] <= cache elements <= pgt_water[1] + */ +int pgt_cache_water[2] = { 25, 50 }; + +/* Returns the number of pages freed */ +int check_pgt_cache(void) +{ + return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]); +} + + +/* + * This function clears all user-level page tables of a process - this + * is needed by execve(), so that old pages aren't in the way. + */ +void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) +{ + pgd_t * page_dir = mm->pgd; + + spin_lock(&mm->page_table_lock); + page_dir += first; + do { + free_one_pgd(page_dir); + page_dir++; + } while (--nr); + spin_unlock(&mm->page_table_lock); + + /* keep the page table cache within bounds */ + check_pgt_cache(); +} + +#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) +#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) + +/* + * copy one vm_area from one task to the other. Assumes the page tables + * already present in the new task to be cleared in the whole range + * covered by this vma. + * + * 08Jan98 Merged into one routine from several inline routines to reduce + * variable count and make things faster. 
-jj + * + * dst->page_table_lock is held on entry and exit, + * but may be dropped within pmd_alloc() and pte_alloc(). + */ +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pgd_t * src_pgd, * dst_pgd; + unsigned long address = vma->vm_start; + unsigned long end = vma->vm_end; + unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; + + src_pgd = pgd_offset(src, address)-1; + dst_pgd = pgd_offset(dst, address)-1; + + for (;;) { + pmd_t * src_pmd, * dst_pmd; + + src_pgd++; dst_pgd++; + + /* copy_pmd_range */ + + if (pgd_none(*src_pgd)) + goto skip_copy_pmd_range; + if (pgd_bad(*src_pgd)) { + pgd_ERROR(*src_pgd); + pgd_clear(src_pgd); +skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; + if (!address || (address >= end)) + goto out; + continue; + } + + src_pmd = pmd_offset(src_pgd, address); + dst_pmd = pmd_alloc(dst, dst_pgd, address); + if (!dst_pmd) + goto nomem; + + do { + pte_t * src_pte, * dst_pte; + + /* copy_pte_range */ + + if (pmd_none(*src_pmd)) + goto skip_copy_pte_range; + if (pmd_bad(*src_pmd)) { + pmd_ERROR(*src_pmd); + pmd_clear(src_pmd); +skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; + if (address >= end) + goto out; + goto cont_copy_pmd_range; + } + + src_pte = pte_offset(src_pmd, address); + dst_pte = pte_alloc(dst, dst_pmd, address); + if (!dst_pte) + goto nomem; + + spin_lock(&src->page_table_lock); + do { + pte_t pte = *src_pte; + struct page *ptepage; + + /* copy_one_pte */ + + if (pte_none(pte)) + goto cont_copy_pte_range_noset; + if (!pte_present(pte)) { + swap_duplicate(pte_to_swp_entry(pte)); + goto cont_copy_pte_range; + } + ptepage = pte_page(pte); + if ((!VALID_PAGE(ptepage)) || + PageReserved(ptepage)) + goto cont_copy_pte_range; + + /* If it's a COW mapping, write protect it both in the parent and the child */ + if (cow && pte_write(pte)) { + ptep_set_wrprotect(src_pte); + pte = *src_pte; + } + + /* If it's a shared mapping, mark it clean in the child */ + if (vma->vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + get_page(ptepage); + dst->rss++; + +cont_copy_pte_range: set_pte(dst_pte, pte); +cont_copy_pte_range_noset: address += PAGE_SIZE; + if (address >= end) + goto out_unlock; + src_pte++; + dst_pte++; + } while ((unsigned long)src_pte & PTE_TABLE_MASK); + spin_unlock(&src->page_table_lock); + +cont_copy_pmd_range: src_pmd++; + dst_pmd++; + } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + } +out_unlock: + spin_unlock(&src->page_table_lock); +out: + return 0; +nomem: + return -ENOMEM; +} + +/* + * Return indicates whether a page was freed so caller can adjust rss + */ +static inline void forget_pte(pte_t page) +{ + if (!pte_none(page)) { + printk("forget_pte: old mapping existed!\n"); + BUG(); + } +} + +static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size) +{ + unsigned long offset; + pte_t * ptep; + int freed = 0; + + if (pmd_none(*pmd)) + return 0; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return 0; + } + ptep = pte_offset(pmd, address); + offset = address & ~PMD_MASK; + if (offset + size > PMD_SIZE) + size = PMD_SIZE - offset; + size &= PAGE_MASK; + for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { + pte_t pte = *ptep; + if (pte_none(pte)) + continue; + if (pte_present(pte)) { + struct page *page = pte_page(pte); + if (VALID_PAGE(page) && !PageReserved(page)) + freed ++; + /* This will eventually call __free_pte on the pte. 
*/ + tlb_remove_page(tlb, ptep, address + offset); + } else { + free_swap_and_cache(pte_to_swp_entry(pte)); + pte_clear(ptep); + } + } + + return freed; +} + +static inline int zap_pmd_range(mmu_gather_t *tlb, pgd_t * dir, unsigned long address, unsigned long size) +{ + pmd_t * pmd; + unsigned long end; + int freed; + + if (pgd_none(*dir)) + return 0; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return 0; + } + pmd = pmd_offset(dir, address); + end = address + size; + if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) + end = ((address + PGDIR_SIZE) & PGDIR_MASK); + freed = 0; + do { + freed += zap_pte_range(tlb, pmd, address, end - address); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); + return freed; +} + +/* + * remove user pages in a given range. + */ +void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) +{ + mmu_gather_t *tlb; + pgd_t * dir; + unsigned long start = address, end = address + size; + int freed = 0; + + dir = pgd_offset(mm, address); + + /* + * This is a long-lived spinlock. That's fine. + * There's no contention, because the page table + * lock only protects against kswapd anyway, and + * even if kswapd happened to be looking at this + * process we _want_ it to get stuck. + */ + if (address >= end) + BUG(); + spin_lock(&mm->page_table_lock); + flush_cache_range(mm, address, end); + tlb = tlb_gather_mmu(mm); + + do { + freed += zap_pmd_range(tlb, dir, address, end - address); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + + /* this will flush any remaining tlb entries */ + tlb_finish_mmu(tlb, start, end); + + /* + * Update rss for the mm_struct (not necessarily current->mm) + * Notice that rss is an unsigned long. + */ + if (mm->rss > freed) + mm->rss -= freed; + else + mm->rss = 0; + spin_unlock(&mm->page_table_lock); +} + +/* + * Do a quick page-table lookup for a single page. + */ +struct page * follow_page(struct mm_struct *mm, unsigned long address, int write) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *ptep, pte; + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || pgd_bad(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (pmd_none(*pmd) || pmd_bad(*pmd)) + goto out; + + ptep = pte_offset(pmd, address); + if (!ptep) + goto out; + + pte = *ptep; + if (pte_present(pte)) { + if (!write || + (pte_write(pte) && pte_dirty(pte))) + return pte_page(pte); + } + +out: + return 0; +} + +/* + * Given a physical address, is there a useful struct page pointing to + * it? This may become more complex in the future if we start dealing + * with IO-aperture pages in kiobufs. + */ + +static inline struct page * get_page_map(struct page *page) +{ + if (!VALID_PAGE(page)) + return 0; + return page; +} + +/* + * Please read Documentation/cachetlb.txt before using this function, + * accessing foreign memory spaces can cause cache coherency problems. + * + * Accessing a VM_IO area is even more dangerous, therefore the function + * fails if pages is != NULL and a VM_IO area is found. + */ +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, + int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) +{ + int i; + unsigned int flags; + + /* + * Require read or write permissions. + * If 'force' is set, we only require the "MAY" flags. + */ + flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + flags &= force ? 
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + i = 0; + + do { + struct vm_area_struct * vma; + + vma = find_extend_vma(mm, start); + + if ( !vma || (pages && vma->vm_flags & VM_IO) || !(flags & vma->vm_flags) ) + return i ? : -EFAULT; + + spin_lock(&mm->page_table_lock); + do { + struct page *map; + while (!(map = follow_page(mm, start, write))) { + spin_unlock(&mm->page_table_lock); + switch (handle_mm_fault(mm, vma, start, write)) { + case 1: + tsk->min_flt++; + break; + case 2: + tsk->maj_flt++; + break; + case 0: + if (i) return i; + return -EFAULT; + default: + if (i) return i; + return -ENOMEM; + } + spin_lock(&mm->page_table_lock); + } + if (pages) { + pages[i] = get_page_map(map); + /* FIXME: call the correct function, + * depending on the type of the found page + */ + if (!pages[i] || PageReserved(pages[i])) { + if (pages[i] != ZERO_PAGE(start)) + goto bad_page; + } else + page_cache_get(pages[i]); + } + if (vmas) + vmas[i] = vma; + i++; + start += PAGE_SIZE; + len--; + } while(len && start < vma->vm_end); + spin_unlock(&mm->page_table_lock); + } while(len); +out: + return i; + + /* + * We found an invalid page in the VMA. Release all we have + * so far and fail. + */ +bad_page: + spin_unlock(&mm->page_table_lock); + while (i--) + page_cache_release(pages[i]); + i = -EFAULT; + goto out; +} + +EXPORT_SYMBOL(get_user_pages); + +/* + * Force in an entire range of pages from the current process's user VA, + * and pin them in physical memory. + */ +#define dprintk(x...) + +int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) +{ + int pgcount, err; + struct mm_struct * mm; + + /* Make sure the iobuf is not already mapped somewhere. */ + if (iobuf->nr_pages) + return -EINVAL; + + mm = current->mm; + dprintk ("map_user_kiobuf: begin\n"); + + pgcount = (va + len + PAGE_SIZE - 1)/PAGE_SIZE - va/PAGE_SIZE; + /* mapping 0 bytes is not permitted */ + if (!pgcount) BUG(); + err = expand_kiobuf(iobuf, pgcount); + if (err) + return err; + + iobuf->locked = 0; + iobuf->offset = va & (PAGE_SIZE-1); + iobuf->length = len; + + /* Try to fault in all of the necessary pages */ + down_read(&mm->mmap_sem); + /* rw==READ means read from disk, write into memory area */ + err = get_user_pages(current, mm, va, pgcount, + (rw==READ), 0, iobuf->maplist, NULL); + up_read(&mm->mmap_sem); + if (err < 0) { + unmap_kiobuf(iobuf); + dprintk ("map_user_kiobuf: end %d\n", err); + return err; + } + iobuf->nr_pages = err; + while (pgcount--) { + /* FIXME: flush superflous for rw==READ, + * probably wrong function for rw==WRITE + */ + flush_dcache_page(iobuf->maplist[pgcount]); + } + dprintk ("map_user_kiobuf: end OK\n"); + return 0; +} + +/* + * Mark all of the pages in a kiobuf as dirty + * + * We need to be able to deal with short reads from disk: if an IO error + * occurs, the number of bytes read into memory may be less than the + * size of the kiobuf, so we have to stop marking pages dirty once the + * requested byte count has been reached. + * + * Must be called from process context - set_page_dirty() takes VFS locks. 
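+ *
+ * Reserved pages are skipped in the loop below and never marked dirty.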
+ */ + +void mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes) +{ + int index, offset, remaining; + struct page *page; + + index = iobuf->offset >> PAGE_SHIFT; + offset = iobuf->offset & ~PAGE_MASK; + remaining = bytes; + if (remaining > iobuf->length) + remaining = iobuf->length; + + while (remaining > 0 && index < iobuf->nr_pages) { + page = iobuf->maplist[index]; + + if (!PageReserved(page)) + set_page_dirty(page); + + remaining -= (PAGE_SIZE - offset); + offset = 0; + index++; + } +} + +/* + * Unmap all of the pages referenced by a kiobuf. We release the pages, + * and unlock them if they were locked. + */ + +void unmap_kiobuf (struct kiobuf *iobuf) +{ + int i; + struct page *map; + + for (i = 0; i < iobuf->nr_pages; i++) { + map = iobuf->maplist[i]; + if (map) { + if (iobuf->locked) + UnlockPage(map); + /* FIXME: cache flush missing for rw==READ + * FIXME: call the correct reference counting function + */ + page_cache_release(map); + } + } + + iobuf->nr_pages = 0; + iobuf->locked = 0; +} + + +/* + * Lock down all of the pages of a kiovec for IO. + * + * If any page is mapped twice in the kiovec, we return the error -EINVAL. + * + * The optional wait parameter causes the lock call to block until all + * pages can be locked if set. If wait==0, the lock operation is + * aborted if any locked pages are found and -EAGAIN is returned. + */ + +int lock_kiovec(int nr, struct kiobuf *iovec[], int wait) +{ + struct kiobuf *iobuf; + int i, j; + struct page *page, **ppage; + int doublepage = 0; + int repeat = 0; + + repeat: + + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + + if (iobuf->locked) + continue; + + ppage = iobuf->maplist; + for (j = 0; j < iobuf->nr_pages; ppage++, j++) { + page = *ppage; + if (!page) + continue; + + if (TryLockPage(page)) { + while (j--) { + struct page *tmp = *--ppage; + if (tmp) + UnlockPage(tmp); + } + goto retry; + } + } + iobuf->locked = 1; + } + + return 0; + + retry: + + /* + * We couldn't lock one of the pages. Undo the locking so far, + * wait on the page we got to, and try again. + */ + + unlock_kiovec(nr, iovec); + if (!wait) + return -EAGAIN; + + /* + * Did the release also unlock the page we got stuck on? + */ + if (!PageLocked(page)) { + /* + * If so, we may well have the page mapped twice + * in the IO address range. Bad news. Of + * course, it _might_ just be a coincidence, + * but if it happens more than once, chances + * are we have a double-mapped page. + */ + if (++doublepage >= 3) + return -EINVAL; + + /* Try again... */ + wait_on_page(page); + } + + if (++repeat < 16) + goto repeat; + return -EAGAIN; +} + +/* + * Unlock all of the pages of a kiovec after IO. 
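+ *
+ * kiobufs whose 'locked' flag is not set (never locked, or already
+ * unlocked) are skipped, as are NULL entries in the maplist.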
+ */ + +int unlock_kiovec(int nr, struct kiobuf *iovec[]) +{ + struct kiobuf *iobuf; + int i, j; + struct page *page, **ppage; + + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + + if (!iobuf->locked) + continue; + iobuf->locked = 0; + + ppage = iobuf->maplist; + for (j = 0; j < iobuf->nr_pages; ppage++, j++) { + page = *ppage; + if (!page) + continue; + UnlockPage(page); + } + } + return 0; +} + +static inline void zeromap_pte_range(pte_t * pte, unsigned long address, + unsigned long size, pgprot_t prot) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); + pte_t oldpage = ptep_get_and_clear(pte); + set_pte(pte, zero_pte); + forget_pte(oldpage); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, + unsigned long size, pgprot_t prot) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + pte_t * pte = pte_alloc(mm, pmd, address); + if (!pte) + return -ENOMEM; + zeromap_pte_range(pte, address, end - address, prot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot) +{ + int error = 0; + pgd_t * dir; + unsigned long beg = address; + unsigned long end = address + size; + struct mm_struct *mm = current->mm; + + dir = pgd_offset(mm, address); + flush_cache_range(mm, beg, end); + if (address >= end) + BUG(); + + spin_lock(&mm->page_table_lock); + do { + pmd_t *pmd = pmd_alloc(mm, dir, address); + error = -ENOMEM; + if (!pmd) + break; + error = zeromap_pmd_range(mm, pmd, address, end - address, prot); + if (error) + break; + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + spin_unlock(&mm->page_table_lock); + flush_tlb_range(mm, beg, end); + return error; +} + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed. any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + struct page *page; + pte_t oldpage; + oldpage = ptep_get_and_clear(pte); + + page = virt_to_page(__va(phys_addr)); + if ((!VALID_PAGE(page)) || PageReserved(page)) + set_pte(pte, mk_pte_phys(phys_addr, prot)); + forget_pte(oldpage); + address += PAGE_SIZE; + phys_addr += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + phys_addr -= address; + do { + pte_t * pte = pte_alloc(mm, pmd, address); + if (!pte) + return -ENOMEM; + remap_pte_range(pte, address, end - address, address + phys_addr, prot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +/* Note: this is only safe if the mm semaphore is held when called. 
*/ +int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot) +{ + int error = 0; + pgd_t * dir; + unsigned long beg = from; + unsigned long end = from + size; + struct mm_struct *mm = current->mm; + + phys_addr -= from; + dir = pgd_offset(mm, from); + flush_cache_range(mm, beg, end); + if (from >= end) + BUG(); + + spin_lock(&mm->page_table_lock); + do { + pmd_t *pmd = pmd_alloc(mm, dir, from); + error = -ENOMEM; + if (!pmd) + break; + error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); + if (error) + break; + from = (from + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (from && (from < end)); + spin_unlock(&mm->page_table_lock); + flush_tlb_range(mm, beg, end); + return error; +} + +/* + * Establish a new mapping: + * - flush the old one + * - update the page tables + * - inform the TLB about the new one + * + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock + */ +static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry) +{ + set_pte(page_table, entry); + flush_tlb_page(vma, address); + update_mmu_cache(vma, address, entry); +} + +/* + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock + */ +static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, + pte_t *page_table) +{ + flush_page_to_ram(new_page); + flush_cache_page(vma, address); + establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Goto-purists beware: the only reason for goto's here is that it results + * in better assembly code.. The "default" path will see no jumps at all. + * + * Note that this routine assumes that the protection checks have been + * done by the caller (the low-level page fault routine in most cases). + * Thus we can safely just mark it writable once we've done any necessary + * COW. + * + * We also mark the page dirty at this point even though the page will + * change only once the write actually happens. This avoids a few races, + * and potentially makes it more efficient. + * + * We hold the mm semaphore and the page_table_lock on entry and exit + * with the page_table_lock released. + */ +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, pte_t *page_table, pte_t pte) +{ + struct page *old_page, *new_page; + + old_page = pte_page(pte); + if (!VALID_PAGE(old_page)) + goto bad_wp_page; + + if (!TryLockPage(old_page)) { + int reuse = can_share_swap_page(old_page); + unlock_page(old_page); + if (reuse) { + flush_cache_page(vma, address); + establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); + spin_unlock(&mm->page_table_lock); + return 1; /* Minor fault */ + } + } + + /* + * Ok, we need to copy. Oh, well.. 
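+ *
+ * Grab a reference on the old page, drop the page_table_lock while we
+ * allocate and copy, then re-check the pte below in case someone else
+ * faulted the page in meanwhile.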
+ */ + page_cache_get(old_page); + spin_unlock(&mm->page_table_lock); + + new_page = alloc_page(GFP_HIGHUSER); + if (!new_page) + goto no_mem; + copy_cow_page(old_page,new_page,address); + + /* + * Re-check the pte - we dropped the lock + */ + spin_lock(&mm->page_table_lock); + if (pte_same(*page_table, pte)) { + if (PageReserved(old_page)) + ++mm->rss; + break_cow(vma, new_page, address, page_table); + if (vm_anon_lru) + lru_cache_add(new_page); + + /* Free the old page.. */ + new_page = old_page; + } + spin_unlock(&mm->page_table_lock); + page_cache_release(new_page); + page_cache_release(old_page); + return 1; /* Minor fault */ + +bad_wp_page: + spin_unlock(&mm->page_table_lock); + printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page); + return -1; +no_mem: + page_cache_release(old_page); + return -1; +} + +static void vmtruncate_list(struct vm_area_struct *mpnt, unsigned long pgoff) +{ + do { + struct mm_struct *mm = mpnt->vm_mm; + unsigned long start = mpnt->vm_start; + unsigned long end = mpnt->vm_end; + unsigned long len = end - start; + unsigned long diff; + + /* mapping wholly truncated? */ + if (mpnt->vm_pgoff >= pgoff) { + zap_page_range(mm, start, len); + continue; + } + + /* mapping wholly unaffected? */ + len = len >> PAGE_SHIFT; + diff = pgoff - mpnt->vm_pgoff; + if (diff >= len) + continue; + + /* Ok, partially affected.. */ + start += diff << PAGE_SHIFT; + len = (len - diff) << PAGE_SHIFT; + zap_page_range(mm, start, len); + } while ((mpnt = mpnt->vm_next_share) != NULL); +} + +/* + * Handle all mappings that got truncated by a "truncate()" + * system call. + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode * inode, loff_t offset) +{ + unsigned long pgoff; + struct address_space *mapping = inode->i_mapping; + unsigned long limit; + + if (inode->i_size < offset) + goto do_expand; + inode->i_size = offset; + spin_lock(&mapping->i_shared_lock); + if (!mapping->i_mmap && !mapping->i_mmap_shared) + goto out_unlock; + + pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (mapping->i_mmap != NULL) + vmtruncate_list(mapping->i_mmap, pgoff); + if (mapping->i_mmap_shared != NULL) + vmtruncate_list(mapping->i_mmap_shared, pgoff); + +out_unlock: + spin_unlock(&mapping->i_shared_lock); + truncate_inode_pages(mapping, offset); + goto out_truncate; + +do_expand: + limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out; + inode->i_size = offset; + +out_truncate: + if (inode->i_op && inode->i_op->truncate) { + lock_kernel(); + inode->i_op->truncate(inode); + unlock_kernel(); + } + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out: + return -EFBIG; +} + +/* + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + */ +void swapin_readahead(swp_entry_t entry) +{ + int i, num; + struct page *new_page; + unsigned long offset; + + /* + * Get the number of handles we should do readahead io to. 
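+ * valid_swaphandles() also sets 'offset' to the start of the cluster,
+ * so the loop below just reads 'num' consecutive swap slots.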
+ */ + num = valid_swaphandles(entry, &offset); + for (i = 0; i < num; offset++, i++) { + /* Ok, do the async read-ahead now */ + new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset)); + if (!new_page) + break; + page_cache_release(new_page); + } + return; +} + +/* + * We hold the mm semaphore and the page_table_lock on entry and + * should release the pagetable lock on exit.. + */ +static int do_swap_page(struct mm_struct * mm, + struct vm_area_struct * vma, unsigned long address, + pte_t * page_table, pte_t orig_pte, int write_access) +{ + struct page *page; + swp_entry_t entry = pte_to_swp_entry(orig_pte); + pte_t pte; + int ret = 1; + + spin_unlock(&mm->page_table_lock); + page = lookup_swap_cache(entry); + if (!page) { + swapin_readahead(entry); + page = read_swap_cache_async(entry); + if (!page) { + /* + * Back out if somebody else faulted in this pte while + * we released the page table lock. + */ + int retval; + spin_lock(&mm->page_table_lock); + retval = pte_same(*page_table, orig_pte) ? -1 : 1; + spin_unlock(&mm->page_table_lock); + return retval; + } + + /* Had to read the page from swap area: Major fault */ + ret = 2; + } + + mark_page_accessed(page); + + lock_page(page); + + /* + * Back out if somebody else faulted in this pte while we + * released the page table lock. + */ + spin_lock(&mm->page_table_lock); + if (!pte_same(*page_table, orig_pte)) { + spin_unlock(&mm->page_table_lock); + unlock_page(page); + page_cache_release(page); + return 1; + } + + /* The page isn't present yet, go ahead with the fault. */ + + swap_free(entry); + if (vm_swap_full()) + remove_exclusive_swap_page(page); + + mm->rss++; + pte = mk_pte(page, vma->vm_page_prot); + if (write_access && can_share_swap_page(page)) + pte = pte_mkdirty(pte_mkwrite(pte)); + unlock_page(page); + + flush_page_to_ram(page); + flush_icache_page(vma, page); + set_pte(page_table, pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); + spin_unlock(&mm->page_table_lock); + return ret; +} + +/* + * We are called with the MM semaphore and page_table_lock + * spinlock held to protect against concurrent faults in + * multithreaded programs. + */ +static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) +{ + pte_t entry; + + /* Read-only mapping of ZERO_PAGE. */ + entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); + + /* ..except if it's a write access */ + if (write_access) { + struct page *page; + + /* Allocate our own private page. */ + spin_unlock(&mm->page_table_lock); + + page = alloc_page(GFP_HIGHUSER); + if (!page) + goto no_mem; + clear_user_highpage(page, addr); + + spin_lock(&mm->page_table_lock); + if (!pte_none(*page_table)) { + page_cache_release(page); + spin_unlock(&mm->page_table_lock); + return 1; + } + mm->rss++; + flush_page_to_ram(page); + entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + if (vm_anon_lru) + lru_cache_add(page); + mark_page_accessed(page); + } + + set_pte(page_table, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, entry); + spin_unlock(&mm->page_table_lock); + return 1; /* Minor fault */ + +no_mem: + return -1; +} + +/* + * do_no_page() tries to create a new page mapping. It aggressively + * tries to share with existing pages, but makes a separate copy if + * the "write_access" parameter is true in order to avoid the next + * page fault. 
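+ * (That early copy-on-write break below is only done for private,
+ * i.e. non-VM_SHARED, mappings.)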
+ * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * This is called with the MM semaphore held and the page table + * spinlock held. Exit with the spinlock released. + */ +static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma, + unsigned long address, int write_access, pte_t *page_table) +{ + struct page * new_page; + pte_t entry; + + if (!vma->vm_ops || !vma->vm_ops->nopage) + return do_anonymous_page(mm, vma, page_table, write_access, address); + spin_unlock(&mm->page_table_lock); + + new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); + + if (new_page == NULL) /* no page was available -- SIGBUS */ + return 0; + if (new_page == NOPAGE_OOM) + return -1; + + /* + * Should we do an early C-O-W break? + */ + if (write_access && !(vma->vm_flags & VM_SHARED)) { + struct page * page = alloc_page(GFP_HIGHUSER); + if (!page) { + page_cache_release(new_page); + return -1; + } + copy_user_highpage(page, new_page, address); + page_cache_release(new_page); + if (vm_anon_lru) + lru_cache_add(page); + new_page = page; + } + + spin_lock(&mm->page_table_lock); + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. But it's valid + * for other architectures too. + * + * Note that if write_access is true, we either now have + * an exclusive copy of the page, or this is a shared mapping, + * so we can make it writable and dirty to avoid having to + * handle that later. + */ + /* Only go through if we didn't race with anybody else... */ + if (pte_none(*page_table)) { + if (!PageReserved(new_page)) + ++mm->rss; + flush_page_to_ram(new_page); + flush_icache_page(vma, new_page); + entry = mk_pte(new_page, vma->vm_page_prot); + if (write_access) + entry = pte_mkwrite(pte_mkdirty(entry)); + set_pte(page_table, entry); + } else { + /* One of our sibling threads was faster, back out. */ + page_cache_release(new_page); + spin_unlock(&mm->page_table_lock); + return 1; + } + + /* no need to invalidate: a not-present page shouldn't be cached */ + update_mmu_cache(vma, address, entry); + spin_unlock(&mm->page_table_lock); + return 2; /* Major fault */ +} + +/* + * These routines also need to handle stuff like marking pages dirty + * and/or accessed for architectures that don't do it in hardware (most + * RISC architectures). The early dirtying is also good on the i386. + * + * There is also a hook called "update_mmu_cache()" that architectures + * with external mmu caches can use to update those (ie the Sparc or + * PowerPC hashed page tables that act as extended TLBs). + * + * Note the "page_table_lock". It is to protect against kswapd removing + * pages from under us. Note that kswapd only ever _removes_ pages, never + * adds them. As such, once we have noticed that the page is not present, + * we can drop the lock early. + * + * The adding of pages is protected by the MM semaphore (which we hold), + * so we don't need to worry about a page being suddenly been added into + * our VM. + * + * We enter with the pagetable spinlock held, we are supposed to + * release it when done. + */ +static inline int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct * vma, unsigned long address, + int write_access, pte_t * pte) +{ + pte_t entry; + + entry = *pte; + if (!pte_present(entry)) { + /* + * If it truly wasn't present, we know that kswapd + * and the PTE updates will not touch it later. So + * drop the lock. 
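+ * Both do_no_page() and do_swap_page() release the page_table_lock
+ * themselves before returning.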
+ */ + if (pte_none(entry)) + return do_no_page(mm, vma, address, write_access, pte); + return do_swap_page(mm, vma, address, pte, entry, write_access); + } + + if (write_access) { + if (!pte_write(entry)) + return do_wp_page(mm, vma, address, pte, entry); + + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + establish_pte(vma, address, pte, entry); + spin_unlock(&mm->page_table_lock); + return 1; +} + +/* + * By the time we get here, we already hold the mm semaphore + */ +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, int write_access) +{ + pgd_t *pgd; + pmd_t *pmd; + + current->state = TASK_RUNNING; + pgd = pgd_offset(mm, address); + + /* + * We need the page table lock to synchronize with kswapd + * and the SMP-safe atomic PTE updates. + */ + spin_lock(&mm->page_table_lock); + pmd = pmd_alloc(mm, pgd, address); + + if (pmd) { + pte_t * pte = pte_alloc(mm, pmd, address); + if (pte) + return handle_pte_fault(mm, vma, address, write_access, pte); + } + spin_unlock(&mm->page_table_lock); + return -1; +} + +/* + * Allocate page middle directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + * + * On a two-level page table, this ends up actually being entirely + * optimized away. + */ +pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pmd_t *new; + + /* "fast" allocation can happen without dropping the lock.. */ + new = pmd_alloc_one_fast(mm, address); + if (!new) { + spin_unlock(&mm->page_table_lock); + new = pmd_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (!pgd_none(*pgd)) { + pmd_free(new); + check_pgt_cache(); + goto out; + } + } + pgd_populate(mm, pgd, new); +out: + return pmd_offset(pgd, address); +} + +/* + * Allocate the page table directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + */ +pte_t fastcall *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + if (pmd_none(*pmd)) { + pte_t *new; + + /* "fast" allocation can happen without dropping the lock.. */ + new = pte_alloc_one_fast(mm, address); + if (!new) { + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (!pmd_none(*pmd)) { + pte_free(new); + check_pgt_cache(); + goto out; + } + } + pmd_populate(mm, pmd, new); + } +out: + return pte_offset(pmd, address); +} + +int make_pages_present(unsigned long addr, unsigned long end) +{ + int ret, len, write; + struct vm_area_struct * vma; + + vma = find_vma(current->mm, addr); + write = (vma->vm_flags & VM_WRITE) != 0; + if (addr >= end) + BUG(); + if (end > vma->vm_end) + BUG(); + len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; + ret = get_user_pages(current, current->mm, addr, + len, write, 0, NULL, NULL); + return ret == len ? 
0 : -1; +} + +struct page * vmalloc_to_page(void * vmalloc_addr) +{ + unsigned long addr = (unsigned long) vmalloc_addr; + struct page *page = NULL; + pmd_t *pmd; + pte_t *pte; + pgd_t *pgd; + + pgd = pgd_offset_k(addr); + if (!pgd_none(*pgd)) { + pmd = pmd_offset(pgd, addr); + if (!pmd_none(*pmd)) { + pte = pte_offset(pmd, addr); + if (pte_present(*pte)) { + page = pte_page(*pte); + } + } + } + return page; +} diff --git a/uClinux-2.4.31-uc0/mm/mlock.c b/uClinux-2.4.31-uc0/mm/mlock.c new file mode 100644 index 0000000..3524645 --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/mlock.c @@ -0,0 +1,301 @@ +/* + * linux/mm/mlock.c + * + * (C) Copyright 1995 Linus Torvalds + */ +#include <linux/slab.h> +#include <linux/shm.h> +#include <linux/mman.h> +#include <linux/smp_lock.h> +#include <linux/pagemap.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> + +static inline int mlock_fixup_all(struct vm_area_struct * vma, int newflags) +{ + spin_lock(&vma->vm_mm->page_table_lock); + vma->vm_flags = newflags; + spin_unlock(&vma->vm_mm->page_table_lock); + return 0; +} + +static inline int mlock_fixup_start(struct vm_area_struct * vma, + unsigned long end, int newflags) +{ + struct vm_area_struct * n; + + n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!n) + return -EAGAIN; + *n = *vma; + n->vm_end = end; + n->vm_flags = newflags; + n->vm_raend = 0; + if (n->vm_file) + get_file(n->vm_file); + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT; + lock_vma_mappings(vma); + spin_lock(&vma->vm_mm->page_table_lock); + vma->vm_start = end; + __insert_vm_struct(current->mm, n); + spin_unlock(&vma->vm_mm->page_table_lock); + unlock_vma_mappings(vma); + return 0; +} + +static inline int mlock_fixup_end(struct vm_area_struct * vma, + unsigned long start, int newflags) +{ + struct vm_area_struct * n; + + n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!n) + return -EAGAIN; + *n = *vma; + n->vm_start = start; + n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT; + n->vm_flags = newflags; + n->vm_raend = 0; + if (n->vm_file) + get_file(n->vm_file); + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + lock_vma_mappings(vma); + spin_lock(&vma->vm_mm->page_table_lock); + vma->vm_end = start; + __insert_vm_struct(current->mm, n); + spin_unlock(&vma->vm_mm->page_table_lock); + unlock_vma_mappings(vma); + return 0; +} + +static inline int mlock_fixup_middle(struct vm_area_struct * vma, + unsigned long start, unsigned long end, int newflags) +{ + struct vm_area_struct * left, * right; + + left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!left) + return -EAGAIN; + right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!right) { + kmem_cache_free(vm_area_cachep, left); + return -EAGAIN; + } + *left = *vma; + *right = *vma; + left->vm_end = start; + right->vm_start = end; + right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT; + vma->vm_flags = newflags; + left->vm_raend = 0; + right->vm_raend = 0; + if (vma->vm_file) + atomic_add(2, &vma->vm_file->f_count); + + if (vma->vm_ops && vma->vm_ops->open) { + vma->vm_ops->open(left); + vma->vm_ops->open(right); + } + vma->vm_raend = 0; + vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT; + lock_vma_mappings(vma); + spin_lock(&vma->vm_mm->page_table_lock); + vma->vm_start = start; + vma->vm_end = end; + vma->vm_flags = newflags; + __insert_vm_struct(current->mm, left); + __insert_vm_struct(current->mm, right); + spin_unlock(&vma->vm_mm->page_table_lock); 
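+ /* finally release the i_shared lock taken by lock_vma_mappings() above */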
+ unlock_vma_mappings(vma);
+ return 0;
+}
+
+static int mlock_fixup(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end, unsigned int newflags)
+{
+ int pages, retval;
+
+ if (newflags == vma->vm_flags)
+ return 0;
+
+ if (start == vma->vm_start) {
+ if (end == vma->vm_end)
+ retval = mlock_fixup_all(vma, newflags);
+ else
+ retval = mlock_fixup_start(vma, end, newflags);
+ } else {
+ if (end == vma->vm_end)
+ retval = mlock_fixup_end(vma, start, newflags);
+ else
+ retval = mlock_fixup_middle(vma, start, end, newflags);
+ }
+ if (!retval) {
+ /* keep track of amount of locked VM */
+ pages = (end - start) >> PAGE_SHIFT;
+ if (newflags & VM_LOCKED) {
+ pages = -pages;
+ make_pages_present(start, end);
+ }
+ vma->vm_mm->locked_vm -= pages;
+ }
+ return retval;
+}
+
+static int do_mlock(unsigned long start, size_t len, int on)
+{
+ unsigned long nstart, end, tmp;
+ struct vm_area_struct * vma, * next;
+ int error;
+
+ if (on && !capable(CAP_IPC_LOCK))
+ return -EPERM;
+ len = PAGE_ALIGN(len);
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+ if (end == start)
+ return 0;
+ vma = find_vma(current->mm, start);
+ if (!vma || vma->vm_start > start)
+ return -ENOMEM;
+
+ for (nstart = start ; ; ) {
+ unsigned int newflags;
+
+ /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
+
+ newflags = vma->vm_flags | VM_LOCKED;
+ if (!on)
+ newflags &= ~VM_LOCKED;
+
+ if (vma->vm_end >= end) {
+ error = mlock_fixup(vma, nstart, end, newflags);
+ break;
+ }
+
+ tmp = vma->vm_end;
+ next = vma->vm_next;
+ error = mlock_fixup(vma, nstart, tmp, newflags);
+ if (error)
+ break;
+ nstart = tmp;
+ vma = next;
+ if (!vma || vma->vm_start != nstart) {
+ error = -ENOMEM;
+ break;
+ }
+ }
+ return error;
+}
+
+asmlinkage long sys_mlock(unsigned long start, size_t len)
+{
+ unsigned long locked;
+ unsigned long lock_limit;
+ int error = -ENOMEM;
+
+ down_write(&current->mm->mmap_sem);
+ len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
+ start &= PAGE_MASK;
+
+ locked = len >> PAGE_SHIFT;
+ locked += current->mm->locked_vm;
+
+ lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit >>= PAGE_SHIFT;
+
+ /* check against resource limits */
+ if (locked > lock_limit)
+ goto out;
+
+ /* we may lock at most half of physical memory... */
+ /* (this check is pretty bogus, but doesn't hurt) */
+ if (locked > num_physpages/2)
+ goto out;
+
+ error = do_mlock(start, len, 1);
+out:
+ up_write(&current->mm->mmap_sem);
+ return error;
+}
+
+asmlinkage long sys_munlock(unsigned long start, size_t len)
+{
+ int ret;
+
+ down_write(&current->mm->mmap_sem);
+ len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
+ start &= PAGE_MASK;
+ ret = do_mlock(start, len, 0);
+ up_write(&current->mm->mmap_sem);
+ return ret;
+}
+
+static int do_mlockall(int flags)
+{
+ int error;
+ unsigned int def_flags;
+ struct vm_area_struct * vma;
+
+ if (!capable(CAP_IPC_LOCK))
+ return -EPERM;
+
+ def_flags = 0;
+ if (flags & MCL_FUTURE)
+ def_flags = VM_LOCKED;
+ current->mm->def_flags = def_flags;
+
+ error = 0;
+ for (vma = current->mm->mmap; vma ; vma = vma->vm_next) {
+ unsigned int newflags;
+
+ newflags = vma->vm_flags | VM_LOCKED;
+ if (!(flags & MCL_CURRENT))
+ newflags &= ~VM_LOCKED;
+ error = mlock_fixup(vma, vma->vm_start, vma->vm_end, newflags);
+ if (error)
+ break;
+ }
+ return error;
+}
+
+asmlinkage long sys_mlockall(int flags)
+{
+ unsigned long lock_limit;
+ int ret = -EINVAL;
+
+ down_write(&current->mm->mmap_sem);
+ if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
+ goto out;
+
+ lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit >>= PAGE_SHIFT;
+
+ ret = -ENOMEM;
+ if (current->mm->total_vm > lock_limit)
+ goto out;
+
+ /* we may lock at most half of physical memory... */
+ /* (this check is pretty bogus, but doesn't hurt) */
+ if (current->mm->total_vm > num_physpages/2)
+ goto out;
+
+ ret = do_mlockall(flags);
+out:
+ up_write(&current->mm->mmap_sem);
+ return ret;
+}
+
+asmlinkage long sys_munlockall(void)
+{
+ int ret;
+
+ down_write(&current->mm->mmap_sem);
+ ret = do_mlockall(0);
+ up_write(&current->mm->mmap_sem);
+ return ret;
+}
diff --git a/uClinux-2.4.31-uc0/mm/mmap.c b/uClinux-2.4.31-uc0/mm/mmap.c
new file mode 100644
index 0000000..e158d87
--- /dev/null
+++ b/uClinux-2.4.31-uc0/mm/mmap.c
@@ -0,0 +1,1256 @@
+/*
+ * linux/mm/mmap.c
+ *
+ * Modifications for TLB sharing in Linux (ARM/IA-64) (c) 2001 Adam Wiggins
+ *
+ * Written by obz.
+ */
+#include <linux/slab.h>
+#include <linux/shm.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/swapctl.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/personality.h>
+#include <linux/mount.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+
+/*
+ * WARNING: the debugging will use recursive algorithms so never enable this
+ * unless you know what you are doing.
+ */
+#undef DEBUG_MM_RB
+
+/* description of effects of mapping type and prot in current implementation.
+ * this is due to the limited x86 page protection hardware.
The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + */ +pgprot_t protection_map[16] = { + __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, + __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 +}; + +int sysctl_overcommit_memory; +int max_map_count = DEFAULT_MAX_MAP_COUNT; + +/* Check that a process has enough memory to allocate a + * new virtual mapping. + */ +int vm_enough_memory(long pages) +{ + /* Stupid algorithm to decide if we have enough memory: while + * simple, it hopefully works in most obvious cases.. Easy to + * fool it, but this should catch most mistakes. + */ + /* 23/11/98 NJC: Somewhat less stupid version of algorithm, + * which tries to do "TheRightThing". Instead of using half of + * (buffers+cache), use the minimum values. Allow an extra 2% + * of num_physpages for safety margin. + */ + + unsigned long free; + + /* Sometimes we want to use more memory than we have. */ + if (sysctl_overcommit_memory) + return 1; + + /* The page cache contains buffer pages these days.. */ + free = page_cache_size; + free += nr_free_pages(); + free += nr_swap_pages; + + /* + * This double-counts: the nrpages are both in the page-cache + * and in the swapper space. At the same time, this compensates + * for the swap-space over-allocation (ie "nr_swap_pages" being + * too small. + */ + free += swapper_space.nrpages; + + /* + * The code below doesn't account for free space in the inode + * and dentry slab cache, slab cache fragmentation, inodes and + * dentries which will become freeable under VM load, etc. + * Lets just hope all these (complex) factors balance out... + */ + free += (dentry_stat.nr_unused * sizeof(struct dentry)) >> PAGE_SHIFT; + free += (inodes_stat.nr_unused * sizeof(struct inode)) >> PAGE_SHIFT; + + return free > pages; +} + + +#ifndef HAVE_ARCH_VM_SHARING_DATA + +/* Dummy function for default */ +static inline void +arch_remove_shared_vm_struct(struct vm_area_struct* vma_p){} + +#else + +extern void arch_remove_shared_vm_struct(struct vm_area_struct* vma_p); + +#endif + +/* Remove one vm structure from the inode's i_mapping address space. 
*/ +static inline void __remove_shared_vm_struct(struct vm_area_struct *vma) +{ + struct file * file = vma->vm_file; + + if (file) { + struct inode *inode = file->f_dentry->d_inode; + if (vma->vm_flags & VM_DENYWRITE) + atomic_inc(&inode->i_writecount); + if(vma->vm_next_share) + vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share; + *vma->vm_pprev_share = vma->vm_next_share; + } + + /* Deal with vm_sharing_data */ + arch_remove_shared_vm_struct(vma); +} + +static inline void remove_shared_vm_struct(struct vm_area_struct *vma) +{ + lock_vma_mappings(vma); + __remove_shared_vm_struct(vma); + unlock_vma_mappings(vma); +} + +void lock_vma_mappings(struct vm_area_struct *vma) +{ + struct address_space *mapping; + + mapping = NULL; + if (vma->vm_file) + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + if (mapping) + spin_lock(&mapping->i_shared_lock); +} + +void unlock_vma_mappings(struct vm_area_struct *vma) +{ + struct address_space *mapping; + + mapping = NULL; + if (vma->vm_file) + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + if (mapping) + spin_unlock(&mapping->i_shared_lock); +} + +/* + * sys_brk() for the most part doesn't need the global kernel + * lock, except when an application is doing something nasty + * like trying to un-brk an area that has already been mapped + * to a regular file. in this case, the unmapping will need + * to invoke file system routines that need the global lock. + */ +asmlinkage unsigned long sys_brk(unsigned long brk) +{ + unsigned long rlim, retval; + unsigned long newbrk, oldbrk; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + + if (brk < mm->end_code) + goto out; + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); + if (oldbrk == newbrk) + goto set_brk; + + /* Always allow shrinking brk. */ + if (brk <= mm->brk) { + if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + goto set_brk; + goto out; + } + + /* Check against rlimit.. */ + rlim = current->rlim[RLIMIT_DATA].rlim_cur; + if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) + goto out; + + /* Check against existing mmap mappings. */ + if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + goto out; + + /* Check if we have enough memory.. */ + if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT)) + goto out; + + /* Ok, looks good - let it rip. */ + if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + goto out; +set_brk: + mm->brk = brk; +out: + retval = mm->brk; + up_write(&mm->mmap_sem); + return retval; +} + +/* Combine the mmap "prot" and "flags" argument into one "vm_flags" used + * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits + * into "VM_xxx". 
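+ *
+ * e.g. PROT_READ|PROT_WRITE maps to VM_READ|VM_WRITE, and MAP_GROWSDOWN
+ * maps to VM_GROWSDOWN.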
+ */ +static inline unsigned long calc_vm_flags(unsigned long prot, unsigned long flags) +{ +#define _trans(x,bit1,bit2) \ +((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0) + + unsigned long prot_bits, flag_bits; + prot_bits = + _trans(prot, PROT_READ, VM_READ) | + _trans(prot, PROT_WRITE, VM_WRITE) | + _trans(prot, PROT_EXEC, VM_EXEC); + flag_bits = + _trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) | + _trans(flags, MAP_DENYWRITE, VM_DENYWRITE) | + _trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE); + return prot_bits | flag_bits; +#undef _trans +} + +#ifdef DEBUG_MM_RB +static int browse_rb(rb_node_t * rb_node) { + int i = 0; + if (rb_node) { + i++; + i += browse_rb(rb_node->rb_left); + i += browse_rb(rb_node->rb_right); + } + return i; +} + +static void validate_mm(struct mm_struct * mm) { + int bug = 0; + int i = 0; + struct vm_area_struct * tmp = mm->mmap; + while (tmp) { + tmp = tmp->vm_next; + i++; + } + if (i != mm->map_count) + printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; + i = browse_rb(mm->mm_rb.rb_node); + if (i != mm->map_count) + printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; + if (bug) + BUG(); +} +#else +#define validate_mm(mm) do { } while (0) +#endif + +static struct vm_area_struct * find_vma_prepare(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct ** pprev, + rb_node_t *** rb_link, rb_node_t ** rb_parent) +{ + struct vm_area_struct * vma; + rb_node_t ** __rb_link, * __rb_parent, * rb_prev; + + __rb_link = &mm->mm_rb.rb_node; + rb_prev = __rb_parent = NULL; + vma = NULL; + + while (*__rb_link) { + struct vm_area_struct *vma_tmp; + + __rb_parent = *__rb_link; + vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + return vma; + __rb_link = &__rb_parent->rb_left; + } else { + rb_prev = __rb_parent; + __rb_link = &__rb_parent->rb_right; + } + } + + *pprev = NULL; + if (rb_prev) + *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); + *rb_link = __rb_link; + *rb_parent = __rb_parent; + return vma; +} + +static inline void __vma_link_list(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, + rb_node_t * rb_parent) +{ + if (prev) { + vma->vm_next = prev->vm_next; + prev->vm_next = vma; + } else { + mm->mmap = vma; + if (rb_parent) + vma->vm_next = rb_entry(rb_parent, struct vm_area_struct, vm_rb); + else + vma->vm_next = NULL; + } +} + +static inline void __vma_link_rb(struct mm_struct * mm, struct vm_area_struct * vma, + rb_node_t ** rb_link, rb_node_t * rb_parent) +{ + rb_link_node(&vma->vm_rb, rb_parent, rb_link); + rb_insert_color(&vma->vm_rb, &mm->mm_rb); +} + +static inline void __vma_link_file(struct vm_area_struct * vma) +{ + struct file * file; + + file = vma->vm_file; + if (file) { + struct inode * inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct vm_area_struct **head; + + if (vma->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + + head = &mapping->i_mmap; + if (vma->vm_flags & VM_SHARED) + head = &mapping->i_mmap_shared; + + /* insert vma into inode's share list */ + if((vma->vm_next_share = *head) != NULL) + (*head)->vm_pprev_share = &vma->vm_next_share; + *head = vma; + vma->vm_pprev_share = head; + } +} + +static void __vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, + rb_node_t ** rb_link, rb_node_t * rb_parent) +{ + __vma_link_list(mm, vma, prev, rb_parent); + __vma_link_rb(mm, vma, rb_link, rb_parent); + 
__vma_link_file(vma); +} + +static inline void vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, + rb_node_t ** rb_link, rb_node_t * rb_parent) +{ + lock_vma_mappings(vma); + spin_lock(&mm->page_table_lock); + __vma_link(mm, vma, prev, rb_link, rb_parent); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(vma); + + mm->map_count++; + validate_mm(mm); +} + +static int vma_merge(struct mm_struct * mm, struct vm_area_struct * prev, + rb_node_t * rb_parent, unsigned long addr, unsigned long end, unsigned long vm_flags) +{ + spinlock_t * lock = &mm->page_table_lock; + if (!prev) { + prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb); + goto merge_next; + } + if (prev->vm_end == addr && can_vma_merge(prev, vm_flags)) { + struct vm_area_struct * next; + + spin_lock(lock); + prev->vm_end = end; + next = prev->vm_next; + if (next && prev->vm_end == next->vm_start && can_vma_merge(next, vm_flags)) { + prev->vm_end = next->vm_end; + __vma_unlink(mm, next, prev); + spin_unlock(lock); + + mm->map_count--; + kmem_cache_free(vm_area_cachep, next); + return 1; + } + spin_unlock(lock); + return 1; + } + + prev = prev->vm_next; + if (prev) { + merge_next: + if (!can_vma_merge(prev, vm_flags)) + return 0; + if (end == prev->vm_start) { + spin_lock(lock); + prev->vm_start = addr; + spin_unlock(lock); + return 1; + } + } + + return 0; +} + +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, unsigned long pgoff) +{ + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + unsigned int vm_flags; + int correct_wcount = 0; + int error; + rb_node_t ** rb_link, * rb_parent; + + if (file) { + if (!file->f_op || !file->f_op->mmap) + return -ENODEV; + + if ((prot & PROT_EXEC) && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)) + return -EPERM; + } + + if (!len) + return addr; + + len = PAGE_ALIGN(len); + + if (len > TASK_SIZE || len == 0) + return -EINVAL; + + /* offset overflow? */ + if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + return -EINVAL; + + /* Too many mappings? */ + if (mm->map_count > max_map_count) + return -ENOMEM; + + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + addr = get_unmapped_area(file, addr, len, pgoff, flags); + if (addr & ~PAGE_MASK) + return addr; + + /* Do simple checking here so the lower-level routines won't have + * to. we assume access permissions have been handled by the open + * of the memory object, so we don't do any here. + */ + vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + /* mlock MCL_FUTURE? */ + if (vm_flags & VM_LOCKED) { + unsigned long locked = mm->locked_vm << PAGE_SHIFT; + locked += len; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; + } + + if (file) { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + if ((prot & PROT_WRITE) && !(file->f_mode & FMODE_WRITE)) + return -EACCES; + + /* Make sure we don't allow writing to an append-only file.. */ + if (IS_APPEND(file->f_dentry->d_inode) && (file->f_mode & FMODE_WRITE)) + return -EACCES; + + /* make sure there are no mandatory locks on the file. 
*/ + if (locks_verify_locked(file->f_dentry->d_inode)) + return -EAGAIN; + + vm_flags |= VM_SHARED | VM_MAYSHARE; + if (!(file->f_mode & FMODE_WRITE)) + vm_flags &= ~(VM_MAYWRITE | VM_SHARED); + + /* fall through */ + case MAP_PRIVATE: + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + break; + + default: + return -EINVAL; + } + } else { + vm_flags |= VM_SHARED | VM_MAYSHARE; + switch (flags & MAP_TYPE) { + default: + return -EINVAL; + case MAP_PRIVATE: + vm_flags &= ~(VM_SHARED | VM_MAYSHARE); + /* fall through */ + case MAP_SHARED: + break; + } + } + + /* Clear old maps */ +munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limit. */ + if ((mm->total_vm << PAGE_SHIFT) + len + > current->rlim[RLIMIT_AS].rlim_cur) + return -ENOMEM; + + /* Private writable mapping? Check memory availability.. */ + if ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && + !(flags & MAP_NORESERVE) && + !vm_enough_memory(len >> PAGE_SHIFT)) + return -ENOMEM; + + /* Can we just expand an old anonymous mapping? */ + if (!file && !(vm_flags & VM_SHARED) && rb_parent) + if (vma_merge(mm, prev, rb_parent, addr, addr + len, vm_flags)) + goto out; + + /* Determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) + return -ENOMEM; + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_flags = vm_flags; + vma->vm_page_prot = protection_map[vm_flags & 0x0f]; + vma->vm_ops = NULL; + vma->vm_pgoff = pgoff; + vma->vm_file = NULL; + vma->vm_private_data = NULL; + vma->vm_sharing_data = NULL; + vma->vm_raend = 0; + + if (file) { + error = -EINVAL; + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + goto free_vma; + if (vm_flags & VM_DENYWRITE) { + error = deny_write_access(file); + if (error) + goto free_vma; + correct_wcount = 1; + } + vma->vm_file = file; + get_file(file); + error = file->f_op->mmap(file, vma); + if (error) + goto unmap_and_free_vma; + } else if (flags & MAP_SHARED) { + error = shmem_zero_setup(vma); + if (error) + goto free_vma; + } + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + if (addr != vma->vm_start) { + /* + * It is a bit too late to pretend changing the virtual + * area of the mapping, we just corrupted userspace + * in the do_munmap, so FIXME (not in 2.4 to avoid breaking + * the driver API). + */ + struct vm_area_struct * stale_vma; + /* Since addr changed, we rely on the mmap op to prevent + * collisions with existing vmas and just use find_vma_prepare + * to update the tree pointers. + */ + addr = vma->vm_start; + stale_vma = find_vma_prepare(mm, addr, &prev, + &rb_link, &rb_parent); + /* + * Make sure the lowlevel driver did its job right. + */ + if (unlikely(stale_vma && stale_vma->vm_start < vma->vm_end)) { + printk(KERN_ERR "buggy mmap operation: [<%p>]\n", + file ? 
file->f_op->mmap : NULL); + BUG(); + } + } + + vma_link(mm, vma, prev, rb_link, rb_parent); + if (correct_wcount) + atomic_inc(&file->f_dentry->d_inode->i_writecount); + +out: + mm->total_vm += len >> PAGE_SHIFT; + if (vm_flags & VM_LOCKED) { + mm->locked_vm += len >> PAGE_SHIFT; + make_pages_present(addr, addr + len); + } + return addr; + +unmap_and_free_vma: + if (correct_wcount) + atomic_inc(&file->f_dentry->d_inode->i_writecount); + vma->vm_file = NULL; + fput(file); + + /* Undo any partial mapping done by a device driver. */ + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); +free_vma: + kmem_cache_free(vm_area_cachep, vma); + return error; +} + +/* Get an address range which is currently unmapped. + * For shmat() with addr=0. + * + * Ugly calling convention alert: + * Return value with the low bits set means error value, + * ie + * if (ret & ~PAGE_MASK) + * error = ret; + * + * This function "knows" that -ENOMEM has the bits set. + */ +#ifndef HAVE_ARCH_UNMAPPED_AREA +static inline unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int found_hole = 0; + + if (len > TASK_SIZE) + return -ENOMEM; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + addr = PAGE_ALIGN(TASK_UNMAPPED_BASE); + + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (TASK_SIZE - len < addr) + return -ENOMEM; + if (!vma || addr + len <= vma->vm_start) + return addr; + addr = vma->vm_end; + } +} +#else +extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +#endif + +unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) +{ + unsigned long retval; + + if (flags & MAP_FIXED) { + if (addr > TASK_SIZE - len || addr >= TASK_SIZE) + return -ENOMEM; + if (addr & ~PAGE_MASK) + return -EINVAL; + return addr; + } + + if (file && file->f_op && file->f_op->get_unmapped_area) { + retval = file->f_op->get_unmapped_area(file, addr, len, + pgoff, flags); + /* -ENOSYS will be returned if the device-specific driver + * does not implement this function. e.g. framebuffer drivers + */ + if (retval != -ENOSYS) + return retval; + } + + return arch_get_unmapped_area(file, addr, len, pgoff, flags); +} + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct *vma = NULL; + + if (mm) { + /* Check the cache first. */ + /* (Cache hit rate is typically around 35%.) */ + vma = mm->mmap_cache; + if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { + rb_node_t * rb_node; + + rb_node = mm->mm_rb.rb_node; + vma = NULL; + + while (rb_node) { + struct vm_area_struct * vma_tmp; + + vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; + } + if (vma) + mm->mmap_cache = vma; + } + } + return vma; +} + +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. 
*/ +struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + if (mm) { + /* Go through the RB tree quickly. */ + struct vm_area_struct * vma; + rb_node_t * rb_node, * rb_last_right, * rb_prev; + + rb_node = mm->mm_rb.rb_node; + rb_last_right = rb_prev = NULL; + vma = NULL; + + while (rb_node) { + struct vm_area_struct * vma_tmp; + + vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + rb_prev = rb_last_right; + if (vma_tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else { + rb_last_right = rb_node; + rb_node = rb_node->rb_right; + } + } + if (vma) { + if (vma->vm_rb.rb_left) { + rb_prev = vma->vm_rb.rb_left; + while (rb_prev->rb_right) + rb_prev = rb_prev->rb_right; + } + *pprev = NULL; + if (rb_prev) + *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); + if ((rb_prev ? (*pprev)->vm_next : mm->mmap) != vma) + BUG(); + return vma; + } + } + *pprev = NULL; + return NULL; +} + +struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct * vma; + unsigned long start; + + addr &= PAGE_MASK; + vma = find_vma(mm,addr); + if (!vma) + return NULL; + if (vma->vm_start <= addr) + return vma; + if (!(vma->vm_flags & VM_GROWSDOWN)) + return NULL; + start = vma->vm_start; + if (expand_stack(vma, addr)) + return NULL; + if (vma->vm_flags & VM_LOCKED) { + make_pages_present(addr, start); + } + return vma; +} + +/* Normal function to fix up a mapping + * This function is the default for when an area has no specific + * function. This may be used as part of a more specific routine. + * This function works out what part of an area is affected and + * adjusts the mapping information. Since the actual page + * manipulation is done in do_mmap(), none need be done here, + * though it would probably be more appropriate. + * + * By the time this function is called, the area struct has been + * removed from the process mapping list, so it needs to be + * reinserted if necessary. + * + * The 4 main cases are: + * Unmapping the whole area + * Unmapping from the start of the segment to a point in it + * Unmapping from an intermediate point to the end + * Unmapping between to intermediate points, making a hole. + * + * Case 4 involves the creation of 2 new areas, for each side of + * the hole. If possible, we reuse the existing area rather than + * allocate a new one, and the return indicates whether the old + * area was reused. + */ +static struct vm_area_struct * unmap_fixup(struct mm_struct *mm, + struct vm_area_struct *area, unsigned long addr, size_t len, + struct vm_area_struct *extra) +{ + struct vm_area_struct *mpnt; + unsigned long end = addr + len; + + area->vm_mm->total_vm -= len >> PAGE_SHIFT; + if (area->vm_flags & VM_LOCKED) + area->vm_mm->locked_vm -= len >> PAGE_SHIFT; + + /* Unmapping the whole area. */ + if (addr == area->vm_start && end == area->vm_end) { + if (area->vm_ops && area->vm_ops->close) + area->vm_ops->close(area); + if (area->vm_file) + fput(area->vm_file); + kmem_cache_free(vm_area_cachep, area); + return extra; + } + + /* Work out to one of the ends. */ + if (end == area->vm_end) { + /* + * here area isn't visible to the semaphore-less readers + * so we don't need to update it under the spinlock. 
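+ * (do_munmap() already unlinked 'area' from the mm's vma list and
+ * rb-tree before calling us.)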
+ */ + area->vm_end = addr; + lock_vma_mappings(area); + spin_lock(&mm->page_table_lock); + } else if (addr == area->vm_start) { + area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT; + /* same locking considerations of the above case */ + area->vm_start = end; + lock_vma_mappings(area); + spin_lock(&mm->page_table_lock); + } else { + /* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */ + /* Add end mapping -- leave beginning for below */ + mpnt = extra; + extra = NULL; + + mpnt->vm_mm = area->vm_mm; + mpnt->vm_start = end; + mpnt->vm_end = area->vm_end; + mpnt->vm_page_prot = area->vm_page_prot; + mpnt->vm_flags = area->vm_flags; + mpnt->vm_raend = 0; + mpnt->vm_ops = area->vm_ops; + mpnt->vm_pgoff = area->vm_pgoff + ((end - area->vm_start) >> PAGE_SHIFT); + mpnt->vm_file = area->vm_file; + mpnt->vm_private_data = area->vm_private_data; + mpnt->vm_sharing_data = NULL; + if (mpnt->vm_file) + get_file(mpnt->vm_file); + if (mpnt->vm_ops && mpnt->vm_ops->open) + mpnt->vm_ops->open(mpnt); + area->vm_end = addr; /* Truncate area */ + + /* Because mpnt->vm_file == area->vm_file this locks + * things correctly. + */ + lock_vma_mappings(area); + spin_lock(&mm->page_table_lock); + __insert_vm_struct(mm, mpnt); + } + + __insert_vm_struct(mm, area); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(area); + return extra; +} + +/* + * Try to free as many page directory entries as we can, + * without having to work very hard at actually scanning + * the page tables themselves. + * + * Right now we try to free page tables if we have a nice + * PGDIR-aligned area that got free'd up. We could be more + * granular if we want to, but this is fast and simple, + * and covers the bad cases. + * + * "prev", if it exists, points to a vma before the one + * we just free'd - but there's no telling how much before. + */ +static void free_pgtables(struct mm_struct * mm, struct vm_area_struct *prev, + unsigned long start, unsigned long end) +{ + unsigned long first = start & PGDIR_MASK; + unsigned long last = end + PGDIR_SIZE - 1; + unsigned long start_index, end_index; + + if (!prev) { + prev = mm->mmap; + if (!prev) + goto no_mmaps; + if (prev->vm_end > start) { + if (last > prev->vm_start) + last = prev->vm_start; + goto no_mmaps; + } + } + for (;;) { + struct vm_area_struct *next = prev->vm_next; + + if (next) { + if (next->vm_start < start) { + prev = next; + continue; + } + if (last > next->vm_start) + last = next->vm_start; + } + if (prev->vm_end > first) + first = prev->vm_end + PGDIR_SIZE - 1; + break; + } +no_mmaps: + if (last < first) + return; + /* + * If the PGD bits are not consecutive in the virtual address, the + * old method of shifting the VA >> by PGDIR_SHIFT doesn't work. + */ + start_index = pgd_index(first); + end_index = pgd_index(last); + if (end_index > start_index) { + clear_page_tables(mm, start_index, end_index - start_index); + flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK); + } +} + +/* Munmap is split into 2 main parts -- this part which finds + * what needs doing, and the areas themselves, which do the + * work. This now handles partial unmappings. 
+ * Jeremy Fitzhardine <jeremy@sw.oz.au> + */ +int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) +{ + struct vm_area_struct *mpnt, *prev, **npp, *free, *extra; + + if ((addr & ~PAGE_MASK) || addr >= TASK_SIZE || len > TASK_SIZE-addr) + return -EINVAL; + + if ((len = PAGE_ALIGN(len)) == 0) + return -EINVAL; + + /* Check if this memory area is ok - put it on the temporary + * list if so.. The checks here are pretty simple -- + * every area affected in some way (by any overlap) is put + * on the list. If nothing is put on, nothing is affected. + */ + mpnt = find_vma_prev(mm, addr, &prev); + if (!mpnt) + return 0; + /* we have addr < mpnt->vm_end */ + + if (mpnt->vm_start >= addr+len) + return 0; + + /* If we'll make "hole", check the vm areas limit */ + if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) + && mm->map_count >= max_map_count) + return -ENOMEM; + + /* + * We may need one additional vma to fix up the mappings ... + * and this is the last chance for an easy error exit. + */ + extra = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!extra) + return -ENOMEM; + + npp = (prev ? &prev->vm_next : &mm->mmap); + free = NULL; + spin_lock(&mm->page_table_lock); + for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) { + *npp = mpnt->vm_next; + mpnt->vm_next = free; + free = mpnt; + rb_erase(&mpnt->vm_rb, &mm->mm_rb); + } + mm->mmap_cache = NULL; /* Kill the cache. */ + spin_unlock(&mm->page_table_lock); + + /* Ok - we have the memory areas we should free on the 'free' list, + * so release them, and unmap the page range.. + * If the one of the segments is only being partially unmapped, + * it will put new vm_area_struct(s) into the address space. + * In that case we have to be careful with VM_DENYWRITE. + */ + while ((mpnt = free) != NULL) { + unsigned long st, end, size; + struct file *file = NULL; + + free = free->vm_next; + + st = addr < mpnt->vm_start ? mpnt->vm_start : addr; + end = addr+len; + end = end > mpnt->vm_end ? mpnt->vm_end : end; + size = end - st; + + if (mpnt->vm_flags & VM_DENYWRITE && + (st != mpnt->vm_start || end != mpnt->vm_end) && + (file = mpnt->vm_file) != NULL) { + atomic_dec(&file->f_dentry->d_inode->i_writecount); + } + remove_shared_vm_struct(mpnt); + mm->map_count--; + + zap_page_range(mm, st, size); + + /* + * Fix the mapping, and free the old area if it wasn't reused. + */ + extra = unmap_fixup(mm, mpnt, st, size, extra); + if (file) + atomic_inc(&file->f_dentry->d_inode->i_writecount); + } + validate_mm(mm); + + /* Release the extra vma struct if it wasn't used */ + if (extra) + kmem_cache_free(vm_area_cachep, extra); + + free_pgtables(mm, prev, addr, addr+len); + + return 0; +} + +asmlinkage long sys_munmap(unsigned long addr, size_t len) +{ + int ret; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; +} + + +static inline void verify_mmap_write_lock_held(struct mm_struct *mm) +{ + if (down_read_trylock(&mm->mmap_sem)) { + WARN_ON(1); + up_read(&mm->mmap_sem); + } +} + +/* + * this is really a simplified "do_mmap". it only handles + * anonymous maps. eventually we may be able to do some + * brk-specific accounting here. 
+ */ +unsigned long do_brk(unsigned long addr, unsigned long len) +{ + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + unsigned long flags; + rb_node_t ** rb_link, * rb_parent; + + len = PAGE_ALIGN(len); + if (!len) + return addr; + + if ((addr + len) > TASK_SIZE || (addr + len) < addr) + return -EINVAL; + + /* + * mlock MCL_FUTURE? + */ + if (mm->def_flags & VM_LOCKED) { + unsigned long locked = mm->locked_vm << PAGE_SHIFT; + locked += len; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; + } + + /* + * mm->mmap_sem is required to protect against another thread + * changing the mappings while we sleep (on kmalloc for one). + */ + verify_mmap_write_lock_held(mm); + + /* + * Clear old maps. this also does some error checking for us + */ + munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limits *after* clearing old maps... */ + if ((mm->total_vm << PAGE_SHIFT) + len + > current->rlim[RLIMIT_AS].rlim_cur) + return -ENOMEM; + + if (mm->map_count > max_map_count) + return -ENOMEM; + + if (!vm_enough_memory(len >> PAGE_SHIFT)) + return -ENOMEM; + + flags = VM_DATA_DEFAULT_FLAGS | mm->def_flags; + + /* Can we just expand an old anonymous mapping? */ + if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len, flags)) + goto out; + + /* + * create a vma struct for an anonymous mapping + */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) + return -ENOMEM; + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_flags = flags; + vma->vm_page_prot = protection_map[flags & 0x0f]; + vma->vm_ops = NULL; + vma->vm_pgoff = 0; + vma->vm_file = NULL; + vma->vm_private_data = NULL; + vma->vm_sharing_data = NULL; + + vma_link(mm, vma, prev, rb_link, rb_parent); + +out: + mm->total_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) { + mm->locked_vm += len >> PAGE_SHIFT; + make_pages_present(addr, addr + len); + } + return addr; +} + +/* Build the RB tree corresponding to the VMA list. */ +void build_mmap_rb(struct mm_struct * mm) +{ + struct vm_area_struct * vma; + rb_node_t ** rb_link, * rb_parent; + + mm->mm_rb = RB_ROOT; + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + __vma_link_rb(mm, vma, rb_link, rb_parent); + rb_parent = &vma->vm_rb; + rb_link = &rb_parent->rb_right; + } +} + +/* Release all mmaps. 
*/ +void exit_mmap(struct mm_struct * mm) +{ + struct vm_area_struct * mpnt; + + release_segments(mm); + spin_lock(&mm->page_table_lock); + mpnt = mm->mmap; + mm->mmap = mm->mmap_cache = NULL; + mm->mm_rb = RB_ROOT; + mm->rss = 0; + spin_unlock(&mm->page_table_lock); + mm->total_vm = 0; + mm->locked_vm = 0; + + flush_cache_mm(mm); + while (mpnt) { + struct vm_area_struct * next = mpnt->vm_next; + unsigned long start = mpnt->vm_start; + unsigned long end = mpnt->vm_end; + unsigned long size = end - start; + + if (mpnt->vm_ops) { + if (mpnt->vm_ops->close) + mpnt->vm_ops->close(mpnt); + } + mm->map_count--; + remove_shared_vm_struct(mpnt); + zap_page_range(mm, start, size); + if (mpnt->vm_file) + fput(mpnt->vm_file); + kmem_cache_free(vm_area_cachep, mpnt); + mpnt = next; + } + + /* This is just debugging */ + if (mm->map_count) + BUG(); + + clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD); + + flush_tlb_mm(mm); +} + +/* Insert vm structure into process list sorted by address + * and into the inode's i_mmap ring. If vm_file is non-NULL + * then the i_shared_lock must be held here. + */ +void __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + rb_node_t ** rb_link, * rb_parent; + + __vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + BUG(); + __vma_link(mm, vma, prev, rb_link, rb_parent); + mm->map_count++; + validate_mm(mm); +} + +int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + rb_node_t ** rb_link, * rb_parent; + + __vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + return -ENOMEM; + vma_link(mm, vma, prev, rb_link, rb_parent); + validate_mm(mm); + return 0; +} diff --git a/uClinux-2.4.31-uc0/mm/mprotect.c b/uClinux-2.4.31-uc0/mm/mprotect.c new file mode 100644 index 0000000..cd45ed1 --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/mprotect.c @@ -0,0 +1,337 @@ +/* + * linux/mm/mprotect.c + * + * (C) Copyright 1994 Linus Torvalds + */ +#include <linux/slab.h> +#include <linux/smp_lock.h> +#include <linux/shm.h> +#include <linux/mman.h> + +#include <asm/uaccess.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> + +static inline void change_pte_range(pmd_t * pmd, unsigned long address, + unsigned long size, pgprot_t newprot) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*pmd)) + return; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return; + } + pte = pte_offset(pmd, address); + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + if (pte_present(*pte)) { + pte_t entry; + + /* Avoid an SMP race with hardware updated dirty/clean + * bits by wiping the pte and then setting the new pte + * into place. 
+ */ + entry = ptep_get_and_clear(pte); + set_pte(pte, pte_modify(entry, newprot)); + } + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +static inline void change_pmd_range(pgd_t * pgd, unsigned long address, + unsigned long size, pgprot_t newprot) +{ + pmd_t * pmd; + unsigned long end; + + if (pgd_none(*pgd)) + return; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + return; + } + pmd = pmd_offset(pgd, address); + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + change_pte_range(pmd, address, end - address, newprot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); +} + +static void change_protection(unsigned long start, unsigned long end, pgprot_t newprot) +{ + pgd_t *dir; + unsigned long beg = start; + + dir = pgd_offset(current->mm, start); + flush_cache_range(current->mm, beg, end); + if (start >= end) + BUG(); + spin_lock(&current->mm->page_table_lock); + do { + change_pmd_range(dir, start, end - start, newprot); + start = (start + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (start && (start < end)); + spin_unlock(&current->mm->page_table_lock); + flush_tlb_range(current->mm, beg, end); + return; +} + +static inline int mprotect_fixup_all(struct vm_area_struct * vma, struct vm_area_struct ** pprev, + int newflags, pgprot_t prot) +{ + struct vm_area_struct * prev = *pprev; + struct mm_struct * mm = vma->vm_mm; + + if (prev && prev->vm_end == vma->vm_start && can_vma_merge(prev, newflags) && + !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = vma->vm_end; + __vma_unlink(mm, vma, prev); + spin_unlock(&mm->page_table_lock); + + kmem_cache_free(vm_area_cachep, vma); + mm->map_count--; + + return 0; + } + + spin_lock(&mm->page_table_lock); + vma->vm_flags = newflags; + vma->vm_page_prot = prot; + spin_unlock(&mm->page_table_lock); + + *pprev = vma; + + return 0; +} + +static inline int mprotect_fixup_start(struct vm_area_struct * vma, struct vm_area_struct ** pprev, + unsigned long end, + int newflags, pgprot_t prot) +{ + struct vm_area_struct * n, * prev = *pprev; + + *pprev = vma; + + if (prev && prev->vm_end == vma->vm_start && can_vma_merge(prev, newflags) && + !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&vma->vm_mm->page_table_lock); + prev->vm_end = end; + vma->vm_start = end; + spin_unlock(&vma->vm_mm->page_table_lock); + + return 0; + } + n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!n) + return -ENOMEM; + *n = *vma; + n->vm_end = end; + n->vm_flags = newflags; + n->vm_raend = 0; + n->vm_page_prot = prot; + if (n->vm_file) + get_file(n->vm_file); + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT; + lock_vma_mappings(vma); + spin_lock(&vma->vm_mm->page_table_lock); + vma->vm_start = end; + __insert_vm_struct(current->mm, n); + spin_unlock(&vma->vm_mm->page_table_lock); + unlock_vma_mappings(vma); + + return 0; +} + +static inline int mprotect_fixup_end(struct vm_area_struct * vma, struct vm_area_struct ** pprev, + unsigned long start, + int newflags, pgprot_t prot) +{ + struct vm_area_struct * n; + + n = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!n) + return -ENOMEM; + *n = *vma; + n->vm_start = start; + n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT; + n->vm_flags = newflags; + n->vm_raend = 0; + n->vm_page_prot = prot; + if (n->vm_file) + get_file(n->vm_file); + if (n->vm_ops &&
n->vm_ops->open) + n->vm_ops->open(n); + lock_vma_mappings(vma); + spin_lock(&vma->vm_mm->page_table_lock); + vma->vm_end = start; + __insert_vm_struct(current->mm, n); + spin_unlock(&vma->vm_mm->page_table_lock); + unlock_vma_mappings(vma); + + *pprev = n; + + return 0; +} + +static inline int mprotect_fixup_middle(struct vm_area_struct * vma, struct vm_area_struct ** pprev, + unsigned long start, unsigned long end, + int newflags, pgprot_t prot) +{ + struct vm_area_struct * left, * right; + + left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!left) + return -ENOMEM; + right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!right) { + kmem_cache_free(vm_area_cachep, left); + return -ENOMEM; + } + *left = *vma; + *right = *vma; + left->vm_end = start; + right->vm_start = end; + right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT; + left->vm_raend = 0; + right->vm_raend = 0; + if (vma->vm_file) + atomic_add(2,&vma->vm_file->f_count); + if (vma->vm_ops && vma->vm_ops->open) { + vma->vm_ops->open(left); + vma->vm_ops->open(right); + } + vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT; + vma->vm_raend = 0; + vma->vm_page_prot = prot; + lock_vma_mappings(vma); + spin_lock(&vma->vm_mm->page_table_lock); + vma->vm_start = start; + vma->vm_end = end; + vma->vm_flags = newflags; + __insert_vm_struct(current->mm, left); + __insert_vm_struct(current->mm, right); + spin_unlock(&vma->vm_mm->page_table_lock); + unlock_vma_mappings(vma); + + *pprev = right; + + return 0; +} + +static int mprotect_fixup(struct vm_area_struct * vma, struct vm_area_struct ** pprev, + unsigned long start, unsigned long end, unsigned int newflags) +{ + pgprot_t newprot; + int error; + + if (newflags == vma->vm_flags) { + *pprev = vma; + return 0; + } + newprot = protection_map[newflags & 0xf]; + if (start == vma->vm_start) { + if (end == vma->vm_end) + error = mprotect_fixup_all(vma, pprev, newflags, newprot); + else + error = mprotect_fixup_start(vma, pprev, end, newflags, newprot); + } else if (end == vma->vm_end) + error = mprotect_fixup_end(vma, pprev, start, newflags, newprot); + else + error = mprotect_fixup_middle(vma, pprev, start, end, newflags, newprot); + + if (error) + return error; + + change_protection(start, end, newprot); + return 0; +} + +asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot) +{ + unsigned long nstart, end, tmp; + struct vm_area_struct * vma, * next, * prev; + int error = -EINVAL; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = PAGE_ALIGN(len); + end = start + len; + if (end < start) + return -ENOMEM; + if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) + return -EINVAL; + if (end == start) + return 0; + + down_write(&current->mm->mmap_sem); + + vma = find_vma_prev(current->mm, start, &prev); + error = -ENOMEM; + if (!vma || vma->vm_start > start) + goto out; + + for (nstart = start ; ; ) { + unsigned int newflags; + int last = 0; + + /* Here we know that vma->vm_start <= nstart < vma->vm_end.
*/ + + newflags = prot | (vma->vm_flags & ~(PROT_READ | PROT_WRITE | PROT_EXEC)); + if ((newflags & ~(newflags >> 4)) & 0xf) { + error = -EACCES; + goto out; + } + + if (vma->vm_end > end) { + error = mprotect_fixup(vma, &prev, nstart, end, newflags); + goto out; + } + if (vma->vm_end == end) + last = 1; + + tmp = vma->vm_end; + next = vma->vm_next; + error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); + if (error) + goto out; + if (last) + break; + nstart = tmp; + vma = next; + if (!vma || vma->vm_start != nstart) { + error = -ENOMEM; + goto out; + } + } + if (next && prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags) && + !prev->vm_file && !(prev->vm_flags & VM_SHARED)) { + spin_lock(&prev->vm_mm->page_table_lock); + prev->vm_end = next->vm_end; + __vma_unlink(prev->vm_mm, next, prev); + spin_unlock(&prev->vm_mm->page_table_lock); + + kmem_cache_free(vm_area_cachep, next); + prev->vm_mm->map_count--; + } +out: + up_write(&current->mm->mmap_sem); + return error; +} diff --git a/uClinux-2.4.31-uc0/mm/mremap.c b/uClinux-2.4.31-uc0/mm/mremap.c new file mode 100644 index 0000000..416dd4b --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/mremap.c @@ -0,0 +1,383 @@ +/* + * linux/mm/remap.c + * + * (C) Copyright 1996 Linus Torvalds + */ + +#include <linux/slab.h> +#include <linux/smp_lock.h> +#include <linux/shm.h> +#include <linux/mman.h> +#include <linux/swap.h> + +#include <asm/uaccess.h> +#include <asm/pgalloc.h> + +extern int vm_enough_memory(long pages); + +static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr) +{ + pgd_t * pgd; + pmd_t * pmd; + pte_t * pte = NULL; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + goto end; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + goto end; + } + + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd)) + goto end; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + goto end; + } + + pte = pte_offset(pmd, addr); + if (pte_none(*pte)) + pte = NULL; +end: + return pte; +} + +static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr) +{ + pmd_t * pmd; + pte_t * pte = NULL; + + pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); + if (pmd) + pte = pte_alloc(mm, pmd, addr); + return pte; +} + +static inline int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst) +{ + int error = 0; + pte_t pte; + + if (!pte_none(*src)) { + pte = ptep_get_and_clear(src); + if (!dst) { + /* No dest? We must put it back. */ + dst = src; + error++; + } + set_pte(dst, pte); + } + return error; +} + +static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr) +{ + int error = 0; + pte_t * src, * dst; + + spin_lock(&mm->page_table_lock); + src = get_one_pte(mm, old_addr); + if (src) { + dst = alloc_one_pte(mm, new_addr); + src = get_one_pte(mm, old_addr); + if (src) + error = copy_one_pte(mm, src, dst); + } + spin_unlock(&mm->page_table_lock); + return error; +} + +static int move_page_tables(struct mm_struct * mm, + unsigned long new_addr, unsigned long old_addr, unsigned long len) +{ + unsigned long offset = len; + + flush_cache_range(mm, old_addr, old_addr + len); + + /* + * This is not the clever way to do this, but we're taking the + * easy way out on the assumption that most remappings will be + * only a few pages.. This also makes error recovery easier.
+ */ + while (offset) { + offset -= PAGE_SIZE; + if (move_one_page(mm, old_addr + offset, new_addr + offset)) + goto oops_we_failed; + } + flush_tlb_range(mm, old_addr, old_addr + len); + return 0; + + /* + * Ok, the move failed because we didn't have enough pages for + * the new page table tree. This is unlikely, but we have to + * take the possibility into account. In that case we just move + * all the pages back (this will work, because we still have + * the old page tables) + */ +oops_we_failed: + flush_cache_range(mm, new_addr, new_addr + len); + while ((offset += PAGE_SIZE) < len) + move_one_page(mm, new_addr + offset, old_addr + offset); + zap_page_range(mm, new_addr, len); + return -1; +} + +static inline unsigned long move_vma(struct vm_area_struct * vma, + unsigned long addr, unsigned long old_len, unsigned long new_len, + unsigned long new_addr) +{ + struct mm_struct * mm = vma->vm_mm; + struct vm_area_struct * new_vma, * next, * prev; + int allocated_vma; + + new_vma = NULL; + next = find_vma_prev(mm, new_addr, &prev); + if (next) { + if (prev && prev->vm_end == new_addr && + can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = new_addr + new_len; + spin_unlock(&mm->page_table_lock); + new_vma = prev; + if (next != prev->vm_next) + BUG(); + if (prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = next->vm_end; + __vma_unlink(mm, next, prev); + spin_unlock(&mm->page_table_lock); + + mm->map_count--; + kmem_cache_free(vm_area_cachep, next); + } + } else if (next->vm_start == new_addr + new_len && + can_vma_merge(next, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + next->vm_start = new_addr; + spin_unlock(&mm->page_table_lock); + new_vma = next; + } + } else { + prev = find_vma(mm, new_addr-1); + if (prev && prev->vm_end == new_addr && + can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = new_addr + new_len; + spin_unlock(&mm->page_table_lock); + new_vma = prev; + } + } + + allocated_vma = 0; + if (!new_vma) { + new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!new_vma) + goto out; + allocated_vma = 1; + } + + if (!move_page_tables(current->mm, new_addr, addr, old_len)) { + unsigned long vm_locked = vma->vm_flags & VM_LOCKED; + + if (allocated_vma) { + *new_vma = *vma; + new_vma->vm_start = new_addr; + new_vma->vm_end = new_addr+new_len; + new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT; + new_vma->vm_raend = 0; + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + insert_vm_struct(current->mm, new_vma); + } + + /* XXX: possible errors masked, mapping might remain */ + do_munmap(current->mm, addr, old_len); + + current->mm->total_vm += new_len >> PAGE_SHIFT; + if (vm_locked) { + current->mm->locked_vm += new_len >> PAGE_SHIFT; + if (new_len > old_len) + make_pages_present(new_addr + old_len, + new_addr + new_len); + } + return new_addr; + } + if (allocated_vma) + kmem_cache_free(vm_area_cachep, new_vma); + out: + return -ENOMEM; +} + +/* + * Expand (or shrink) an existing mapping, potentially moving it at the + * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * + * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise + * This option implies 
MREMAP_MAYMOVE. + */ +unsigned long do_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) + goto out; + + if (addr & ~PAGE_MASK) + goto out; + + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); + + if (old_len > TASK_SIZE || addr > TASK_SIZE - old_len) + goto out; + + if (addr >= TASK_SIZE) + goto out; + + /* new_addr is only valid if MREMAP_FIXED is specified */ + if (flags & MREMAP_FIXED) { + if (new_addr & ~PAGE_MASK) + goto out; + if (!(flags & MREMAP_MAYMOVE)) + goto out; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) + goto out; + + if (new_addr >= TASK_SIZE) + goto out; + + /* + * Allow new_len == 0 only if new_addr == addr + * to preserve truncation in place (that was working + * safe and some app may depend on it). + */ + if (unlikely(!new_len && new_addr != addr)) + goto out; + + /* Check if the location we're moving into overlaps the + * old location at all, and fail if it does. + */ + if ((new_addr <= addr) && (new_addr+new_len) > addr) + goto out; + + if ((addr <= new_addr) && (addr+old_len) > new_addr) + goto out; + + ret = do_munmap(current->mm, new_addr, new_len); + if (ret && new_len) + goto out; + } + + /* + * Always allow a shrinking remap: that just unmaps + * the unnecessary pages.. + */ + if (old_len >= new_len) { + ret = do_munmap(current->mm, addr+new_len, old_len - new_len); + if (ret && old_len != new_len) + goto out; + ret = addr; + if (!(flags & MREMAP_FIXED) || (new_addr == addr)) + goto out; + } + + /* + * Ok, we need to grow.. or relocate. + */ + ret = -EFAULT; + vma = find_vma(current->mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + /* We can't remap across vm area boundaries */ + if (old_len > vma->vm_end - addr) + goto out; + if (vma->vm_flags & VM_DONTEXPAND) { + if (new_len > old_len) + goto out; + } + if (vma->vm_flags & VM_LOCKED) { + unsigned long locked = current->mm->locked_vm << PAGE_SHIFT; + locked += new_len - old_len; + ret = -EAGAIN; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + goto out; + } + ret = -ENOMEM; + if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) + > current->rlim[RLIMIT_AS].rlim_cur) + goto out; + /* Private writable mapping? Check memory availability.. */ + if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && + !(flags & MAP_NORESERVE) && + !vm_enough_memory((new_len - old_len) >> PAGE_SHIFT)) + goto out; + + /* old_len exactly to the end of the area.. + * And we're not relocating the area. + */ + if (old_len == vma->vm_end - addr && + !((flags & MREMAP_FIXED) && (addr != new_addr)) && + (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { + unsigned long max_addr = TASK_SIZE; + if (vma->vm_next) + max_addr = vma->vm_next->vm_start; + /* can we just expand the current mapping? */ + if (max_addr - addr >= new_len) { + int pages = (new_len - old_len) >> PAGE_SHIFT; + spin_lock(&vma->vm_mm->page_table_lock); + vma->vm_end = addr + new_len; + spin_unlock(&vma->vm_mm->page_table_lock); + current->mm->total_vm += pages; + if (vma->vm_flags & VM_LOCKED) { + current->mm->locked_vm += pages; + make_pages_present(addr + old_len, + addr + new_len); + } + ret = addr; + goto out; + } + } + + /* + * We weren't able to just expand or shrink the area, + * we need to create a new one and move it.. 
+ */ + ret = -ENOMEM; + if (flags & MREMAP_MAYMOVE) { + if (!(flags & MREMAP_FIXED)) { + unsigned long map_flags = 0; + if (vma->vm_flags & VM_SHARED) + map_flags |= MAP_SHARED; + + new_addr = get_unmapped_area(vma->vm_file, 0, new_len, vma->vm_pgoff, map_flags); + ret = new_addr; + if (new_addr & ~PAGE_MASK) + goto out; + } + ret = move_vma(vma, addr, old_len, new_len, new_addr); + } +out: + return ret; +} + +asmlinkage unsigned long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + unsigned long ret; + + down_write(&current->mm->mmap_sem); + ret = do_mremap(addr, old_len, new_len, flags, new_addr); + up_write(&current->mm->mmap_sem); + return ret; +} diff --git a/uClinux-2.4.31-uc0/mm/numa.c b/uClinux-2.4.31-uc0/mm/numa.c new file mode 100644 index 0000000..0b602ef --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/numa.c @@ -0,0 +1,130 @@ +/* + * Written by Kanoj Sarcar, SGI, Aug 1999 + */ +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/mmzone.h> +#include <linux/spinlock.h> + +int numnodes = 1; /* Initialized for UMA platforms */ + +static bootmem_data_t contig_bootmem_data; +pg_data_t contig_page_data = { bdata: &contig_bootmem_data }; + +#ifndef CONFIG_DISCONTIGMEM + +/* + * This is meant to be invoked by platforms whose physical memory starts + * at a considerably higher value than 0. Examples are Super-H, ARM, m68k. + * Should be invoked with paramters (0, 0, unsigned long *[], start_paddr). + */ +void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, + unsigned long *zones_size, unsigned long zone_start_paddr, + unsigned long *zholes_size) +{ + free_area_init_core(0, &contig_page_data, &mem_map, zones_size, + zone_start_paddr, zholes_size, pmap); +} + +#endif /* !CONFIG_DISCONTIGMEM */ + +struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order) +{ +#ifdef CONFIG_NUMA + return __alloc_pages(gfp_mask, order, NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); +#else + return alloc_pages(gfp_mask, order); +#endif +} + +#ifdef CONFIG_DISCONTIGMEM + +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) + +static spinlock_t node_lock = SPIN_LOCK_UNLOCKED; + +void show_free_areas_node(pg_data_t *pgdat) +{ + unsigned long flags; + + spin_lock_irqsave(&node_lock, flags); + show_free_areas_core(pgdat); + spin_unlock_irqrestore(&node_lock, flags); +} + +/* + * Nodes can be initialized parallely, in no particular order. + */ +void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, + unsigned long *zones_size, unsigned long zone_start_paddr, + unsigned long *zholes_size) +{ + int i, size = 0; + struct page *discard; + + if (mem_map == (mem_map_t *)NULL) + mem_map = (mem_map_t *)PAGE_OFFSET; + + free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_paddr, + zholes_size, pmap); + pgdat->node_id = nid; + + /* + * Get space for the valid bitmap. + */ + for (i = 0; i < MAX_NR_ZONES; i++) + size += zones_size[i]; + size = LONG_ALIGN((size + 7) >> 3); + pgdat->valid_addr_bitmap = (unsigned long *)alloc_bootmem_node(pgdat, size); + memset(pgdat->valid_addr_bitmap, 0, size); +} + +static struct page * alloc_pages_pgdat(pg_data_t *pgdat, unsigned int gfp_mask, + unsigned int order) +{ + return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK)); +} + +/* + * This can be refined.
Currently, tries to do round robin, instead + * should do concentratic circle search, starting from current node. + */ +struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order) +{ + struct page *ret = 0; + pg_data_t *start, *temp; +#ifndef CONFIG_NUMA + unsigned long flags; + static pg_data_t *next = 0; +#endif + + if (order >= MAX_ORDER) + return NULL; +#ifdef CONFIG_NUMA + temp = NODE_DATA(numa_node_id()); +#else + spin_lock_irqsave(&node_lock, flags); + if (!next) next = pgdat_list; + temp = next; + next = next->node_next; + spin_unlock_irqrestore(&node_lock, flags); +#endif + start = temp; + while (temp) { + if ((ret = alloc_pages_pgdat(temp, gfp_mask, order))) + return(ret); + temp = temp->node_next; + } + temp = pgdat_list; + while (temp != start) { + if ((ret = alloc_pages_pgdat(temp, gfp_mask, order))) + return(ret); + temp = temp->node_next; + } + return(0); +} + +#endif /* CONFIG_DISCONTIGMEM */ diff --git a/uClinux-2.4.31-uc0/mm/oom_kill.c b/uClinux-2.4.31-uc0/mm/oom_kill.c new file mode 100644 index 0000000..5e461a7 --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/oom_kill.c @@ -0,0 +1,298 @@ +/* + * linux/mm/oom_kill.c + * + * Copyright (C) 1998,2000 Rik van Riel + * Thanks go out to Claus Fischer for some serious inspiration and + * for goading me into coding this file... + * + * The routines in this file are used to kill a process when + * we're seriously out of memory. This gets called from kswapd() + * in linux/mm/vmscan.c when we really run out of memory. + * + * Since we won't call these routines often (on a well-configured + * machine) this file will double as a 'coding guide' and a signpost + * for newbie kernel hackers. It features several pointers to major + * kernel subsystems and hints as to where to find out what things do. + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/timex.h> + +/* #define DEBUG */ + +/** + * int_sqrt - oom_kill.c internal function, rough approximation to sqrt + * @x: integer of which to calculate the sqrt + * + * A very rough approximation to the sqrt() function. + */ +static unsigned int int_sqrt(unsigned int x) +{ + unsigned int out = x; + while (x & ~(unsigned int)1) x >>=2, out >>=1; + if (x) out -= out >> 2; + return (out ? out : 1); +} + +/** + * oom_badness - calculate a numeric value for how bad this task has been + * @p: task struct of which task we should calculate + * + * The formula used is relatively simple and documented inline in the + * function. The main rationale is that we want to select a good task + * to kill when we run out of memory. + * + * Good in this context means that: + * 1) we lose the minimum amount of work done + * 2) we recover a large amount of memory + * 3) we don't kill anything innocent of eating tons of memory + * 4) we want to kill the minimum amount of processes (one) + * 5) we try to kill the process the user expects us to kill, this + * algorithm has been meticulously tuned to meet the priniciple + * of least surprise ... (be careful when you change it) + */ + +static int badness(struct task_struct *p) +{ + int points, cpu_time, run_time; + + if (!p->mm) + return 0; + + if (p->flags & PF_MEMDIE) + return 0; + + /* + * The memory size of the process is the basis for the badness. + */ + points = p->mm->total_vm; + + /* + * CPU time is in seconds and run time is in minutes. There is no + * particular reason for this other than that it turned out to work + * very well in practice. 
This is not safe against jiffie wraps + * but we don't care _that_ much... + */ + cpu_time = (p->times.tms_utime + p->times.tms_stime) >> (SHIFT_HZ + 3); + run_time = (jiffies - p->start_time) >> (SHIFT_HZ + 10); + + points /= int_sqrt(cpu_time); + points /= int_sqrt(int_sqrt(run_time)); + + /* + * Niced processes are most likely less important, so double + * their badness points. + */ + if (p->nice > 0) + points *= 2; + + /* + * Superuser processes are usually more important, so we make it + * less likely that we kill those. + */ + if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) || + p->uid == 0 || p->euid == 0) + points /= 4; + + /* + * We don't want to kill a process with direct hardware access. + * Not only could that mess up the hardware, but usually users + * tend to only have this flag set on applications they think + * of as important. + */ + if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) + points /= 4; +#ifdef DEBUG + printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n", + p->pid, p->comm, points); +#endif + return points; +} + +/* + * Simple selection loop. We chose the process with the highest + * number of 'points'. We expect the caller will lock the tasklist. + * + * (not docbooked, we don't want this one cluttering up the manual) + */ +static struct task_struct * select_bad_process(void) +{ + int maxpoints = 0; + struct task_struct *p = NULL; + struct task_struct *chosen = NULL; + + for_each_task(p) { + if (p->pid) { + int points = badness(p); + if (points > maxpoints) { + chosen = p; + maxpoints = points; + } + } + } + return chosen; +} + +/** + * We must be careful though to never send SIGKILL a process with + * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that + * we select a process with CAP_SYS_RAW_IO set). + */ +static void __oom_kill_task(struct task_struct *p) +{ + printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm); + + /* + * We give our sacrificial lamb high priority and access to + * all the memory it needs. That way it should be able to + * exit() and clear out its resources quickly... + */ + p->counter = 5 * HZ; + p->flags |= PF_MEMALLOC | PF_MEMDIE; + + /* This process has hardware access, be more careful. */ + if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) { + force_sig(SIGTERM, p); + } else { + force_sig(SIGKILL, p); + } +} + +static struct mm_struct *oom_kill_task(struct task_struct *p) +{ + struct mm_struct *mm; + + task_lock(p); + mm = p->mm; + if (mm) { + spin_lock(&mmlist_lock); + if (atomic_read(&mm->mm_users)) + atomic_inc(&mm->mm_users); + else + mm = NULL; + spin_unlock(&mmlist_lock); + } + task_unlock(p); + if (mm) + __oom_kill_task(p); + return mm; +} + +/** + * oom_kill - kill the "best" process when we run out of memory + * + * If we run out of memory, we have the choice between either + * killing a random task (bad), letting the system crash (worse) + * OR try to be smart about which process to kill. Note that we + * don't have to be perfect here, we just have to be good. + */ +static void oom_kill(void) +{ + struct task_struct *p, *q; + struct mm_struct *mm; + +retry: + read_lock(&tasklist_lock); + p = select_bad_process(); + + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (p == NULL) + panic("Out of memory and no killable processes...\n"); + mm = oom_kill_task(p); + if (!mm) { + read_unlock(&tasklist_lock); + goto retry; + } + /* kill all processes that share the ->mm (i.e. 
all threads) */ + for_each_task(q) { + if (q->mm == mm) + __oom_kill_task(q); + } + read_unlock(&tasklist_lock); + mmput(mm); + /* + * Make kswapd go out of the way, so "p" has a good chance of + * killing itself before someone else gets the chance to ask + * for more memory. + */ + yield(); + return; +} + +/** + * out_of_memory - is the system out of memory? + */ +void out_of_memory(void) +{ + /* + * oom_lock protects out_of_memory()'s static variables. + * It's a global lock; this is not performance-critical. + */ + static spinlock_t oom_lock = SPIN_LOCK_UNLOCKED; + static unsigned long first, last, count, lastkill; + unsigned long now, since; + + /* + * Enough swap space left? Not OOM. + */ + if (nr_swap_pages > 0) + return; + + spin_lock(&oom_lock); + now = jiffies; + since = now - last; + last = now; + + /* + * If it's been a long time since last failure, + * we're not oom. + */ + last = now; + if (since > 5*HZ) + goto reset; + + /* + * If we haven't tried for at least one second, + * we're not really oom. + */ + since = now - first; + if (since < HZ) + goto out_unlock; + + /* + * If we have gotten only a few failures, + * we're not really oom. + */ + if (++count < 10) + goto out_unlock; + + /* + * If we just killed a process, wait a while + * to give that task a chance to exit. This + * avoids killing multiple processes needlessly. + */ + since = now - lastkill; + if (since < HZ*5) + goto out_unlock; + + /* + * Ok, really out of memory. Kill something. + */ + lastkill = now; + + /* oom_kill() can sleep */ + spin_unlock(&oom_lock); + oom_kill(); + spin_lock(&oom_lock); + +reset: + if ((long)first - (long)now < 0) + first = now; + count = 0; + +out_unlock: + spin_unlock(&oom_lock); +} diff --git a/uClinux-2.4.31-uc0/mm/page_alloc.c b/uClinux-2.4.31-uc0/mm/page_alloc.c new file mode 100644 index 0000000..6e45cd5 --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/page_alloc.c @@ -0,0 +1,969 @@ +/* + * linux/mm/page_alloc.c + * + * Manages the free list, the system allocates free pages here. + * Note that kmalloc() lives in slab.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/interrupt.h> +#include <linux/pagemap.h> +#include <linux/bootmem.h> +#include <linux/slab.h> +#include <linux/module.h> + +int nr_swap_pages; +int nr_active_pages; +int nr_inactive_pages; +LIST_HEAD(inactive_list); +LIST_HEAD(active_list); +pg_data_t *pgdat_list; + +/* + * + * The zone_table array is used to look up the address of the + * struct zone corresponding to a given zone number (ZONE_DMA, + * ZONE_NORMAL, or ZONE_HIGHMEM). 
+ */ +zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; +EXPORT_SYMBOL(zone_table); + +static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; +static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; +static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; +static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; +static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; + +int vm_gfp_debug = 0; + +static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order)); + +static spinlock_t free_pages_ok_no_irq_lock = SPIN_LOCK_UNLOCKED; +struct page * free_pages_ok_no_irq_head; + +static void do_free_pages_ok_no_irq(void * arg) +{ + struct page * page, * __page; + + spin_lock_irq(&free_pages_ok_no_irq_lock); + + page = free_pages_ok_no_irq_head; + free_pages_ok_no_irq_head = NULL; + + spin_unlock_irq(&free_pages_ok_no_irq_lock); + + while (page) { + __page = page; + page = page->next_hash; + __free_pages_ok(__page, __page->index); + } +} + +static struct tq_struct free_pages_ok_no_irq_task = { + .routine = do_free_pages_ok_no_irq, +}; + + +/* + * Temporary debugging check. + */ +#define BAD_RANGE(zone, page) \ +( \ + (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \ + || (((page) - mem_map) < (zone)->zone_start_mapnr) \ + || ((zone) != page_zone(page)) \ +) + +/* + * Freeing function for a buddy system allocator. + * Contrary to prior comments, this is *NOT* hairy, and there + * is no reason for anyone not to understand it. + * + * The concept of a buddy system is to maintain direct-mapped tables + * (containing bit values) for memory blocks of various "orders". + * The bottom level table contains the map for the smallest allocatable + * units of memory (here, pages), and each level above it describes + * pairs of units from the levels below, hence, "buddies". + * At a high level, all that happens here is marking the table entry + * at the bottom level available, and propagating the changes upward + * as necessary, plus some accounting needed to play nicely with other + * parts of the VM system. + * At each level, we keep one bit for each pair of blocks, which + * is set to 1 iff only one of the pair is allocated. So when we + * are allocating or freeing one, we can derive the state of the + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. + * If a block is freed, and its buddy is also free, then this + * triggers coalescing into a block of larger size. + * + * -- wli + */ + +static void fastcall __free_pages_ok (struct page *page, unsigned int order) +{ + unsigned long index, page_idx, mask, flags; + free_area_t *area; + struct page *base; + zone_t *zone; + + /* + * Yes, think what happens when other parts of the kernel take + * a reference to a page in order to pin it for io. 
-ben + */ + if (PageLRU(page)) { + if (unlikely(in_interrupt())) { + unsigned long flags; + + spin_lock_irqsave(&free_pages_ok_no_irq_lock, flags); + page->next_hash = free_pages_ok_no_irq_head; + free_pages_ok_no_irq_head = page; + page->index = order; + + spin_unlock_irqrestore(&free_pages_ok_no_irq_lock, flags); + + schedule_task(&free_pages_ok_no_irq_task); + return; + } + + lru_cache_del(page); + } + + if (page->buffers) + BUG(); + if (page->mapping) + BUG(); + if (!VALID_PAGE(page)) + BUG(); + if (PageLocked(page)) + BUG(); + if (PageActive(page)) + BUG(); + ClearPageReferenced(page); + ClearPageDirty(page); + + if (current->flags & PF_FREE_PAGES) + goto local_freelist; + back_local_freelist: + + zone = page_zone(page); + + mask = (~0UL) << order; + base = zone->zone_mem_map; + page_idx = page - base; + if (page_idx & ~mask) + BUG(); + index = page_idx >> (1 + order); + + area = zone->free_area + order; + + spin_lock_irqsave(&zone->lock, flags); + + zone->free_pages -= mask; + + while (mask + (1 << (MAX_ORDER-1))) { + struct page *buddy1, *buddy2; + + if (area >= zone->free_area + MAX_ORDER) + BUG(); + if (!__test_and_change_bit(index, area->map)) + /* + * the buddy page is still allocated. + */ + break; + /* + * Move the buddy up one level. + * This code is taking advantage of the identity: + * -mask = 1+~mask + */ + buddy1 = base + (page_idx ^ -mask); + buddy2 = base + page_idx; + if (BAD_RANGE(zone,buddy1)) + BUG(); + if (BAD_RANGE(zone,buddy2)) + BUG(); + + list_del(&buddy1->list); + mask <<= 1; + area++; + index >>= 1; + page_idx &= mask; + } + list_add(&(base + page_idx)->list, &area->free_list); + + spin_unlock_irqrestore(&zone->lock, flags); + return; + + local_freelist: + if (current->nr_local_pages) + goto back_local_freelist; + if (in_interrupt()) + goto back_local_freelist; + + list_add(&page->list, ¤t->local_pages); + page->index = order; + current->nr_local_pages++; +} + +#define MARK_USED(index, order, area) \ + __change_bit((index) >> (1+(order)), (area)->map) + +static inline struct page * expand (zone_t *zone, struct page *page, + unsigned long index, int low, int high, free_area_t * area) +{ + unsigned long size = 1 << high; + + while (high > low) { + if (BAD_RANGE(zone,page)) + BUG(); + area--; + high--; + size >>= 1; + list_add(&(page)->list, &(area)->free_list); + MARK_USED(index, high, area); + index += size; + page += size; + } + if (BAD_RANGE(zone,page)) + BUG(); + return page; +} + +static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order)); +static struct page * fastcall rmqueue(zone_t *zone, unsigned int order) +{ + free_area_t * area = zone->free_area + order; + unsigned int curr_order = order; + struct list_head *head, *curr; + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&zone->lock, flags); + do { + head = &area->free_list; + curr = head->next; + + if (curr != head) { + unsigned int index; + + page = list_entry(curr, struct page, list); + if (BAD_RANGE(zone,page)) + BUG(); + list_del(curr); + index = page - zone->zone_mem_map; + if (curr_order != MAX_ORDER-1) + MARK_USED(index, curr_order, area); + zone->free_pages -= 1UL << order; + + page = expand(zone, page, index, order, curr_order, area); + spin_unlock_irqrestore(&zone->lock, flags); + + set_page_count(page, 1); + if (BAD_RANGE(zone,page)) + BUG(); + if (PageLRU(page)) + BUG(); + if (PageActive(page)) + BUG(); + return page; + } + curr_order++; + area++; + } while (curr_order < MAX_ORDER); + spin_unlock_irqrestore(&zone->lock, flags); + + return NULL; +} + +#ifndef 
CONFIG_DISCONTIGMEM +struct page * fastcall _alloc_pages(unsigned int gfp_mask, unsigned int order) +{ + return __alloc_pages(gfp_mask, order, + contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK)); +} +#endif + +static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); +static struct page * fastcall balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) +{ + struct page * page = NULL; + int __freed; + + if (in_interrupt()) + BUG(); + + current->allocation_order = order; + current->flags |= PF_MEMALLOC | PF_FREE_PAGES; + + __freed = try_to_free_pages_zone(classzone, gfp_mask); + + current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); + + if (current->nr_local_pages) { + struct list_head * entry, * local_pages; + struct page * tmp; + int nr_pages; + + local_pages = ¤t->local_pages; + + if (likely(__freed)) { + /* pick from the last inserted so we're lifo */ + entry = local_pages->next; + do { + tmp = list_entry(entry, struct page, list); + if (tmp->index == order && memclass(page_zone(tmp), classzone)) { + list_del(entry); + current->nr_local_pages--; + set_page_count(tmp, 1); + page = tmp; + + if (page->buffers) + BUG(); + if (page->mapping) + BUG(); + if (!VALID_PAGE(page)) + BUG(); + if (PageLocked(page)) + BUG(); + if (PageLRU(page)) + BUG(); + if (PageActive(page)) + BUG(); + if (PageDirty(page)) + BUG(); + + break; + } + } while ((entry = entry->next) != local_pages); + } + + nr_pages = current->nr_local_pages; + /* free in reverse order so that the global order will be lifo */ + while ((entry = local_pages->prev) != local_pages) { + list_del(entry); + tmp = list_entry(entry, struct page, list); + __free_pages_ok(tmp, tmp->index); + if (!nr_pages--) + BUG(); + } + current->nr_local_pages = 0; + } + + *freed = __freed; + return page; +} + +static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order) +{ + long free = zone->free_pages - (1UL << order); + return free >= 0 ? 
free : 0; +} + +/* + * This is the 'heart' of the zoned buddy allocator: + */ +struct page * fastcall __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) +{ + zone_t **zone, * classzone; + struct page * page; + int freed, class_idx; + + zone = zonelist->zones; + classzone = *zone; + class_idx = zone_idx(classzone); + + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; + + if (zone_free_pages(z, order) > z->watermarks[class_idx].low) { + page = rmqueue(z, order); + if (page) + return page; + } + } + + classzone->need_balance = 1; + mb(); + if (waitqueue_active(&kswapd_wait)) + wake_up_interruptible(&kswapd_wait); + + zone = zonelist->zones; + for (;;) { + unsigned long min; + zone_t *z = *(zone++); + if (!z) + break; + + min = z->watermarks[class_idx].min; + if (!(gfp_mask & __GFP_WAIT)) + min >>= 2; + if (zone_free_pages(z, order) > min) { + page = rmqueue(z, order); + if (page) + return page; + } + } + + /* here we're in the low on memory slow path */ + + if ((current->flags & PF_MEMALLOC) && + (!in_interrupt() || (current->flags & PF_MEMDIE))) { + zone = zonelist->zones; + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; + + page = rmqueue(z, order); + if (page) + return page; + } + return NULL; + } + + /* Atomic allocations - we can't balance anything */ + if (!(gfp_mask & __GFP_WAIT)) + goto out; + + rebalance: + page = balance_classzone(classzone, gfp_mask, order, &freed); + if (page) + return page; + + zone = zonelist->zones; + if (likely(freed)) { + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; + + if (zone_free_pages(z, order) > z->watermarks[class_idx].min) { + page = rmqueue(z, order); + if (page) + return page; + } + } + goto rebalance; + } else { + /* + * Check that no other task is been killed meanwhile, + * in such a case we can succeed the allocation. + */ + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; + + if (zone_free_pages(z, order) > z->watermarks[class_idx].high) { + page = rmqueue(z, order); + if (page) + return page; + } + } + } + + out: + printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n", + order, gfp_mask, !!(current->flags & PF_MEMALLOC)); + if (unlikely(vm_gfp_debug)) + dump_stack(); + return NULL; +} + +/* + * Common helper functions. 
+ */ +fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order) +{ + struct page * page; + + page = alloc_pages(gfp_mask, order); + if (!page) + return 0; + return (unsigned long) page_address(page); +} + +fastcall unsigned long get_zeroed_page(unsigned int gfp_mask) +{ + struct page * page; + + page = alloc_pages(gfp_mask, 0); + if (page) { + void *address = page_address(page); + clear_page(address); + return (unsigned long) address; + } + return 0; +} + +fastcall void __free_pages(struct page *page, unsigned int order) +{ + if (!PageReserved(page) && put_page_testzero(page)) + __free_pages_ok(page, order); +} + +fastcall void free_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) + __free_pages(virt_to_page(addr), order); +} + +/* + * Total amount of free (allocatable) RAM: + */ +unsigned int nr_free_pages (void) +{ + unsigned int sum = 0; + zone_t *zone; + + for_each_zone(zone) + sum += zone->free_pages; + + return sum; +} + +/* + * Amount of free RAM allocatable as buffer memory: + */ +unsigned int nr_free_buffer_pages (void) +{ + pg_data_t *pgdat; + unsigned int sum = 0; + zonelist_t *zonelist; + zone_t **zonep, *zone; + + for_each_pgdat(pgdat) { + int class_idx; + zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); + zonep = zonelist->zones; + zone = *zonep; + class_idx = zone_idx(zone); + + sum += zone->nr_cache_pages; + for (; zone; zone = *zonep++) { + int free = zone->free_pages - zone->watermarks[class_idx].high; + if (free <= 0) + continue; + sum += free; + } + } + + return sum; +} + +#if CONFIG_HIGHMEM +unsigned int nr_free_highpages (void) +{ + pg_data_t *pgdat; + unsigned int pages = 0; + + for_each_pgdat(pgdat) + pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; + + return pages; +} + +unsigned int freeable_lowmem(void) +{ + unsigned int pages = 0; + pg_data_t *pgdat; + + for_each_pgdat(pgdat) { + pages += pgdat->node_zones[ZONE_DMA].free_pages; + pages += pgdat->node_zones[ZONE_DMA].nr_active_pages; + pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages; + pages += pgdat->node_zones[ZONE_NORMAL].free_pages; + pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages; + pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages; + } + + return pages; +} +#endif + +#define K(x) ((x) << (PAGE_SHIFT-10)) + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. 
+ */ +void show_free_areas_core(pg_data_t *pgdat) +{ + unsigned int order; + unsigned type; + pg_data_t *tmpdat = pgdat; + + printk("Free pages: %6dkB (%6dkB HighMem)\n", + K(nr_free_pages()), + K(nr_free_highpages())); + + while (tmpdat) { + zone_t *zone; + for (zone = tmpdat->node_zones; + zone < tmpdat->node_zones + MAX_NR_ZONES; zone++) + printk("Zone:%s freepages:%6lukB\n", + zone->name, + K(zone->free_pages)); + + tmpdat = tmpdat->node_next; + } + + printk("( Active: %d, inactive: %d, free: %d )\n", + nr_active_pages, + nr_inactive_pages, + nr_free_pages()); + + for (type = 0; type < MAX_NR_ZONES; type++) { + struct list_head *head, *curr; + zone_t *zone = pgdat->node_zones + type; + unsigned long nr, total, flags; + + total = 0; + if (zone->size) { + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + head = &(zone->free_area + order)->free_list; + curr = head; + nr = 0; + for (;;) { + if ((curr = curr->next) == head) + break; + nr++; + } + total += nr * (1 << order); + printk("%lu*%lukB ", nr, K(1UL) << order); + } + spin_unlock_irqrestore(&zone->lock, flags); + } + printk("= %lukB)\n", K(total)); + } + +#ifdef SWAP_CACHE_INFO + show_swap_cache_info(); +#endif +} + +void show_free_areas(void) +{ + show_free_areas_core(pgdat_list); +} + +/* + * Builds allocation fallback zone lists. + */ +static inline void build_zonelists(pg_data_t *pgdat) +{ + int i, j, k; + + for (i = 0; i <= GFP_ZONEMASK; i++) { + zonelist_t *zonelist; + zone_t *zone; + + zonelist = pgdat->node_zonelists + i; + memset(zonelist, 0, sizeof(*zonelist)); + + j = 0; + k = ZONE_NORMAL; + if (i & __GFP_HIGHMEM) + k = ZONE_HIGHMEM; + if (i & __GFP_DMA) + k = ZONE_DMA; + + switch (k) { + default: + BUG(); + /* + * fallthrough: + */ + case ZONE_HIGHMEM: + zone = pgdat->node_zones + ZONE_HIGHMEM; + if (zone->size) { +#ifndef CONFIG_HIGHMEM + BUG(); +#endif + zonelist->zones[j++] = zone; + } + case ZONE_NORMAL: + zone = pgdat->node_zones + ZONE_NORMAL; + if (zone->size) + zonelist->zones[j++] = zone; + case ZONE_DMA: + zone = pgdat->node_zones + ZONE_DMA; + if (zone->size) + zonelist->zones[j++] = zone; + } + zonelist->zones[j++] = NULL; + } +} + +/* + * Helper functions to size the waitqueue hash table. + * Essentially these want to choose hash table sizes sufficiently + * large so that collisions trying to wait on pages are rare. + * But in fact, the number of active page waitqueues on typical + * systems is ridiculously low, less than 200. So this is even + * conservative, even though it seems large. + * + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to + * waitqueues, i.e. the size of the waitq table given the number of pages. + */ +#define PAGES_PER_WAITQUEUE 256 + +static inline unsigned long wait_table_size(unsigned long pages) +{ + unsigned long size = 1; + + pages /= PAGES_PER_WAITQUEUE; + + while (size < pages) + size <<= 1; + + /* + * Once we have dozens or even hundreds of threads sleeping + * on IO we've got bigger problems than wait queue collision. + * Limit the size of the wait table to a reasonable size. + */ + size = min(size, 4096UL); + + return size; +} + +/* + * This is an integer logarithm so that shifts can be used later + * to extract the more random high bits from the multiplicative + * hash function before the remainder is taken. 
+ */ +static inline unsigned long wait_table_bits(unsigned long size) +{ + return ffz(~size); +} + +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + */ +void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, + unsigned long *zones_size, unsigned long zone_start_paddr, + unsigned long *zholes_size, struct page *lmem_map) +{ + unsigned long i, j; + unsigned long map_size; + unsigned long totalpages, offset, realtotalpages; + const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); + + if (zone_start_paddr & ~PAGE_MASK) + BUG(); + + totalpages = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + unsigned long size = zones_size[i]; + totalpages += size; + } + realtotalpages = totalpages; + if (zholes_size) + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= zholes_size[i]; + + printk("On node %d totalpages: %lu\n", nid, realtotalpages); + + /* + * Some architectures (with lots of mem and discontinous memory + * maps) have to search for a good mem_map area: + * For discontigmem, the conceptual mem map array starts from + * PAGE_OFFSET, we need to align the actual array onto a mem map + * boundary, so that MAP_NR works. + */ + map_size = (totalpages + 1)*sizeof(struct page); + if (lmem_map == (struct page *)0) { + lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size); + lmem_map = (struct page *)(PAGE_OFFSET + + MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); + } + *gmap = pgdat->node_mem_map = lmem_map; + pgdat->node_size = totalpages; + pgdat->node_start_paddr = zone_start_paddr; + pgdat->node_start_mapnr = (lmem_map - mem_map); + pgdat->nr_zones = 0; + + offset = lmem_map - mem_map; + for (j = 0; j < MAX_NR_ZONES; j++) { + zone_t *zone = pgdat->node_zones + j; + unsigned long mask; + unsigned long size, realsize; + int idx; + + zone_table[nid * MAX_NR_ZONES + j] = zone; + realsize = size = zones_size[j]; + if (zholes_size) + realsize -= zholes_size[j]; + + printk("zone(%lu): %lu pages.\n", j, size); + zone->size = size; + zone->realsize = realsize; + zone->name = zone_names[j]; + zone->lock = SPIN_LOCK_UNLOCKED; + zone->zone_pgdat = pgdat; + zone->free_pages = 0; + zone->need_balance = 0; + zone->nr_active_pages = zone->nr_inactive_pages = 0; + + + if (!size) + continue; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. 
+ */ + zone->wait_table_size = wait_table_size(size); + zone->wait_table_shift = + BITS_PER_LONG - wait_table_bits(zone->wait_table_size); + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, zone->wait_table_size + * sizeof(wait_queue_head_t)); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); + + pgdat->nr_zones = j+1; + + mask = (realsize / zone_balance_ratio[j]); + if (mask < zone_balance_min[j]) + mask = zone_balance_min[j]; + else if (mask > zone_balance_max[j]) + mask = zone_balance_max[j]; + zone->watermarks[j].min = mask; + zone->watermarks[j].low = mask*2; + zone->watermarks[j].high = mask*3; + /* now set the watermarks of the lower zones in the "j" classzone */ + for (idx = j-1; idx >= 0; idx--) { + zone_t * lower_zone = pgdat->node_zones + idx; + unsigned long lower_zone_reserve; + if (!lower_zone->size) + continue; + + mask = lower_zone->watermarks[idx].min; + lower_zone->watermarks[j].min = mask; + lower_zone->watermarks[j].low = mask*2; + lower_zone->watermarks[j].high = mask*3; + + /* now the brainer part */ + lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx]; + lower_zone->watermarks[j].min += lower_zone_reserve; + lower_zone->watermarks[j].low += lower_zone_reserve; + lower_zone->watermarks[j].high += lower_zone_reserve; + + realsize += lower_zone->realsize; + } + + zone->zone_mem_map = mem_map + offset; + zone->zone_start_mapnr = offset; + zone->zone_start_paddr = zone_start_paddr; + + if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1)) + printk("BUG: wrong zone alignment, it will crash\n"); + + /* + * Initially all pages are reserved - free ones are freed + * up by free_all_bootmem() once the early boot process is + * done. Non-atomic initialization, single-pass. + */ + for (i = 0; i < size; i++) { + struct page *page = mem_map + offset + i; + set_page_zone(page, nid * MAX_NR_ZONES + j); + set_page_count(page, 0); + SetPageReserved(page); + INIT_LIST_HEAD(&page->list); + if (j != ZONE_HIGHMEM) + set_page_address(page, __va(zone_start_paddr)); + zone_start_paddr += PAGE_SIZE; + } + + offset += size; + for (i = 0; ; i++) { + unsigned long bitmap_size; + + INIT_LIST_HEAD(&zone->free_area[i].free_list); + if (i == MAX_ORDER-1) { + zone->free_area[i].map = NULL; + break; + } + + /* + * Page buddy system uses "index >> (i+1)", + * where "index" is at most "size-1". + * + * The extra "+3" is to round down to byte + * size (8 bits per byte assumption). Thus + * we get "(size-1) >> (i+4)" as the last byte + * we can access. + * + * The "+1" is because we want to round the + * byte allocation up rather than down. So + * we should have had a "+7" before we shifted + * down by three. Also, we have to add one as + * we actually _use_ the last bit (it's [0,n] + * inclusive, not [0,n[). + * + * So we actually had +7+1 before we shift + * down by 3. But (n+8) >> 3 == (n >> 3) + 1 + * (modulo overflows, which we do not have). + * + * Finally, we LONG_ALIGN because all bitmap + * operations are on longs. 
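+ * For example, with size = 32768 pages at order i = 0 this is
+ * (32767 >> 4) + 1 = 2048 bytes, i.e. 16384 bits -- one bit per pair
+ * of order-0 buddies -- and LONG_ALIGN leaves 2048 unchanged.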
+ */ + bitmap_size = (size-1) >> (i+4); + bitmap_size = LONG_ALIGN(bitmap_size+1); + zone->free_area[i].map = + (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); + } + } + build_zonelists(pgdat); +} + +void __init free_area_init(unsigned long *zones_size) +{ + free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0); +} + +static int __init setup_mem_frac(char *str) +{ + int j = 0; + + while (get_option(&str, &zone_balance_ratio[j++]) == 2); + printk("setup_mem_frac: "); + for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]); + printk("\n"); + return 1; +} + +__setup("memfrac=", setup_mem_frac); + +static int __init setup_lower_zone_reserve(char *str) +{ + int j = 0; + + while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2); + printk("setup_lower_zone_reserve: "); + for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d ", lower_zone_reserve_ratio[j]); + printk("\n"); + return 1; +} + +__setup("lower_zone_reserve=", setup_lower_zone_reserve); diff --git a/uClinux-2.4.31-uc0/mm/page_io.c b/uClinux-2.4.31-uc0/mm/page_io.c new file mode 100644 index 0000000..01d668b --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/page_io.c @@ -0,0 +1,120 @@ +/* + * linux/mm/page_io.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Swap reorganised 29.12.95, + * Asynchronous swapping added 30.12.95. Stephen Tweedie + * Removed race in async swapping. 14.4.1996. Bruno Haible + * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie + * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman + */ + +#include <linux/mm.h> +#include <linux/kernel_stat.h> +#include <linux/swap.h> +#include <linux/locks.h> +#include <linux/swapctl.h> + +#include <asm/pgtable.h> + +/* + * Reads or writes a swap page. + * wait=1: start I/O and wait for completion. wait=0: start asynchronous I/O. + * + * Important prevention of race condition: the caller *must* atomically + * create a unique swap cache entry for this swap page before calling + * rw_swap_page, and must lock that page. By ensuring that there is a + * single page of memory reserved for the swap entry, the normal VM page + * lock on that page also doubles as a lock on swap entries. Having only + * one lock to deal with per swap entry (rather than locking swap and memory + * independently) also makes it easier to make certain swapping operations + * atomic, which is particularly important when we are trying to ensure + * that shared pages stay shared while being swapped. 
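+ * rw_swap_page() below enforces exactly these preconditions: it
+ * PAGE_BUG()s unless the page is locked and already in the swap cache.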
+ */ + +static int rw_swap_page_base(int rw, swp_entry_t entry, struct page *page) +{ + unsigned long offset; + int zones[PAGE_SIZE/512]; + int zones_used; + kdev_t dev = 0; + int block_size; + struct inode *swapf = 0; + + if (rw == READ) { + ClearPageUptodate(page); + kstat.pswpin++; + } else + kstat.pswpout++; + + get_swaphandle_info(entry, &offset, &dev, &swapf); + if (dev) { + zones[0] = offset; + zones_used = 1; + block_size = PAGE_SIZE; + } else if (swapf) { + int i, j; + unsigned int block = offset + << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits); + + block_size = swapf->i_sb->s_blocksize; + for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size) + if (!(zones[i] = bmap(swapf,block++))) { + printk("rw_swap_page: bad swap file\n"); + return 0; + } + zones_used = i; + dev = swapf->i_dev; + } else { + return 0; + } + + /* block_size == PAGE_SIZE/zones_used */ + brw_page(rw, page, dev, zones, block_size); + return 1; +} + +/* + * A simple wrapper so the base function doesn't need to enforce + * that all swap pages go through the swap cache! We verify that: + * - the page is locked + * - it's marked as being swap-cache + * - it's associated with the swap inode + */ +void rw_swap_page(int rw, struct page *page) +{ + swp_entry_t entry; + + entry.val = page->index; + + if (!PageLocked(page)) + PAGE_BUG(page); + if (!PageSwapCache(page)) + PAGE_BUG(page); + if (!rw_swap_page_base(rw, entry, page)) + UnlockPage(page); +} + +/* + * The swap lock map insists that pages be in the page cache! + * Therefore we can't use it. Later when we can remove the need for the + * lock map and we can reduce the number of functions exported. + */ +void rw_swap_page_nolock(int rw, swp_entry_t entry, char *buf) +{ + struct page *page = virt_to_page(buf); + + if (!PageLocked(page)) + PAGE_BUG(page); + if (page->mapping) + PAGE_BUG(page); + /* needs sync_page to wait I/O completation */ + page->mapping = &swapper_space; + if (rw_swap_page_base(rw, entry, page)) + lock_page(page); + if (!block_flushpage(page, 0)) + PAGE_BUG(page); + page->mapping = NULL; + UnlockPage(page); +} diff --git a/uClinux-2.4.31-uc0/mm/shmem.c b/uClinux-2.4.31-uc0/mm/shmem.c new file mode 100644 index 0000000..9ad81bd --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/shmem.c @@ -0,0 +1,1753 @@ +/* + * Resizable virtual memory filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + * 2000-2001 Christoph Rohland + * 2000-2001 SAP AG + * 2002 Red Hat Inc. + * Copyright (C) 2002-2003 Hugh Dickins. + * Copyright (C) 2002-2003 VERITAS Software Corporation. + * + * This file is released under the GPL. + */ + +/* + * This virtual memory filesystem is heavily based on the ramfs. It + * extends ramfs by the ability to use swap and honor resource limits + * which makes it a completely usable filesystem. 
+ */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/devfs_fs_kernel.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/file.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/string.h> +#include <linux/locks.h> +#include <linux/smp_lock.h> + +#include <asm/uaccess.h> +#include <asm/div64.h> + +/* This magic number is used in glibc for posix shared memory */ +#define TMPFS_MAGIC 0x01021994 + +#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) +#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) +#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) + +#define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) +#define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT) + +#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) + +/* info->flags needs VM_flags to handle pagein/truncate race efficiently */ +#define SHMEM_PAGEIN VM_READ +#define SHMEM_TRUNCATE VM_WRITE + +/* Pretend that each entry is of this size in directory's i_size */ +#define BOGO_DIRENT_SIZE 20 + +#define SHMEM_SB(sb) (&sb->u.shmem_sb) + +/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ +enum sgp_type { + SGP_READ, /* don't exceed i_size, don't allocate page */ + SGP_CACHE, /* don't exceed i_size, may allocate page */ + SGP_WRITE, /* may exceed i_size, may allocate page */ +}; + +static int shmem_getpage(struct inode *inode, unsigned long idx, + struct page **pagep, enum sgp_type sgp); + +static struct super_operations shmem_ops; +static struct address_space_operations shmem_aops; +static struct file_operations shmem_file_operations; +static struct inode_operations shmem_inode_operations; +static struct inode_operations shmem_dir_inode_operations; +static struct vm_operations_struct shmem_vm_ops; + +LIST_HEAD(shmem_inodes); +static spinlock_t shmem_ilock = SPIN_LOCK_UNLOCKED; + +static void shmem_free_block(struct inode *inode) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + spin_lock(&sbinfo->stat_lock); + sbinfo->free_blocks++; + inode->i_blocks -= BLOCKS_PER_PAGE; + spin_unlock(&sbinfo->stat_lock); +} + +static void shmem_removepage(struct page *page) +{ + if (!PageLaunder(page) && !PageError(page)) + shmem_free_block(page->mapping->host); +} + +/* + * shmem_swp_entry - find the swap vector position in the info structure + * + * @info: info structure for the inode + * @index: index of the page to find + * @page: optional page to add to the structure. Has to be preset to + * all zeros + * + * If there is no space allocated yet it will return NULL when + * page is 0, else it will use the page for the needed block, + * setting it to 0 on return to indicate that it has been used. + * + * The swap vector is organized the following way: + * + * There are SHMEM_NR_DIRECT entries directly stored in the + * shmem_inode_info structure. So small files do not need an addional + * allocation. 
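+ * (With SHMEM_NR_DIRECT == 16, as in the example below, and 4 kB pages
+ * this covers files of up to 64 kB without any index page.)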
+ * + * For pages with index > SHMEM_NR_DIRECT there is the pointer + * i_indirect which points to a page which holds in the first half + * doubly indirect blocks, in the second half triple indirect blocks: + * + * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the + * following layout (for SHMEM_NR_DIRECT == 16): + * + * i_indirect -> dir --> 16-19 + * | +-> 20-23 + * | + * +-->dir2 --> 24-27 + * | +-> 28-31 + * | +-> 32-35 + * | +-> 36-39 + * | + * +-->dir3 --> 40-43 + * +-> 44-47 + * +-> 48-51 + * +-> 52-55 + */ +static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, unsigned long *page) +{ + unsigned long offset; + void **dir; + + if (index < SHMEM_NR_DIRECT) + return info->i_direct+index; + if (!info->i_indirect) { + if (page) { + info->i_indirect = (void **) *page; + *page = 0; + } + return NULL; /* need another page */ + } + + index -= SHMEM_NR_DIRECT; + offset = index % ENTRIES_PER_PAGE; + index /= ENTRIES_PER_PAGE; + dir = info->i_indirect; + + if (index >= ENTRIES_PER_PAGE/2) { + index -= ENTRIES_PER_PAGE/2; + dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; + index %= ENTRIES_PER_PAGE; + if (!*dir) { + if (page) { + *dir = (void *) *page; + *page = 0; + } + return NULL; /* need another page */ + } + dir = (void **) *dir; + } + + dir += index; + if (!*dir) { + if (!page || !*page) + return NULL; /* need a page */ + *dir = (void *) *page; + *page = 0; + } + return (swp_entry_t *) *dir + offset; +} + +/* + * shmem_swp_alloc - get the position of the swap entry for the page. + * If it does not exist allocate the entry. + * + * @info: info structure for the inode + * @index: index of the page to find + * @sgp: check and recheck i_size? skip allocation? + */ +static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) +{ + struct inode *inode = info->inode; + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + unsigned long page = 0; + swp_entry_t *entry; + static const swp_entry_t unswapped = {0}; + + if (sgp != SGP_WRITE && + ((loff_t) index << PAGE_CACHE_SHIFT) >= inode->i_size) + return ERR_PTR(-EINVAL); + + while (!(entry = shmem_swp_entry(info, index, &page))) { + if (sgp == SGP_READ) + return (swp_entry_t *) &unswapped; + /* + * Test free_blocks against 1 not 0, since we have 1 data + * page (and perhaps indirect index pages) yet to allocate: + * a waste to allocate index if we cannot allocate data. 
+ */ + spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_blocks <= 1) { + spin_unlock(&sbinfo->stat_lock); + return ERR_PTR(-ENOSPC); + } + sbinfo->free_blocks--; + inode->i_blocks += BLOCKS_PER_PAGE; + spin_unlock(&sbinfo->stat_lock); + + spin_unlock(&info->lock); + page = get_zeroed_page(GFP_USER); + spin_lock(&info->lock); + + if (!page) { + shmem_free_block(inode); + return ERR_PTR(-ENOMEM); + } + if (sgp != SGP_WRITE && + ((loff_t) index << PAGE_CACHE_SHIFT) >= inode->i_size) { + entry = ERR_PTR(-EINVAL); + break; + } + if (info->next_index <= index) + info->next_index = index + 1; + } + if (page) { + /* another task gave its page, or truncated the file */ + shmem_free_block(inode); + free_page(page); + } + if (info->next_index <= index && !IS_ERR(entry)) + info->next_index = index + 1; + return entry; +} + +/* + * shmem_free_swp - free some swap entries in a directory + * + * @dir: pointer to the directory + * @edir: pointer after last entry of the directory + */ +static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir) +{ + swp_entry_t *ptr; + int freed = 0; + + for (ptr = dir; ptr < edir; ptr++) { + if (ptr->val) { + free_swap_and_cache(*ptr); + *ptr = (swp_entry_t){0}; + freed++; + } + } + return freed; +} + +/* + * shmem_truncate_direct - free the swap entries of a whole doubly + * indirect block + * + * @info: the info structure of the inode + * @dir: pointer to the pointer to the block + * @start: offset to start from (in pages) + * @len: how many pages are stored in this block + */ +static inline unsigned long +shmem_truncate_direct(struct shmem_inode_info *info, swp_entry_t ***dir, unsigned long start, unsigned long len) +{ + swp_entry_t **last, **ptr; + unsigned long off, freed_swp, freed = 0; + + last = *dir + (len + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE; + off = start % ENTRIES_PER_PAGE; + + for (ptr = *dir + start/ENTRIES_PER_PAGE; ptr < last; ptr++, off = 0) { + if (!*ptr) + continue; + + if (info->swapped) { + freed_swp = shmem_free_swp(*ptr + off, + *ptr + ENTRIES_PER_PAGE); + info->swapped -= freed_swp; + freed += freed_swp; + } + + if (!off) { + freed++; + free_page((unsigned long) *ptr); + *ptr = 0; + } + } + + if (!start) { + freed++; + free_page((unsigned long) *dir); + *dir = 0; + } + return freed; +} + +/* + * shmem_truncate_indirect - truncate an inode + * + * @info: the info structure of the inode + * @index: the index to truncate + * + * This function locates the last doubly indirect block and calls + * then shmem_truncate_direct to do the real work + */ +static inline unsigned long +shmem_truncate_indirect(struct shmem_inode_info *info, unsigned long index) +{ + swp_entry_t ***base; + unsigned long baseidx, start; + unsigned long len = info->next_index; + unsigned long freed; + + if (len <= SHMEM_NR_DIRECT) { + info->next_index = index; + if (!info->swapped) + return 0; + freed = shmem_free_swp(info->i_direct + index, + info->i_direct + len); + info->swapped -= freed; + return freed; + } + + if (len <= ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT) { + len -= SHMEM_NR_DIRECT; + base = (swp_entry_t ***) &info->i_indirect; + baseidx = SHMEM_NR_DIRECT; + } else { + len -= ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT; + BUG_ON(len > ENTRIES_PER_PAGEPAGE*ENTRIES_PER_PAGE/2); + baseidx = len - 1; + baseidx -= baseidx % ENTRIES_PER_PAGEPAGE; + base = (swp_entry_t ***) info->i_indirect + + ENTRIES_PER_PAGE/2 + baseidx/ENTRIES_PER_PAGEPAGE; + len -= baseidx; + baseidx += ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT; + } + + if (index > baseidx) { + info->next_index = 
index; + start = index - baseidx; + } else { + info->next_index = baseidx; + start = 0; + } + return *base? shmem_truncate_direct(info, base, start, len): 0; +} + +static void shmem_truncate(struct inode *inode) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + unsigned long freed = 0; + unsigned long index; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (index >= info->next_index) + return; + + spin_lock(&info->lock); + while (index < info->next_index) + freed += shmem_truncate_indirect(info, index); + BUG_ON(info->swapped > info->next_index); + + if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { + /* + * Call truncate_inode_pages again: racing shmem_unuse_inode + * may have swizzled a page in from swap since vmtruncate or + * generic_delete_inode did it, before we lowered next_index. + * Also, though shmem_getpage checks i_size before adding to + * cache, no recheck after: so fix the narrow window there too. + */ + info->flags |= SHMEM_TRUNCATE; + spin_unlock(&info->lock); + truncate_inode_pages(inode->i_mapping, inode->i_size); + spin_lock(&info->lock); + info->flags &= ~SHMEM_TRUNCATE; + } + + spin_unlock(&info->lock); + spin_lock(&sbinfo->stat_lock); + sbinfo->free_blocks += freed; + inode->i_blocks -= freed*BLOCKS_PER_PAGE; + spin_unlock(&sbinfo->stat_lock); +} + +static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct page *page = NULL; + int error; + + if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_size < inode->i_size) { + /* + * If truncating down to a partial page, then + * if that page is already allocated, hold it + * in memory until the truncation is over, so + * truncate_partial_page cannnot miss it were + * it assigned to swap. + */ + if (attr->ia_size & (PAGE_CACHE_SIZE-1)) { + (void) shmem_getpage(inode, + attr->ia_size>>PAGE_CACHE_SHIFT, + &page, SGP_READ); + } + /* + * Reset SHMEM_PAGEIN flag so that shmem_truncate can + * detect if any pages might have been added to cache + * after truncate_inode_pages. But we needn't bother + * if it's being fully truncated to zero-length: the + * nrpages check is efficient enough in that case. 
+ */ + if (attr->ia_size) { + struct shmem_inode_info *info = SHMEM_I(inode); + spin_lock(&info->lock); + info->flags &= ~SHMEM_PAGEIN; + spin_unlock(&info->lock); + } + } + } + + error = inode_change_ok(inode, attr); + if (!error) + error = inode_setattr(inode, attr); + if (page) + page_cache_release(page); + return error; +} + +static void shmem_delete_inode(struct inode *inode) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); + + if (inode->i_op->truncate == shmem_truncate) { + spin_lock(&shmem_ilock); + list_del(&info->list); + spin_unlock(&shmem_ilock); + inode->i_size = 0; + shmem_truncate(inode); + } + BUG_ON(inode->i_blocks); + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + clear_inode(inode); +} + +static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) +{ + swp_entry_t *ptr; + + for (ptr = dir; ptr < edir; ptr++) { + if (ptr->val == entry.val) + return ptr - dir; + } + return -1; +} + +static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) +{ + struct inode *inode; + struct address_space *mapping; + swp_entry_t *ptr; + unsigned long idx; + int offset; + + idx = 0; + ptr = info->i_direct; + spin_lock(&info->lock); + offset = info->next_index; + if (offset > SHMEM_NR_DIRECT) + offset = SHMEM_NR_DIRECT; + offset = shmem_find_swp(entry, ptr, ptr + offset); + if (offset >= 0) + goto found; + + for (idx = SHMEM_NR_DIRECT; idx < info->next_index; + idx += ENTRIES_PER_PAGE) { + ptr = shmem_swp_entry(info, idx, NULL); + if (!ptr) + continue; + offset = info->next_index - idx; + if (offset > ENTRIES_PER_PAGE) + offset = ENTRIES_PER_PAGE; + offset = shmem_find_swp(entry, ptr, ptr + offset); + if (offset >= 0) + goto found; + } + spin_unlock(&info->lock); + return 0; +found: + idx += offset; + inode = info->inode; + mapping = inode->i_mapping; + delete_from_swap_cache(page); + if (add_to_page_cache_unique(page, + mapping, idx, page_hash(mapping, idx)) == 0) { + info->flags |= SHMEM_PAGEIN; + ptr[offset].val = 0; + info->swapped--; + } else if (add_to_swap_cache(page, entry) != 0) + BUG(); + spin_unlock(&info->lock); + SetPageUptodate(page); + /* + * Decrement swap count even when the entry is left behind: + * try_to_unuse will skip over mms, then reincrement count. + */ + swap_free(entry); + return 1; +} + +/* + * shmem_unuse() search for an eventually swapped out shmem page. + */ +int shmem_unuse(swp_entry_t entry, struct page *page) +{ + struct list_head *p; + struct shmem_inode_info *info; + int found = 0; + + spin_lock(&shmem_ilock); + list_for_each(p, &shmem_inodes) { + info = list_entry(p, struct shmem_inode_info, list); + + if (info->swapped && shmem_unuse_inode(info, entry, page)) { + /* move head to start search for next from here */ + list_move_tail(&shmem_inodes, &info->list); + found = 1; + break; + } + } + spin_unlock(&shmem_ilock); + return found; +} + +/* + * Move the page from the page cache to the swap cache. 
+ */ +static int shmem_writepage(struct page *page) +{ + struct shmem_inode_info *info; + swp_entry_t *entry, swap; + struct address_space *mapping; + unsigned long index; + struct inode *inode; + + BUG_ON(!PageLocked(page)); + if (!PageLaunder(page)) + goto fail; + + mapping = page->mapping; + index = page->index; + inode = mapping->host; + info = SHMEM_I(inode); + if (info->flags & VM_LOCKED) + goto fail; +getswap: + swap = get_swap_page(); + if (!swap.val) + goto fail; + + spin_lock(&info->lock); + if (index >= info->next_index) { + BUG_ON(!(info->flags & SHMEM_TRUNCATE)); + spin_unlock(&info->lock); + swap_free(swap); + goto fail; + } + entry = shmem_swp_entry(info, index, NULL); + BUG_ON(!entry); + BUG_ON(entry->val); + + /* Remove it from the page cache */ + remove_inode_page(page); + page_cache_release(page); + + /* Add it to the swap cache */ + if (add_to_swap_cache(page, swap) != 0) { + /* + * Raced with "speculative" read_swap_cache_async. + * Add page back to page cache, unref swap, try again. + */ + add_to_page_cache_locked(page, mapping, index); + info->flags |= SHMEM_PAGEIN; + spin_unlock(&info->lock); + swap_free(swap); + goto getswap; + } + + *entry = swap; + info->swapped++; + spin_unlock(&info->lock); + SetPageUptodate(page); + set_page_dirty(page); + UnlockPage(page); + return 0; +fail: + return fail_writepage(page); +} + +/* + * shmem_getpage - either get the page from swap or allocate a new one + * + * If we allocate a new one we do not mark it dirty. That's up to the + * vm. If we swap it in we mark it dirty since we also free the swap + * entry since a page cannot live in both the swap and page cache + */ +static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **pagep, enum sgp_type sgp) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo; + struct page *filepage = *pagep; + struct page *swappage; + swp_entry_t *entry; + swp_entry_t swap; + int error = 0; + + if (idx >= SHMEM_MAX_INDEX) { + error = -EFBIG; + goto failed; + } + + /* + * Normally, filepage is NULL on entry, and either found + * uptodate immediately, or allocated and zeroed, or read + * in under swappage, which is then assigned to filepage. + * But shmem_readpage and shmem_prepare_write pass in a locked + * filepage, which may be found not uptodate by other callers + * too, and may need to be copied from the swappage read in. + */ +repeat: + if (!filepage) + filepage = find_lock_page(mapping, idx); + if (filepage && Page_Uptodate(filepage)) + goto done; + + spin_lock(&info->lock); + entry = shmem_swp_alloc(info, idx, sgp); + if (IS_ERR(entry)) { + spin_unlock(&info->lock); + error = PTR_ERR(entry); + goto failed; + } + swap = *entry; + + if (swap.val) { + /* Look it up and read it in.. 
*/ + swappage = lookup_swap_cache(swap); + if (!swappage) { + spin_unlock(&info->lock); + swapin_readahead(swap); + swappage = read_swap_cache_async(swap); + if (!swappage) { + spin_lock(&info->lock); + entry = shmem_swp_alloc(info, idx, sgp); + if (IS_ERR(entry)) + error = PTR_ERR(entry); + else if (entry->val == swap.val) + error = -ENOMEM; + spin_unlock(&info->lock); + if (error) + goto failed; + goto repeat; + } + wait_on_page(swappage); + page_cache_release(swappage); + goto repeat; + } + + /* We have to do this with page locked to prevent races */ + if (TryLockPage(swappage)) { + spin_unlock(&info->lock); + wait_on_page(swappage); + page_cache_release(swappage); + goto repeat; + } + if (!Page_Uptodate(swappage)) { + spin_unlock(&info->lock); + UnlockPage(swappage); + page_cache_release(swappage); + error = -EIO; + goto failed; + } + + delete_from_swap_cache(swappage); + if (filepage) { + entry->val = 0; + info->swapped--; + spin_unlock(&info->lock); + flush_page_to_ram(swappage); + copy_highpage(filepage, swappage); + UnlockPage(swappage); + page_cache_release(swappage); + flush_dcache_page(filepage); + SetPageUptodate(filepage); + SetPageDirty(filepage); + swap_free(swap); + } else if (add_to_page_cache_unique(swappage, + mapping, idx, page_hash(mapping, idx)) == 0) { + info->flags |= SHMEM_PAGEIN; + entry->val = 0; + info->swapped--; + spin_unlock(&info->lock); + filepage = swappage; + SetPageUptodate(filepage); + SetPageDirty(filepage); + swap_free(swap); + } else { + if (add_to_swap_cache(swappage, swap) != 0) + BUG(); + spin_unlock(&info->lock); + SetPageUptodate(swappage); + SetPageDirty(swappage); + UnlockPage(swappage); + page_cache_release(swappage); + goto repeat; + } + } else if (sgp == SGP_READ && !filepage) { + filepage = find_get_page(mapping, idx); + if (filepage && + (!Page_Uptodate(filepage) || TryLockPage(filepage))) { + spin_unlock(&info->lock); + wait_on_page(filepage); + page_cache_release(filepage); + filepage = NULL; + goto repeat; + } + spin_unlock(&info->lock); + } else { + sbinfo = SHMEM_SB(inode->i_sb); + spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_blocks == 0) { + spin_unlock(&sbinfo->stat_lock); + spin_unlock(&info->lock); + error = -ENOSPC; + goto failed; + } + sbinfo->free_blocks--; + inode->i_blocks += BLOCKS_PER_PAGE; + spin_unlock(&sbinfo->stat_lock); + + if (!filepage) { + spin_unlock(&info->lock); + filepage = page_cache_alloc(mapping); + if (!filepage) { + shmem_free_block(inode); + error = -ENOMEM; + goto failed; + } + + spin_lock(&info->lock); + entry = shmem_swp_alloc(info, idx, sgp); + if (IS_ERR(entry)) + error = PTR_ERR(entry); + if (error || entry->val || + add_to_page_cache_unique(filepage, + mapping, idx, page_hash(mapping, idx)) != 0) { + spin_unlock(&info->lock); + page_cache_release(filepage); + shmem_free_block(inode); + filepage = NULL; + if (error) + goto failed; + goto repeat; + } + info->flags |= SHMEM_PAGEIN; + } + + spin_unlock(&info->lock); + clear_highpage(filepage); + flush_dcache_page(filepage); + SetPageUptodate(filepage); + } +done: + if (!*pagep) { + if (filepage) + UnlockPage(filepage); + else + filepage = ZERO_PAGE(0); + *pagep = filepage; + } + if (PageError(filepage)) + ClearPageError(filepage); + return 0; + +failed: + if (filepage) { + if (*pagep == filepage) + SetPageError(filepage); + else { + UnlockPage(filepage); + page_cache_release(filepage); + } + } + return error; +} + +struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int unused) +{ + struct inode *inode = 
vma->vm_file->f_dentry->d_inode; + struct page *page = NULL; + unsigned long idx; + int error; + + idx = (address - vma->vm_start) >> PAGE_SHIFT; + idx += vma->vm_pgoff; + idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + + error = shmem_getpage(inode, idx, &page, SGP_CACHE); + if (error) + return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS; + + mark_page_accessed(page); + flush_page_to_ram(page); + return page; +} + +void shmem_lock(struct file *file, int lock) +{ + struct inode *inode = file->f_dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); + + spin_lock(&info->lock); + if (lock) + info->flags |= VM_LOCKED; + else + info->flags &= ~VM_LOCKED; + spin_unlock(&info->lock); +} + +static int shmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct vm_operations_struct *ops; + struct inode *inode = file->f_dentry->d_inode; + + ops = &shmem_vm_ops; + if (!S_ISREG(inode->i_mode)) + return -EACCES; + UPDATE_ATIME(inode); + vma->vm_ops = ops; + return 0; +} + +static struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev) +{ + struct inode *inode; + struct shmem_inode_info *info; + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + + spin_lock(&sbinfo->stat_lock); + if (!sbinfo->free_inodes) { + spin_unlock(&sbinfo->stat_lock); + return NULL; + } + sbinfo->free_inodes--; + spin_unlock(&sbinfo->stat_lock); + + inode = new_inode(sb); + if (inode) { + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_rdev = NODEV; + inode->i_mapping->a_ops = &shmem_aops; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + info = SHMEM_I(inode); + info->inode = inode; + spin_lock_init(&info->lock); + switch (mode & S_IFMT) { + default: + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_op = &shmem_inode_operations; + inode->i_fop = &shmem_file_operations; + spin_lock(&shmem_ilock); + list_add_tail(&info->list, &shmem_inodes); + spin_unlock(&shmem_ilock); + break; + case S_IFDIR: + inode->i_nlink++; + /* Some things misbehave if size == 0 on a directory */ + inode->i_size = 2 * BOGO_DIRENT_SIZE; + inode->i_op = &shmem_dir_inode_operations; + inode->i_fop = &dcache_dir_ops; + break; + case S_IFLNK: + break; + } + } + return inode; +} + +static int shmem_set_size(struct shmem_sb_info *info, + unsigned long max_blocks, unsigned long max_inodes) +{ + int error; + unsigned long blocks, inodes; + + spin_lock(&info->stat_lock); + blocks = info->max_blocks - info->free_blocks; + inodes = info->max_inodes - info->free_inodes; + error = -EINVAL; + if (max_blocks < blocks) + goto out; + if (max_inodes < inodes) + goto out; + error = 0; + info->max_blocks = max_blocks; + info->free_blocks = max_blocks - blocks; + info->max_inodes = max_inodes; + info->free_inodes = max_inodes - inodes; +out: + spin_unlock(&info->stat_lock); + return error; +} + +#ifdef CONFIG_TMPFS + +static struct inode_operations shmem_symlink_inode_operations; +static struct inode_operations shmem_symlink_inline_operations; + +/* + * tmpfs itself makes no use of generic_file_read, generic_file_mmap + * or generic_file_write; but shmem_readpage, shmem_prepare_write and + * shmem_commit_write let a tmpfs file be used below the loop driver, + * and shmem_readpage lets a tmpfs file be used by sendfile. 
+ */ +static int +shmem_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int error = shmem_getpage(inode, page->index, &page, SGP_CACHE); + UnlockPage(page); + return error; +} + +static int +shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) +{ + struct inode *inode = page->mapping->host; + return shmem_getpage(inode, page->index, &page, SGP_WRITE); +} + +static int +shmem_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + if (pos > inode->i_size) + inode->i_size = pos; + SetPageDirty(page); + return 0; +} + +static ssize_t +shmem_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) +{ + struct inode *inode = file->f_dentry->d_inode; + loff_t pos; + unsigned long written; + ssize_t err; + + if ((ssize_t) count < 0) + return -EINVAL; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + down(&inode->i_sem); + + pos = *ppos; + written = 0; + + err = precheck_file_write(file, inode, &count, &pos); + if (err || !count) + goto out; + + remove_suid(inode); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + do { + struct page *page = NULL; + unsigned long bytes, index, offset; + char *kaddr; + int left; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + /* + * We don't hold page lock across copy from user - + * what would it guard against? - so no deadlock here. + */ + + err = shmem_getpage(inode, index, &page, SGP_WRITE); + if (err) + break; + + kaddr = kmap(page); + left = __copy_from_user(kaddr + offset, buf, bytes); + kunmap(page); + + written += bytes; + count -= bytes; + pos += bytes; + buf += bytes; + if (pos > inode->i_size) + inode->i_size = pos; + + flush_dcache_page(page); + SetPageDirty(page); + SetPageReferenced(page); + page_cache_release(page); + + if (left) { + pos -= left; + written -= left; + err = -EFAULT; + break; + } + } while (count); + + *ppos = pos; + if (written) + err = written; +out: + up(&inode->i_sem); + return err; +} + +static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + unsigned long index, offset; + loff_t pos = *ppos; + + if (unlikely(pos < 0)) + return; + + index = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; + + for (;;) { + struct page *page = NULL; + unsigned long end_index, nr, ret; + + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + if (index > end_index) + break; + if (index == end_index) { + nr = inode->i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) + break; + } + + desc->error = shmem_getpage(inode, index, &page, SGP_READ); + if (desc->error) { + if (desc->error == -EINVAL) + desc->error = 0; + break; + } + + /* + * We must evaluate after, since reads (unlike writes) + * are called without i_sem protection against truncate + */ + nr = PAGE_CACHE_SIZE; + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + if (index == end_index) { + nr = inode->i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) { + page_cache_release(page); + break; + } + } + nr -= offset; + + if (page != ZERO_PAGE(0)) { + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before 
reading the page on the kernel side. + */ + if (mapping->i_mmap_shared != NULL) + flush_dcache_page(page); + /* + * Mark the page accessed if we read the + * beginning or we just did an lseek. + */ + if (!offset || !filp->f_reada) + mark_page_accessed(page); + } + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + * + * The actor routine returns how many bytes were actually used.. + * NOTE! This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). + */ + ret = file_read_actor(desc, page, offset, nr); + offset += ret; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + + page_cache_release(page); + if (ret != nr || !desc->count) + break; + } + + *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + filp->f_reada = 1; + UPDATE_ATIME(inode); +} + +static ssize_t shmem_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos) +{ + read_descriptor_t desc; + + if ((ssize_t) count < 0) + return -EINVAL; + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + if (!count) + return 0; + + desc.written = 0; + desc.count = count; + desc.buf = buf; + desc.error = 0; + + do_shmem_file_read(filp, ppos, &desc); + if (desc.written) + return desc.written; + return desc.error; +} + +static int shmem_statfs(struct super_block *sb, struct statfs *buf) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + + buf->f_type = TMPFS_MAGIC; + buf->f_bsize = PAGE_CACHE_SIZE; + spin_lock(&sbinfo->stat_lock); + buf->f_blocks = sbinfo->max_blocks; + buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; + buf->f_files = sbinfo->max_inodes; + buf->f_ffree = sbinfo->free_inodes; + spin_unlock(&sbinfo->stat_lock); + buf->f_namelen = NAME_MAX; + return 0; +} + +/* + * Retaining negative dentries for an in-memory filesystem just wastes + * memory and lookup time: arrange for them to be deleted immediately. + */ +static int shmem_delete_dentry(struct dentry *dentry) +{ + return 1; +} + +/* + * Lookup the data. This is trivial - if the dentry didn't already + * exist, we know it is negative. Set d_op to delete negative dentries. + */ +static struct dentry *shmem_lookup(struct inode *dir, struct dentry *dentry) +{ + static struct dentry_operations shmem_dentry_operations = { + .d_delete = shmem_delete_dentry, + }; + + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + dentry->d_op = &shmem_dentry_operations; + d_add(dentry, NULL); + return NULL; +} + +/* + * File creation. Allocate an inode, and we're done.. 
+ */ +static int shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, int dev) +{ + struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev); + int error = -ENOSPC; + + if (inode) { + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } + dir->i_size += BOGO_DIRENT_SIZE; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + error = 0; + } + return error; +} + +static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int error; + + if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) + return error; + dir->i_nlink++; + return 0; +} + +static int shmem_create(struct inode *dir, struct dentry *dentry, int mode) +{ + return shmem_mknod(dir, dentry, mode | S_IFREG, 0); +} + +/* + * Link a file.. + */ +static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + dir->i_size += BOGO_DIRENT_SIZE; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_nlink++; + atomic_inc(&inode->i_count); /* New dentry reference */ + dget(dentry); /* Extra pinning count for the created dentry */ + d_instantiate(dentry, inode); + return 0; +} + +static inline int shmem_positive(struct dentry *dentry) +{ + return dentry->d_inode && !d_unhashed(dentry); +} + +/* + * Check that a directory is empty (this works + * for regular files too, they'll just always be + * considered empty..). + * + * Note that an empty directory can still have + * children, they just all have to be negative.. + */ +static int shmem_empty(struct dentry *dentry) +{ + struct list_head *list; + + spin_lock(&dcache_lock); + list = dentry->d_subdirs.next; + + while (list != &dentry->d_subdirs) { + struct dentry *de = list_entry(list, struct dentry, d_child); + + if (shmem_positive(de)) { + spin_unlock(&dcache_lock); + return 0; + } + list = list->next; + } + spin_unlock(&dcache_lock); + return 1; +} + +static int shmem_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + dir->i_size -= BOGO_DIRENT_SIZE; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_nlink--; + dput(dentry); /* Undo the count from "create" - this does all the work */ + return 0; +} + +static int shmem_rmdir(struct inode *dir, struct dentry *dentry) +{ + if (!shmem_empty(dentry)) + return -ENOTEMPTY; + + dir->i_nlink--; + return shmem_unlink(dir, dentry); +} + +/* + * The VFS layer already does all the dentry stuff for rename, + * we just have to decrement the usage count for the target if + * it exists so that the VFS layer correctly free's it when it + * gets overwritten. 
+ */ +static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *inode = old_dentry->d_inode; + int they_are_dirs = S_ISDIR(inode->i_mode); + + if (!shmem_empty(new_dentry)) + return -ENOTEMPTY; + + if (new_dentry->d_inode) { + (void) shmem_unlink(new_dir, new_dentry); + if (they_are_dirs) + old_dir->i_nlink--; + } else if (they_are_dirs) { + old_dir->i_nlink--; + new_dir->i_nlink++; + } + + old_dir->i_size -= BOGO_DIRENT_SIZE; + new_dir->i_size += BOGO_DIRENT_SIZE; + old_dir->i_ctime = old_dir->i_mtime = + new_dir->i_ctime = new_dir->i_mtime = + inode->i_ctime = CURRENT_TIME; + return 0; +} + +static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + int error; + int len; + struct inode *inode; + struct page *page = NULL; + char *kaddr; + struct shmem_inode_info *info; + + len = strlen(symname) + 1; + if (len > PAGE_CACHE_SIZE) + return -ENAMETOOLONG; + + inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); + if (!inode) + return -ENOSPC; + + info = SHMEM_I(inode); + inode->i_size = len-1; + if (len <= sizeof(struct shmem_inode_info)) { + /* do it inline */ + memcpy(info, symname, len); + inode->i_op = &shmem_symlink_inline_operations; + } else { + error = shmem_getpage(inode, 0, &page, SGP_WRITE); + if (error) { + iput(inode); + return error; + } + inode->i_op = &shmem_symlink_inode_operations; + spin_lock(&shmem_ilock); + list_add_tail(&info->list, &shmem_inodes); + spin_unlock(&shmem_ilock); + kaddr = kmap(page); + memcpy(kaddr, symname, len); + kunmap(page); + SetPageDirty(page); + page_cache_release(page); + } + if (dir->i_mode & S_ISGID) + inode->i_gid = dir->i_gid; + dir->i_size += BOGO_DIRENT_SIZE; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + d_instantiate(dentry, inode); + dget(dentry); + return 0; +} + +static int shmem_readlink_inline(struct dentry *dentry, char *buffer, int buflen) +{ + return vfs_readlink(dentry, buffer, buflen, (const char *)SHMEM_I(dentry->d_inode)); +} + +static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) +{ + return vfs_follow_link(nd, (const char *)SHMEM_I(dentry->d_inode)); +} + +static int shmem_readlink(struct dentry *dentry, char *buffer, int buflen) +{ + struct page *page = NULL; + int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ); + if (res) + return res; + res = vfs_readlink(dentry, buffer, buflen, kmap(page)); + kunmap(page); + mark_page_accessed(page); + page_cache_release(page); + return res; +} + +static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct page *page = NULL; + int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ); + if (res) + return res; + res = vfs_follow_link(nd, kmap(page)); + kunmap(page); + mark_page_accessed(page); + page_cache_release(page); + return res; +} + +static struct inode_operations shmem_symlink_inline_operations = { + readlink: shmem_readlink_inline, + follow_link: shmem_follow_link_inline, +}; + +static struct inode_operations shmem_symlink_inode_operations = { + truncate: shmem_truncate, + readlink: shmem_readlink, + follow_link: shmem_follow_link, +}; + +static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes) +{ + char *this_char, *value, *rest; + + while ((this_char = strsep(&options, ",")) != NULL) { + if (!*this_char) + continue; + if ((value = strchr(this_char,'=')) != NULL) { + *value++ = 0; + } else { + printk(KERN_ERR + "tmpfs: No 
value for mount option '%s'\n", + this_char); + return 1; + } + + if (!strcmp(this_char,"size")) { + unsigned long long size; + size = memparse(value,&rest); + if (*rest == '%') { + struct sysinfo si; + si_meminfo(&si); + size <<= PAGE_SHIFT; + size *= si.totalram; + do_div(size, 100); + rest++; + } + if (*rest) + goto bad_val; + *blocks = size >> PAGE_CACHE_SHIFT; + } else if (!strcmp(this_char,"nr_blocks")) { + *blocks = memparse(value,&rest); + if (*rest) + goto bad_val; + } else if (!strcmp(this_char,"nr_inodes")) { + *inodes = memparse(value,&rest); + if (*rest) + goto bad_val; + } else if (!strcmp(this_char,"mode")) { + if (!mode) + continue; + *mode = simple_strtoul(value,&rest,8); + if (*rest) + goto bad_val; + } else if (!strcmp(this_char,"uid")) { + if (!uid) + continue; + *uid = simple_strtoul(value,&rest,0); + if (*rest) + goto bad_val; + } else if (!strcmp(this_char,"gid")) { + if (!gid) + continue; + *gid = simple_strtoul(value,&rest,0); + if (*rest) + goto bad_val; + } else { + printk(KERN_ERR "tmpfs: Bad mount option %s\n", + this_char); + return 1; + } + } + return 0; + +bad_val: + printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", + value, this_char); + return 1; +} + +static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + unsigned long max_blocks = sbinfo->max_blocks; + unsigned long max_inodes = sbinfo->max_inodes; + + if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes)) + return -EINVAL; + return shmem_set_size(sbinfo, max_blocks, max_inodes); +} + +static int shmem_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + return 0; +} +#endif + +static struct super_block *shmem_read_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + struct dentry *root; + unsigned long blocks, inodes; + int mode = S_IRWXUGO | S_ISVTX; + uid_t uid = current->fsuid; + gid_t gid = current->fsgid; + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + struct sysinfo si; + + /* + * Per default we only allow half of the physical ram per + * tmpfs instance + */ + si_meminfo(&si); + blocks = inodes = si.totalram / 2; + +#ifdef CONFIG_TMPFS + if (shmem_parse_options(data, &mode, &uid, &gid, &blocks, &inodes)) + return NULL; +#endif + + spin_lock_init(&sbinfo->stat_lock); + sbinfo->max_blocks = blocks; + sbinfo->free_blocks = blocks; + sbinfo->max_inodes = inodes; + sbinfo->free_inodes = inodes; + sb->s_maxbytes = SHMEM_MAX_BYTES; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = TMPFS_MAGIC; + sb->s_op = &shmem_ops; + inode = shmem_get_inode(sb, S_IFDIR | mode, 0); + if (!inode) + return NULL; + + inode->i_uid = uid; + inode->i_gid = gid; + root = d_alloc_root(inode); + if (!root) { + iput(inode); + return NULL; + } + sb->s_root = root; + return sb; +} + +static struct address_space_operations shmem_aops = { + removepage: shmem_removepage, + writepage: shmem_writepage, +#ifdef CONFIG_TMPFS + readpage: shmem_readpage, + prepare_write: shmem_prepare_write, + commit_write: shmem_commit_write, +#endif +}; + +static struct file_operations shmem_file_operations = { + mmap: shmem_mmap, +#ifdef CONFIG_TMPFS + read: shmem_file_read, + write: shmem_file_write, + fsync: shmem_sync_file, +#endif +}; + +static struct inode_operations shmem_inode_operations = { + truncate: shmem_truncate, + setattr: shmem_notify_change, +}; + +static struct inode_operations shmem_dir_inode_operations = { +#ifdef CONFIG_TMPFS + create: 
shmem_create, + lookup: shmem_lookup, + link: shmem_link, + unlink: shmem_unlink, + symlink: shmem_symlink, + mkdir: shmem_mkdir, + rmdir: shmem_rmdir, + mknod: shmem_mknod, + rename: shmem_rename, +#endif +}; + +static struct super_operations shmem_ops = { +#ifdef CONFIG_TMPFS + statfs: shmem_statfs, + remount_fs: shmem_remount_fs, +#endif + delete_inode: shmem_delete_inode, + put_inode: force_delete, +}; + +static struct vm_operations_struct shmem_vm_ops = { + nopage: shmem_nopage, +}; + +#ifdef CONFIG_TMPFS +/* type "shm" will be tagged obsolete in 2.5 */ +static DECLARE_FSTYPE(shmem_fs_type, "shm", shmem_read_super, FS_LITTER); +static DECLARE_FSTYPE(tmpfs_fs_type, "tmpfs", shmem_read_super, FS_LITTER); +#else +static DECLARE_FSTYPE(tmpfs_fs_type, "tmpfs", shmem_read_super, FS_LITTER|FS_NOMOUNT); +#endif +static struct vfsmount *shm_mnt; + +static int __init init_tmpfs(void) +{ + int error; + + error = register_filesystem(&tmpfs_fs_type); + if (error) { + printk(KERN_ERR "Could not register tmpfs\n"); + goto out3; + } +#ifdef CONFIG_TMPFS + error = register_filesystem(&shmem_fs_type); + if (error) { + printk(KERN_ERR "Could not register shm fs\n"); + goto out2; + } + devfs_mk_dir(NULL, "shm", NULL); +#endif + shm_mnt = kern_mount(&tmpfs_fs_type); + if (IS_ERR(shm_mnt)) { + error = PTR_ERR(shm_mnt); + printk(KERN_ERR "Could not kern_mount tmpfs\n"); + goto out1; + } + + /* The internal instance should not do size checking */ + shmem_set_size(SHMEM_SB(shm_mnt->mnt_sb), ULONG_MAX, ULONG_MAX); + return 0; + +out1: +#ifdef CONFIG_TMPFS + unregister_filesystem(&shmem_fs_type); +out2: +#endif + unregister_filesystem(&tmpfs_fs_type); +out3: + shm_mnt = ERR_PTR(error); + return error; +} +module_init(init_tmpfs) + +/* + * shmem_file_setup - get an unlinked file living in tmpfs + * + * @name: name for dentry (to be seen in /proc/<pid>/maps + * @size: size to be set for the file + * + */ +struct file *shmem_file_setup(char *name, loff_t size) +{ + int error; + struct file *file; + struct inode *inode; + struct dentry *dentry, *root; + struct qstr this; + int vm_enough_memory(long pages); + + if (IS_ERR(shm_mnt)) + return (void *)shm_mnt; + + if (size > SHMEM_MAX_BYTES) + return ERR_PTR(-EINVAL); + + if (!vm_enough_memory(VM_ACCT(size))) + return ERR_PTR(-ENOMEM); + + this.name = name; + this.len = strlen(name); + this.hash = 0; /* will go */ + root = shm_mnt->mnt_root; + dentry = d_alloc(root, &this); + if (!dentry) + return ERR_PTR(-ENOMEM); + + error = -ENFILE; + file = get_empty_filp(); + if (!file) + goto put_dentry; + + error = -ENOSPC; + inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); + if (!inode) + goto close_file; + + d_instantiate(dentry, inode); + inode->i_size = size; + inode->i_nlink = 0; /* It is unlinked */ + file->f_vfsmnt = mntget(shm_mnt); + file->f_dentry = dentry; + file->f_op = &shmem_file_operations; + file->f_mode = FMODE_WRITE | FMODE_READ; + return file; + +close_file: + put_filp(file); +put_dentry: + dput(dentry); + return ERR_PTR(error); +} + +/* + * shmem_zero_setup - setup a shared anonymous mapping + * + * @vma: the vma to be mmapped is prepared by do_mmap_pgoff + */ +int shmem_zero_setup(struct vm_area_struct *vma) +{ + struct file *file; + loff_t size = vma->vm_end - vma->vm_start; + + file = shmem_file_setup("dev/zero", size); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (vma->vm_file) + fput(vma->vm_file); + vma->vm_file = file; + vma->vm_ops = &shmem_vm_ops; + return 0; +} + +EXPORT_SYMBOL(shmem_file_setup); diff --git 
a/uClinux-2.4.31-uc0/mm/slab.c b/uClinux-2.4.31-uc0/mm/slab.c new file mode 100644 index 0000000..46ffd37 --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/slab.c @@ -0,0 +1,2078 @@ +/* + * linux/mm/slab.c + * Written by Mark Hemment, 1996/97. + * (markhe@nextd.demon.co.uk) + * + * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli + * + * Major cleanup, different bufctl logic, per-cpu arrays + * (c) 2000 Manfred Spraul + * + * An implementation of the Slab Allocator as described in outline in; + * UNIX Internals: The New Frontiers by Uresh Vahalia + * Pub: Prentice Hall ISBN 0-13-101908-2 + * or with a little more detail in; + * The Slab Allocator: An Object-Caching Kernel Memory Allocator + * Jeff Bonwick (Sun Microsystems). + * Presented at: USENIX Summer 1994 Technical Conference + * + * + * The memory is organized in caches, one cache for each object type. + * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) + * Each cache consists out of many slabs (they are small (usually one + * page long) and always contiguous), and each slab contains multiple + * initialized objects. + * + * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, + * normal). If you need a special memory type, then must create a new + * cache for that memory type. + * + * In order to reduce fragmentation, the slabs are sorted in 3 groups: + * full slabs with 0 free objects + * partial slabs + * empty slabs with no allocated objects + * + * If partial slabs exist, then new allocations come from these slabs, + * otherwise from empty slabs or new slabs are allocated. + * + * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache + * during kmem_cache_destroy(). The caller must prevent concurrent allocs. + * + * On SMP systems, each cache has a short per-cpu head array, most allocs + * and frees go into that array, and if that array overflows, then 1/2 + * of the entries in the array are given back into the global cache. + * This reduces the number of spinlock operations. + * + * The c_cpuarray may not be read with enabled local interrupts. + * + * SMP synchronization: + * constructors and destructors are called without any locking. + * Several members in kmem_cache_t and slab_t never change, they + * are accessed without any locking. + * The per-cpu arrays are never accessed from the wrong cpu, no locking. + * The non-constant members are protected with a per-cache irq spinlock. + * + * Further notes from the original documentation: + * + * 11 April '97. Started multi-threading - markhe + * The global cache-chain is protected by the semaphore 'cache_chain_sem'. + * The sem is only needed when accessing/extending the cache-chain, which + * can never happen inside an interrupt (kmem_cache_create(), + * kmem_cache_shrink() and kmem_cache_reap()). + * + * To prevent kmem_cache_shrink() trying to shrink a 'growing' cache (which + * maybe be sleeping and therefore not holding the semaphore/lock), the + * growing field is used. This also prevents reaping from a cache. + * + * At present, each engine can be growing a cache. This should be blocked. + * + */ + +#include <linux/config.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/compiler.h> +#include <linux/seq_file.h> +#include <asm/uaccess.h> + +/* + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, + * SLAB_RED_ZONE & SLAB_POISON. + * 0 for faster, smaller code (especially in the critical paths). + * + * STATS - 1 to collect stats for /proc/slabinfo. 
+ * 0 for faster, smaller code (especially in the critical paths). + * + * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) + */ + +#ifdef CONFIG_DEBUG_SLAB +#define DEBUG 1 +#define STATS 1 +#define FORCED_DEBUG 1 +#else +#define DEBUG 0 +#define STATS 0 +#define FORCED_DEBUG 0 +#endif + +/* + * Parameters for kmem_cache_reap + */ +#define REAP_SCANLEN 10 +#define REAP_PERFECT 10 + +/* Shouldn't this be in a header file somewhere? */ +#define BYTES_PER_WORD sizeof(void *) + +/* Legal flag mask for kmem_cache_create(). */ +#if DEBUG +# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ + SLAB_POISON | SLAB_HWCACHE_ALIGN | \ + SLAB_NO_REAP | SLAB_CACHE_DMA | \ + SLAB_MUST_HWCACHE_ALIGN) +#else +# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ + SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN) +#endif + +/* + * kmem_bufctl_t: + * + * Bufctl's are used for linking objs within a slab + * linked offsets. + * + * This implementation relies on "struct page" for locating the cache & + * slab an object belongs to. + * This allows the bufctl structure to be small (one int), but limits + * the number of objects a slab (not a cache) can contain when off-slab + * bufctls are used. The limit is the size of the largest general cache + * that does not use off-slab slabs. + * For 32bit archs with 4 kB pages, is this 56. + * This is not serious, as it is only for large objects, when it is unwise + * to have too many per slab. + * Note: This limit can be raised by introducing a general cache whose size + * is less than 512 (PAGE_SIZE<<3), but greater than 256. + */ + +#define BUFCTL_END 0xffffFFFF +#define SLAB_LIMIT 0xffffFFFE +typedef unsigned int kmem_bufctl_t; + +/* Max number of objs-per-slab for caches which use off-slab slabs. + * Needed to avoid a possible looping condition in kmem_cache_grow(). + */ +static unsigned long offslab_limit; + +/* + * slab_t + * + * Manages the objs in a slab. Placed either at the beginning of mem allocated + * for a slab, or allocated from an general cache. + * Slabs are chained into three list: fully used, partial, fully free slabs. + */ +typedef struct slab_s { + struct list_head list; + unsigned long colouroff; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; +} slab_t; + +#define slab_bufctl(slabp) \ + ((kmem_bufctl_t *)(((slab_t*)slabp)+1)) + +/* + * cpucache_t + * + * Per cpu structures + * The limit is stored in the per-cpu structure to reduce the data cache + * footprint. + */ +typedef struct cpucache_s { + unsigned int avail; + unsigned int limit; +} cpucache_t; + +#define cc_entry(cpucache) \ + ((void **)(((cpucache_t*)(cpucache))+1)) +#define cc_data(cachep) \ + ((cachep)->cpudata[smp_processor_id()]) +/* + * kmem_cache_t + * + * manages a cache. + */ + +#define CACHE_NAMELEN 20 /* max name length for a slab cache */ + +struct kmem_cache_s { +/* 1) each alloc & free */ + /* full, partial first, then free */ + struct list_head slabs_full; + struct list_head slabs_partial; + struct list_head slabs_free; + unsigned int objsize; + unsigned int flags; /* constant flags */ + unsigned int num; /* # of objs per slab */ + spinlock_t spinlock; +#ifdef CONFIG_SMP + unsigned int batchcount; +#endif + +/* 2) slab additions /removals */ + /* order of pgs per slab (2^n) */ + unsigned int gfporder; + + /* force GFP flags, e.g. 
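The kmem_bufctl_t array reached through slab_bufctl() sits directly behind each slab_t header: entry i holds the index of the next free object and slabp->free holds the index of the first free one, so the in-slab free list costs one integer per object and never stores pointers inside the objects themselves. A small user-space sketch of that index-linked free list (the object count and function names are made up for illustration):

#include <stdio.h>

#define NUM_OBJS    4
#define BUFCTL_END  0xffffffffu

static unsigned int bufctl[NUM_OBJS];	/* plays the role of slab_bufctl(slabp) */
static unsigned int free_idx;		/* plays the role of slabp->free        */

static void init_slab(void)
{
	unsigned int i;

	/* as in kmem_cache_init_objs(): each entry points at the next object */
	for (i = 0; i < NUM_OBJS; i++)
		bufctl[i] = i + 1;
	bufctl[NUM_OBJS - 1] = BUFCTL_END;
	free_idx = 0;
}

/* kmem_cache_alloc_one_tail() in miniature: pop the first free index */
static int alloc_obj(void)
{
	unsigned int obj = free_idx;

	if (obj == BUFCTL_END)
		return -1;	/* slab exhausted, the real code grows the cache */
	free_idx = bufctl[obj];
	return (int)obj;
}

/* kmem_cache_free_one() in miniature: push the index back on the list */
static void free_obj(unsigned int obj)
{
	bufctl[obj] = free_idx;
	free_idx = obj;
}

int main(void)
{
	int a, b;

	init_slab();
	a = alloc_obj();
	b = alloc_obj();
	printf("allocated objects %d and %d\n", a, b);			/* 0 and 1 */
	free_obj(a);
	printf("next allocation reuses object %d\n", alloc_obj());	/* 0 again */
	return 0;
}

Keeping indices instead of pointers is also what lets BUFCTL_END act as the list terminator and SLAB_LIMIT bound the number of objects a slab can hold.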
GFP_DMA */ + unsigned int gfpflags; + + size_t colour; /* cache colouring range */ + unsigned int colour_off; /* colour offset */ + unsigned int colour_next; /* cache colouring */ + kmem_cache_t *slabp_cache; + unsigned int growing; + unsigned int dflags; /* dynamic flags */ + + /* constructor func */ + void (*ctor)(void *, kmem_cache_t *, unsigned long); + + /* de-constructor func */ + void (*dtor)(void *, kmem_cache_t *, unsigned long); + + unsigned long failures; + +/* 3) cache creation/removal */ + char name[CACHE_NAMELEN]; + struct list_head next; +#ifdef CONFIG_SMP +/* 4) per-cpu data */ + cpucache_t *cpudata[NR_CPUS]; +#endif +#if STATS + unsigned long num_active; + unsigned long num_allocations; + unsigned long high_mark; + unsigned long grown; + unsigned long reaped; + unsigned long errors; +#ifdef CONFIG_SMP + atomic_t allochit; + atomic_t allocmiss; + atomic_t freehit; + atomic_t freemiss; +#endif +#endif +}; + +/* internal c_flags */ +#define CFLGS_OFF_SLAB 0x010000UL /* slab management in own cache */ +#define CFLGS_OPTIMIZE 0x020000UL /* optimized slab lookup */ + +/* c_dflags (dynamic flags). Need to hold the spinlock to access this member */ +#define DFLGS_GROWN 0x000001UL /* don't reap a recently grown */ + +#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) +#define OPTIMIZE(x) ((x)->flags & CFLGS_OPTIMIZE) +#define GROWN(x) ((x)->dlags & DFLGS_GROWN) + +#if STATS +#define STATS_INC_ACTIVE(x) ((x)->num_active++) +#define STATS_DEC_ACTIVE(x) ((x)->num_active--) +#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) +#define STATS_INC_GROWN(x) ((x)->grown++) +#define STATS_INC_REAPED(x) ((x)->reaped++) +#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ + (x)->high_mark = (x)->num_active; \ + } while (0) +#define STATS_INC_ERR(x) ((x)->errors++) +#else +#define STATS_INC_ACTIVE(x) do { } while (0) +#define STATS_DEC_ACTIVE(x) do { } while (0) +#define STATS_INC_ALLOCED(x) do { } while (0) +#define STATS_INC_GROWN(x) do { } while (0) +#define STATS_INC_REAPED(x) do { } while (0) +#define STATS_SET_HIGH(x) do { } while (0) +#define STATS_INC_ERR(x) do { } while (0) +#endif + +#if STATS && defined(CONFIG_SMP) +#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) +#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) +#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) +#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) +#else +#define STATS_INC_ALLOCHIT(x) do { } while (0) +#define STATS_INC_ALLOCMISS(x) do { } while (0) +#define STATS_INC_FREEHIT(x) do { } while (0) +#define STATS_INC_FREEMISS(x) do { } while (0) +#endif + +#if DEBUG +/* Magic nums for obj red zoning. + * Placed in the first word before and the first word after an obj. + */ +#define RED_MAGIC1 0x5A2CF071UL /* when obj is active */ +#define RED_MAGIC2 0x170FC2A5UL /* when obj is inactive */ + +/* ...and for poisoning */ +#define POISON_BYTE 0x5a /* byte value for poisoning */ +#define POISON_END 0xa5 /* end-byte of poisoning */ + +#endif + +/* maximum size of an obj (in 2^order pages) */ +#define MAX_OBJ_ORDER 5 /* 32 pages */ + +/* + * Do not go above this order unless 0 objects fit into the slab. + */ +#define BREAK_GFP_ORDER_HI 2 +#define BREAK_GFP_ORDER_LO 1 +static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; + +/* + * Absolute limit for the gfp order + */ +#define MAX_GFP_ORDER 5 /* 32 pages */ + + +/* Macros for storing/retrieving the cachep and or slab from the + * global 'mem_map'. These are used to find the slab an obj belongs to. 
+ * With kfree(), these are used to find the cache which an obj belongs to. + */ +#define SET_PAGE_CACHE(pg,x) ((pg)->list.next = (struct list_head *)(x)) +#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->list.next) +#define SET_PAGE_SLAB(pg,x) ((pg)->list.prev = (struct list_head *)(x)) +#define GET_PAGE_SLAB(pg) ((slab_t *)(pg)->list.prev) + +/* Size description struct for general caches. */ +typedef struct cache_sizes { + size_t cs_size; + kmem_cache_t *cs_cachep; + kmem_cache_t *cs_dmacachep; +} cache_sizes_t; + +static cache_sizes_t cache_sizes[] = { +#if PAGE_SIZE == 4096 + { 32, NULL, NULL}, +#endif + { 64, NULL, NULL}, + { 128, NULL, NULL}, + { 256, NULL, NULL}, + { 512, NULL, NULL}, + { 1024, NULL, NULL}, + { 2048, NULL, NULL}, + { 4096, NULL, NULL}, + { 8192, NULL, NULL}, + { 16384, NULL, NULL}, + { 32768, NULL, NULL}, + { 65536, NULL, NULL}, + {131072, NULL, NULL}, + { 0, NULL, NULL} +}; + +/* internal cache of cache description objs */ +static kmem_cache_t cache_cache = { + slabs_full: LIST_HEAD_INIT(cache_cache.slabs_full), + slabs_partial: LIST_HEAD_INIT(cache_cache.slabs_partial), + slabs_free: LIST_HEAD_INIT(cache_cache.slabs_free), + objsize: sizeof(kmem_cache_t), + flags: SLAB_NO_REAP, + spinlock: SPIN_LOCK_UNLOCKED, + colour_off: L1_CACHE_BYTES, + name: "kmem_cache", +}; + +/* Guard access to the cache-chain. */ +static struct semaphore cache_chain_sem; + +/* Place maintainer for reaping. */ +static kmem_cache_t *clock_searchp = &cache_cache; + +#define cache_chain (cache_cache.next) + +#ifdef CONFIG_SMP +/* + * chicken and egg problem: delay the per-cpu array allocation + * until the general caches are up. + */ +static int g_cpucache_up; + +static void enable_cpucache (kmem_cache_t *cachep); +static void enable_all_cpucaches (void); +#endif + +/* Cal the num objs, wastage, and bytes left over for a given slab size. */ +static void kmem_cache_estimate (unsigned long gfporder, size_t size, + int flags, size_t *left_over, unsigned int *num) +{ + int i; + size_t wastage = PAGE_SIZE<<gfporder; + size_t extra = 0; + size_t base = 0; + + if (!(flags & CFLGS_OFF_SLAB)) { + base = sizeof(slab_t); + extra = sizeof(kmem_bufctl_t); + } + i = 0; + while (i*size + L1_CACHE_ALIGN(base+i*extra) <= wastage) + i++; + if (i > 0) + i--; + + if (i > SLAB_LIMIT) + i = SLAB_LIMIT; + + *num = i; + wastage -= i*size; + wastage -= L1_CACHE_ALIGN(base+i*extra); + *left_over = wastage; +} + +/* Initialisation - setup the `cache' cache. */ +void __init kmem_cache_init(void) +{ + size_t left_over; + + init_MUTEX(&cache_chain_sem); + INIT_LIST_HEAD(&cache_chain); + + kmem_cache_estimate(0, cache_cache.objsize, 0, + &left_over, &cache_cache.num); + if (!cache_cache.num) + BUG(); + + cache_cache.colour = left_over/cache_cache.colour_off; + cache_cache.colour_next = 0; +} + + +/* Initialisation - setup remaining internal and general caches. + * Called after the gfp() functions have been enabled, and before smp_init(). + */ +void __init kmem_cache_sizes_init(void) +{ + cache_sizes_t *sizes = cache_sizes; + char name[20]; + /* + * Fragmentation resistance on low memory - only use bigger + * page orders on machines with more than 32MB of memory. + */ + if (num_physpages > (32 << 20) >> PAGE_SHIFT) + slab_break_gfp_order = BREAK_GFP_ORDER_HI; + do { + /* For performance, all the general caches are L1 aligned. + * This should be particularly beneficial on SMP boxes, as it + * eliminates "false sharing". 
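kmem_cache_estimate() above simply counts objects upward until one more object, plus its kmem_bufctl_t and the L1-aligned slab_t header, would no longer fit into the slab's pages; whatever is left over is reported back and later drives cache colouring. The following stand-alone rerun of that loop uses assumed example sizes (4 kB pages, 32-byte cache lines, a 24-byte slab_t and a 4-byte kmem_bufctl_t, roughly a 32-bit build):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define L1_CACHE_BYTES	32UL
#define L1_CACHE_ALIGN(x) (((x) + L1_CACHE_BYTES - 1) & ~(L1_CACHE_BYTES - 1))

static void estimate(unsigned long gfporder, unsigned long size,
		     unsigned long base, unsigned long extra)
{
	unsigned long wastage = PAGE_SIZE << gfporder;
	unsigned long i = 0;

	/* same loop as kmem_cache_estimate() */
	while (i * size + L1_CACHE_ALIGN(base + i * extra) <= wastage)
		i++;
	if (i > 0)
		i--;

	wastage -= i * size + L1_CACHE_ALIGN(base + i * extra);
	printf("order %lu, objsize %lu: %lu objs per slab, %lu bytes left over\n",
	       gfporder, size, i, wastage);
}

int main(void)
{
	/* on-slab management: slab_t plus one bufctl per object shares the page */
	estimate(0, 256, 24, 4);	/* -> 15 objs per slab, 160 bytes left over */
	/* off-slab management: no per-slab overhead inside the slab itself */
	estimate(0, 256, 0, 0);		/* -> 16 objs per slab, 0 bytes left over  */
	return 0;
}

In the on-slab case the 160 spare bytes later give 160 / 32 = 5 possible colour offsets for that cache.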
+ * Note for systems short on memory removing the alignment will + * allow tighter packing of the smaller caches. */ + snprintf(name, sizeof(name), "size-%Zd",sizes->cs_size); + if (!(sizes->cs_cachep = + kmem_cache_create(name, sizes->cs_size, + 0, SLAB_HWCACHE_ALIGN, NULL, NULL))) { + BUG(); + } + + /* Inc off-slab bufctl limit until the ceiling is hit. */ + if (!(OFF_SLAB(sizes->cs_cachep))) { + offslab_limit = sizes->cs_size-sizeof(slab_t); + offslab_limit /= 2; + } + snprintf(name, sizeof(name), "size-%Zd(DMA)",sizes->cs_size); + sizes->cs_dmacachep = kmem_cache_create(name, sizes->cs_size, 0, + SLAB_CACHE_DMA|SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!sizes->cs_dmacachep) + BUG(); + sizes++; + } while (sizes->cs_size); +} + +int __init kmem_cpucache_init(void) +{ +#ifdef CONFIG_SMP + g_cpucache_up = 1; + enable_all_cpucaches(); +#endif + return 0; +} + +__initcall(kmem_cpucache_init); + +/* Interface to system's page allocator. No need to hold the cache-lock. + */ +static inline void * kmem_getpages (kmem_cache_t *cachep, unsigned long flags) +{ + void *addr; + + /* + * If we requested dmaable memory, we will get it. Even if we + * did not request dmaable memory, we might get it, but that + * would be relatively rare and ignorable. + */ + flags |= cachep->gfpflags; + addr = (void*) __get_free_pages(flags, cachep->gfporder); + /* Assume that now we have the pages no one else can legally + * messes with the 'struct page's. + * However vm_scan() might try to test the structure to see if + * it is a named-page or buffer-page. The members it tests are + * of no interest here..... + */ + return addr; +} + +/* Interface to system's page release. */ +static inline void kmem_freepages (kmem_cache_t *cachep, void *addr) +{ + unsigned long i = (1<<cachep->gfporder); + struct page *page = virt_to_page(addr); + + /* free_pages() does not clear the type bit - we do that. + * The pages have been unlinked from their cache-slab, + * but their 'struct page's might be accessed in + * vm_scan(). Shouldn't be a worry. + */ + while (i--) { + PageClearSlab(page); + page++; + } + free_pages((unsigned long)addr, cachep->gfporder); +} + +#if DEBUG +static inline void kmem_poison_obj (kmem_cache_t *cachep, void *addr) +{ + int size = cachep->objsize; + if (cachep->flags & SLAB_RED_ZONE) { + addr += BYTES_PER_WORD; + size -= 2*BYTES_PER_WORD; + } + memset(addr, POISON_BYTE, size); + *(unsigned char *)(addr+size-1) = POISON_END; +} + +static inline int kmem_check_poison_obj (kmem_cache_t *cachep, void *addr) +{ + int size = cachep->objsize; + void *end; + if (cachep->flags & SLAB_RED_ZONE) { + addr += BYTES_PER_WORD; + size -= 2*BYTES_PER_WORD; + } + end = memchr(addr, POISON_END, size); + if (end != (addr+size-1)) + return 1; + return 0; +} +#endif + +/* Destroy all the objs in a slab, and release the mem back to the system. + * Before calling the slab must have been unlinked from the cache. + * The cache-lock is not held/needed. 
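The poisoning helpers just above fill a free object with POISON_BYTE and place POISON_END in its last byte; the check only verifies that the first POISON_END found is that final byte, which is cheap and still catches writes that clobber the end of the object or plant an early 0xa5. A user-space sketch of the same check, with an arbitrary 32-byte object:

#include <stdio.h>
#include <string.h>

#define POISON_BYTE 0x5a
#define POISON_END  0xa5

static void poison_obj(unsigned char *obj, size_t size)
{
	memset(obj, POISON_BYTE, size);
	obj[size - 1] = POISON_END;
}

/* nonzero if the pattern was disturbed, as kmem_check_poison_obj() reports */
static int check_poison_obj(const unsigned char *obj, size_t size)
{
	const unsigned char *end = memchr(obj, POISON_END, size);

	return end != obj + size - 1;
}

int main(void)
{
	unsigned char obj[32];

	poison_obj(obj, sizeof(obj));
	printf("untouched object corrupt? %d\n",
	       check_poison_obj(obj, sizeof(obj)));	/* 0 */

	obj[sizeof(obj) - 1] = 0;	/* simulate a stray write into freed memory */
	printf("scribbled object corrupt? %d\n",
	       check_poison_obj(obj, sizeof(obj)));	/* 1 */
	return 0;
}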
+ */ +static void kmem_slab_destroy (kmem_cache_t *cachep, slab_t *slabp) +{ + if (cachep->dtor +#if DEBUG + || cachep->flags & (SLAB_POISON | SLAB_RED_ZONE) +#endif + ) { + int i; + for (i = 0; i < cachep->num; i++) { + void* objp = slabp->s_mem+cachep->objsize*i; +#if DEBUG + if (cachep->flags & SLAB_RED_ZONE) { + if (*((unsigned long*)(objp)) != RED_MAGIC1) + BUG(); + if (*((unsigned long*)(objp + cachep->objsize + -BYTES_PER_WORD)) != RED_MAGIC1) + BUG(); + objp += BYTES_PER_WORD; + } +#endif + if (cachep->dtor) + (cachep->dtor)(objp, cachep, 0); +#if DEBUG + if (cachep->flags & SLAB_RED_ZONE) { + objp -= BYTES_PER_WORD; + } + if ((cachep->flags & SLAB_POISON) && + kmem_check_poison_obj(cachep, objp)) + BUG(); +#endif + } + } + + kmem_freepages(cachep, slabp->s_mem-slabp->colouroff); + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->slabp_cache, slabp); +} + +/** + * kmem_cache_create - Create a cache. + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @offset: The offset to use within the page. + * @flags: SLAB flags + * @ctor: A constructor for the objects. + * @dtor: A destructor for the objects. + * + * Returns a ptr to the cache on success, NULL on failure. + * Cannot be called within a int, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache + * and the @dtor is run before the pages are handed back. + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_NO_REAP - Don't automatically reap this cache when we're under + * memory pressure. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + */ +kmem_cache_t * +kmem_cache_create (const char *name, size_t size, size_t offset, + unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), + void (*dtor)(void*, kmem_cache_t *, unsigned long)) +{ + const char *func_nm = KERN_ERR "kmem_create: "; + size_t left_over, align, slab_size; + kmem_cache_t *cachep = NULL; + + /* + * Sanity checks... these are all serious usage bugs. + */ + if ((!name) || + ((strlen(name) >= CACHE_NAMELEN - 1)) || + in_interrupt() || + (size < BYTES_PER_WORD) || + (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) || + (dtor && !ctor) || + (offset < 0 || offset > size)) + BUG(); + +#if DEBUG + if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { + /* No constructor, but inital state check requested */ + printk("%sNo con, but init state check requested - %s\n", func_nm, name); + flags &= ~SLAB_DEBUG_INITIAL; + } + + if ((flags & SLAB_POISON) && ctor) { + /* request for poisoning, but we can't do that with a constructor */ + printk("%sPoisoning requested, but con given - %s\n", func_nm, name); + flags &= ~SLAB_POISON; + } +#if FORCED_DEBUG + if ((size < (PAGE_SIZE>>3)) && !(flags & SLAB_MUST_HWCACHE_ALIGN)) + /* + * do not red zone large object, causes severe + * fragmentation. + */ + flags |= SLAB_RED_ZONE; + if (!ctor) + flags |= SLAB_POISON; +#endif +#endif + + /* + * Always checks flags, a caller might be expecting debug + * support which isn't available. + */ + BUG_ON(flags & ~CREATE_MASK); + + /* Get cache's description obj. 
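For context, a minimal 2.4-era client of this interface looks roughly like the sketch below. The cache name, object type and module wrapping are hypothetical; only the kmem_cache_create()/kmem_cache_destroy() signatures, flags and return conventions are taken from this file.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/list.h>
#include <linux/slab.h>

/* hypothetical object type managed by a private cache */
struct foo {
	struct list_head list;
	int state;
};

static kmem_cache_t *foo_cachep;

/* runs once per object when kmem_cache_grow() populates a fresh slab */
static void foo_ctor(void *obj, kmem_cache_t *cachep, unsigned long flags)
{
	struct foo *f = obj;

	INIT_LIST_HEAD(&f->list);
	f->state = 0;
}

static int __init foo_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, foo_ctor, NULL);
	if (!foo_cachep)
		return -ENOMEM;
	return 0;
}

static void __exit foo_exit(void)
{
	/* every object must have been freed back to the cache by now */
	if (kmem_cache_destroy(foo_cachep))
		printk(KERN_ERR "foo: cache still had live objects\n");
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");

Objects would then be taken with kmem_cache_alloc(foo_cachep, SLAB_KERNEL) and returned with kmem_cache_free(foo_cachep, obj); the constructor is not rerun on each allocation.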
*/ + cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); + if (!cachep) + goto opps; + memset(cachep, 0, sizeof(kmem_cache_t)); + + /* Check that size is in terms of words. This is needed to avoid + * unaligned accesses for some archs when redzoning is used, and makes + * sure any on-slab bufctl's are also correctly aligned. + */ + if (size & (BYTES_PER_WORD-1)) { + size += (BYTES_PER_WORD-1); + size &= ~(BYTES_PER_WORD-1); + printk("%sForcing size word alignment - %s\n", func_nm, name); + } + +#if DEBUG + if (flags & SLAB_RED_ZONE) { + /* + * There is no point trying to honour cache alignment + * when redzoning. + */ + flags &= ~SLAB_HWCACHE_ALIGN; + size += 2*BYTES_PER_WORD; /* words for redzone */ + } +#endif + align = BYTES_PER_WORD; + if (flags & SLAB_HWCACHE_ALIGN) + align = L1_CACHE_BYTES; + + /* Determine if the slab management is 'on' or 'off' slab. */ + if (size >= (PAGE_SIZE>>3)) + /* + * Size is large, assume best to place the slab management obj + * off-slab (should allow better packing of objs). + */ + flags |= CFLGS_OFF_SLAB; + + if (flags & SLAB_HWCACHE_ALIGN) { + /* Need to adjust size so that objs are cache aligned. */ + /* Small obj size, can get at least two per cache line. */ + /* FIXME: only power of 2 supported, was better */ + while (size < align/2) + align /= 2; + size = (size+align-1)&(~(align-1)); + } + + /* Cal size (in pages) of slabs, and the num of objs per slab. + * This could be made much more intelligent. For now, try to avoid + * using high page-orders for slabs. When the gfp() funcs are more + * friendly towards high-order requests, this should be changed. + */ + do { + unsigned int break_flag = 0; +cal_wastage: + kmem_cache_estimate(cachep->gfporder, size, flags, + &left_over, &cachep->num); + if (break_flag) + break; + if (cachep->gfporder >= MAX_GFP_ORDER) + break; + if (!cachep->num) + goto next; + if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) { + /* Oops, this num of objs will cause problems. */ + cachep->gfporder--; + break_flag++; + goto cal_wastage; + } + + /* + * Large num of objs is good, but v. large slabs are currently + * bad for the gfp()s. + */ + if (cachep->gfporder >= slab_break_gfp_order) + break; + + if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder)) + break; /* Acceptable internal fragmentation. */ +next: + cachep->gfporder++; + } while (1); + + if (!cachep->num) { + printk("kmem_cache_create: couldn't create cache %s.\n", name); + kmem_cache_free(&cache_cache, cachep); + cachep = NULL; + goto opps; + } + slab_size = L1_CACHE_ALIGN(cachep->num*sizeof(kmem_bufctl_t)+sizeof(slab_t)); + + /* + * If the slab has been placed off-slab, and we have enough space then + * move it on-slab. This is at the expense of any extra colouring. + */ + if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { + flags &= ~CFLGS_OFF_SLAB; + left_over -= slab_size; + } + + /* Offset must be a multiple of the alignment. 
*/ + offset += (align-1); + offset &= ~(align-1); + if (!offset) + offset = L1_CACHE_BYTES; + cachep->colour_off = offset; + cachep->colour = left_over/offset; + + /* init remaining fields */ + if (!cachep->gfporder && !(flags & CFLGS_OFF_SLAB)) + flags |= CFLGS_OPTIMIZE; + + cachep->flags = flags; + cachep->gfpflags = 0; + if (flags & SLAB_CACHE_DMA) + cachep->gfpflags |= GFP_DMA; + spin_lock_init(&cachep->spinlock); + cachep->objsize = size; + INIT_LIST_HEAD(&cachep->slabs_full); + INIT_LIST_HEAD(&cachep->slabs_partial); + INIT_LIST_HEAD(&cachep->slabs_free); + + if (flags & CFLGS_OFF_SLAB) + cachep->slabp_cache = kmem_find_general_cachep(slab_size,0); + cachep->ctor = ctor; + cachep->dtor = dtor; + /* Copy name over so we don't have problems with unloaded modules */ + strcpy(cachep->name, name); + +#ifdef CONFIG_SMP + if (g_cpucache_up) + enable_cpucache(cachep); +#endif + /* Need the semaphore to access the chain. */ + down(&cache_chain_sem); + { + struct list_head *p; + + list_for_each(p, &cache_chain) { + kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); + + /* The name field is constant - no lock needed. */ + if (!strcmp(pc->name, name)) + BUG(); + } + } + + /* There is no reason to lock our new cache before we + * link it in - no one knows about it yet... + */ + list_add(&cachep->next, &cache_chain); + up(&cache_chain_sem); +opps: + return cachep; +} + + +#if DEBUG +/* + * This check if the kmem_cache_t pointer is chained in the cache_cache + * list. -arca + */ +static int is_chained_kmem_cache(kmem_cache_t * cachep) +{ + struct list_head *p; + int ret = 0; + + /* Find the cache in the chain of caches. */ + down(&cache_chain_sem); + list_for_each(p, &cache_chain) { + if (p == &cachep->next) { + ret = 1; + break; + } + } + up(&cache_chain_sem); + + return ret; +} +#else +#define is_chained_kmem_cache(x) 1 +#endif + +#ifdef CONFIG_SMP +/* + * Waits for all CPUs to execute func(). + */ +static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) +{ + local_irq_disable(); + func(arg); + local_irq_enable(); + + if (smp_call_function(func, arg, 1, 1)) + BUG(); +} +typedef struct ccupdate_struct_s +{ + kmem_cache_t *cachep; + cpucache_t *new[NR_CPUS]; +} ccupdate_struct_t; + +static void do_ccupdate_local(void *info) +{ + ccupdate_struct_t *new = (ccupdate_struct_t *)info; + cpucache_t *old = cc_data(new->cachep); + + cc_data(new->cachep) = new->new[smp_processor_id()]; + new->new[smp_processor_id()] = old; +} + +static void free_block (kmem_cache_t* cachep, void** objpp, int len); + +static void drain_cpu_caches(kmem_cache_t *cachep) +{ + ccupdate_struct_t new; + int i; + + memset(&new.new,0,sizeof(new.new)); + + new.cachep = cachep; + + down(&cache_chain_sem); + smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); + + for (i = 0; i < smp_num_cpus; i++) { + cpucache_t* ccold = new.new[cpu_logical_map(i)]; + if (!ccold || (ccold->avail == 0)) + continue; + local_irq_disable(); + free_block(cachep, cc_entry(ccold), ccold->avail); + local_irq_enable(); + ccold->avail = 0; + } + smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); + up(&cache_chain_sem); +} + +#else +#define drain_cpu_caches(cachep) do { } while (0) +#endif + +/* + * Called with the &cachep->spinlock held, returns number of slabs released + */ +static int __kmem_cache_shrink_locked(kmem_cache_t *cachep) +{ + slab_t *slabp; + int ret = 0; + + /* If the cache is growing, stop shrinking. 
*/ + while (!cachep->growing) { + struct list_head *p; + + p = cachep->slabs_free.prev; + if (p == &cachep->slabs_free) + break; + + slabp = list_entry(cachep->slabs_free.prev, slab_t, list); +#if DEBUG + if (slabp->inuse) + BUG(); +#endif + list_del(&slabp->list); + + spin_unlock_irq(&cachep->spinlock); + kmem_slab_destroy(cachep, slabp); + ret++; + spin_lock_irq(&cachep->spinlock); + } + return ret; +} + +static int __kmem_cache_shrink(kmem_cache_t *cachep) +{ + int ret; + + drain_cpu_caches(cachep); + + spin_lock_irq(&cachep->spinlock); + __kmem_cache_shrink_locked(cachep); + ret = !list_empty(&cachep->slabs_full) || + !list_empty(&cachep->slabs_partial); + spin_unlock_irq(&cachep->spinlock); + return ret; +} + +/** + * kmem_cache_shrink - Shrink a cache. + * @cachep: The cache to shrink. + * + * Releases as many slabs as possible for a cache. + * Returns number of pages released. + */ +int kmem_cache_shrink(kmem_cache_t *cachep) +{ + int ret; + + if (!cachep || in_interrupt() || !is_chained_kmem_cache(cachep)) + BUG(); + + spin_lock_irq(&cachep->spinlock); + ret = __kmem_cache_shrink_locked(cachep); + spin_unlock_irq(&cachep->spinlock); + + return ret << cachep->gfporder; +} + +/** + * kmem_cache_destroy - delete a cache + * @cachep: the cache to destroy + * + * Remove a kmem_cache_t object from the slab cache. + * Returns 0 on success. + * + * It is expected this function will be called by a module when it is + * unloaded. This will remove the cache completely, and avoid a duplicate + * cache being allocated each time a module is loaded and unloaded, if the + * module doesn't have persistent in-kernel storage across loads and unloads. + * + * The cache must be empty before calling this function. + * + * The caller must guarantee that noone will allocate memory from the cache + * during the kmem_cache_destroy(). + */ +int kmem_cache_destroy (kmem_cache_t * cachep) +{ + if (!cachep || in_interrupt() || cachep->growing) + BUG(); + + /* Find the cache in the chain of caches. */ + down(&cache_chain_sem); + /* the chain is never empty, cache_cache is never destroyed */ + if (clock_searchp == cachep) + clock_searchp = list_entry(cachep->next.next, + kmem_cache_t, next); + list_del(&cachep->next); + up(&cache_chain_sem); + + if (__kmem_cache_shrink(cachep)) { + printk(KERN_ERR "kmem_cache_destroy: Can't free all objects %p\n", + cachep); + down(&cache_chain_sem); + list_add(&cachep->next,&cache_chain); + up(&cache_chain_sem); + return 1; + } +#ifdef CONFIG_SMP + { + int i; + for (i = 0; i < NR_CPUS; i++) + kfree(cachep->cpudata[i]); + } +#endif + kmem_cache_free(&cache_cache, cachep); + + return 0; +} + +/* Get the memory for a slab management obj. */ +static inline slab_t * kmem_cache_slabmgmt (kmem_cache_t *cachep, + void *objp, int colour_off, int local_flags) +{ + slab_t *slabp; + + if (OFF_SLAB(cachep)) { + /* Slab management obj is off-slab. 
*/ + slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); + if (!slabp) + return NULL; + } else { + /* FIXME: change to + slabp = objp + * if you enable OPTIMIZE + */ + slabp = objp+colour_off; + colour_off += L1_CACHE_ALIGN(cachep->num * + sizeof(kmem_bufctl_t) + sizeof(slab_t)); + } + slabp->inuse = 0; + slabp->colouroff = colour_off; + slabp->s_mem = objp+colour_off; + + return slabp; +} + +static inline void kmem_cache_init_objs (kmem_cache_t * cachep, + slab_t * slabp, unsigned long ctor_flags) +{ + int i; + + for (i = 0; i < cachep->num; i++) { + void* objp = slabp->s_mem+cachep->objsize*i; +#if DEBUG + if (cachep->flags & SLAB_RED_ZONE) { + *((unsigned long*)(objp)) = RED_MAGIC1; + *((unsigned long*)(objp + cachep->objsize - + BYTES_PER_WORD)) = RED_MAGIC1; + objp += BYTES_PER_WORD; + } +#endif + + /* + * Constructors are not allowed to allocate memory from + * the same cache which they are a constructor for. + * Otherwise, deadlock. They must also be threaded. + */ + if (cachep->ctor) + cachep->ctor(objp, cachep, ctor_flags); +#if DEBUG + if (cachep->flags & SLAB_RED_ZONE) + objp -= BYTES_PER_WORD; + if (cachep->flags & SLAB_POISON) + /* need to poison the objs */ + kmem_poison_obj(cachep, objp); + if (cachep->flags & SLAB_RED_ZONE) { + if (*((unsigned long*)(objp)) != RED_MAGIC1) + BUG(); + if (*((unsigned long*)(objp + cachep->objsize - + BYTES_PER_WORD)) != RED_MAGIC1) + BUG(); + } +#endif + slab_bufctl(slabp)[i] = i+1; + } + slab_bufctl(slabp)[i-1] = BUFCTL_END; + slabp->free = 0; +} + +/* + * Grow (by 1) the number of slabs within a cache. This is called by + * kmem_cache_alloc() when there are no active objs left in a cache. + */ +static int kmem_cache_grow (kmem_cache_t * cachep, int flags) +{ + slab_t *slabp; + struct page *page; + void *objp; + size_t offset; + unsigned int i, local_flags; + unsigned long ctor_flags; + unsigned long save_flags; + + /* Be lazy and only check for valid flags here, + * keeping it out of the critical path in kmem_cache_alloc(). + */ + if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) + BUG(); + if (flags & SLAB_NO_GROW) + return 0; + + /* + * The test for missing atomic flag is performed here, rather than + * the more obvious place, simply to reduce the critical path length + * in kmem_cache_alloc(). If a caller is seriously mis-behaving they + * will eventually be caught here (where it matters). + */ + if (in_interrupt() && (flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC) + BUG(); + + ctor_flags = SLAB_CTOR_CONSTRUCTOR; + local_flags = (flags & SLAB_LEVEL_MASK); + if (local_flags == SLAB_ATOMIC) + /* + * Not allowed to sleep. Need to tell a constructor about + * this - it might need to know... + */ + ctor_flags |= SLAB_CTOR_ATOMIC; + + /* About to mess with non-constant members - lock. */ + spin_lock_irqsave(&cachep->spinlock, save_flags); + + /* Get colour for the slab, and cal the next value. */ + offset = cachep->colour_next; + cachep->colour_next++; + if (cachep->colour_next >= cachep->colour) + cachep->colour_next = 0; + offset *= cachep->colour_off; + cachep->dflags |= DFLGS_GROWN; + + cachep->growing++; + spin_unlock_irqrestore(&cachep->spinlock, save_flags); + + /* A series of memory allocations for a new slab. + * Neither the cache-chain semaphore, or cache-lock, are + * held, but the incrementing c_growing prevents this + * cache from being reaped or shrunk. + * Note: The cache could be selected in for reaping in + * kmem_cache_reap(), but when the final test is made the + * growing value will be seen. 
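The colour handling a few lines above is the only consumer of the left-over space computed at create time: each new slab places its first object colour_next * colour_off bytes into the slab, and colour_next wraps after cachep->colour slabs, so objects of consecutive slabs are staggered across different cache lines instead of all competing for the same ones. With the assumed figures from the earlier estimate example (160 spare bytes, 32-byte lines, hence colour = 5), the cycle looks like this:

#include <stdio.h>

int main(void)
{
	/* assumed example values: colour = 160 / 32 = 5, colour_off = 32 */
	unsigned int colour = 5, colour_off = 32, colour_next = 0;
	unsigned int slab;

	for (slab = 0; slab < 7; slab++) {
		unsigned int offset = colour_next * colour_off;

		/* same wrap-around as in kmem_cache_grow() */
		if (++colour_next >= colour)
			colour_next = 0;
		printf("slab %u: objects start %u bytes into the slab\n",
		       slab, offset);	/* 0, 32, 64, 96, 128, 0, 32 */
	}
	return 0;
}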
+ */ + + /* Get mem for the objs. */ + if (!(objp = kmem_getpages(cachep, flags))) + goto failed; + + /* Get slab management. */ + if (!(slabp = kmem_cache_slabmgmt(cachep, objp, offset, local_flags))) + goto opps1; + + /* Nasty!!!!!! I hope this is OK. */ + i = 1 << cachep->gfporder; + page = virt_to_page(objp); + do { + SET_PAGE_CACHE(page, cachep); + SET_PAGE_SLAB(page, slabp); + PageSetSlab(page); + page++; + } while (--i); + + kmem_cache_init_objs(cachep, slabp, ctor_flags); + + spin_lock_irqsave(&cachep->spinlock, save_flags); + cachep->growing--; + + /* Make slab active. */ + list_add_tail(&slabp->list, &cachep->slabs_free); + STATS_INC_GROWN(cachep); + cachep->failures = 0; + + spin_unlock_irqrestore(&cachep->spinlock, save_flags); + return 1; +opps1: + kmem_freepages(cachep, objp); +failed: + spin_lock_irqsave(&cachep->spinlock, save_flags); + cachep->growing--; + spin_unlock_irqrestore(&cachep->spinlock, save_flags); + return 0; +} + +/* + * Perform extra freeing checks: + * - detect double free + * - detect bad pointers. + * Called with the cache-lock held. + */ + +#if DEBUG +static int kmem_extra_free_checks (kmem_cache_t * cachep, + slab_t *slabp, void * objp) +{ + int i; + unsigned int objnr = (objp-slabp->s_mem)/cachep->objsize; + + if (objnr >= cachep->num) + BUG(); + if (objp != slabp->s_mem + objnr*cachep->objsize) + BUG(); + + /* Check slab's freelist to see if this obj is there. */ + for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { + if (i == objnr) + BUG(); + } + return 0; +} +#endif + +static inline void kmem_cache_alloc_head(kmem_cache_t *cachep, int flags) +{ + if (flags & SLAB_DMA) { + if (!(cachep->gfpflags & GFP_DMA)) + BUG(); + } else { + if (cachep->gfpflags & GFP_DMA) + BUG(); + } +} + +static inline void * kmem_cache_alloc_one_tail (kmem_cache_t *cachep, + slab_t *slabp) +{ + void *objp; + + STATS_INC_ALLOCED(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + /* get obj pointer */ + slabp->inuse++; + objp = slabp->s_mem + slabp->free*cachep->objsize; + slabp->free=slab_bufctl(slabp)[slabp->free]; + + if (unlikely(slabp->free == BUFCTL_END)) { + list_del(&slabp->list); + list_add(&slabp->list, &cachep->slabs_full); + } +#if DEBUG + if (cachep->flags & SLAB_POISON) + if (kmem_check_poison_obj(cachep, objp)) + BUG(); + if (cachep->flags & SLAB_RED_ZONE) { + /* Set alloc red-zone, and check old one. */ + if (xchg((unsigned long *)objp, RED_MAGIC2) != + RED_MAGIC1) + BUG(); + if (xchg((unsigned long *)(objp+cachep->objsize - + BYTES_PER_WORD), RED_MAGIC2) != RED_MAGIC1) + BUG(); + objp += BYTES_PER_WORD; + } +#endif + return objp; +} + +/* + * Returns a ptr to an obj in the given cache. 
+ * caller must guarantee synchronization + * #define for the goto optimization 8-) + */ +#define kmem_cache_alloc_one(cachep) \ +({ \ + struct list_head * slabs_partial, * entry; \ + slab_t *slabp; \ + \ + slabs_partial = &(cachep)->slabs_partial; \ + entry = slabs_partial->next; \ + if (unlikely(entry == slabs_partial)) { \ + struct list_head * slabs_free; \ + slabs_free = &(cachep)->slabs_free; \ + entry = slabs_free->next; \ + if (unlikely(entry == slabs_free)) \ + goto alloc_new_slab; \ + list_del(entry); \ + list_add(entry, slabs_partial); \ + } \ + \ + slabp = list_entry(entry, slab_t, list); \ + kmem_cache_alloc_one_tail(cachep, slabp); \ +}) + +#ifdef CONFIG_SMP +void* kmem_cache_alloc_batch(kmem_cache_t* cachep, cpucache_t* cc, int flags) +{ + int batchcount = cachep->batchcount; + + spin_lock(&cachep->spinlock); + while (batchcount--) { + struct list_head * slabs_partial, * entry; + slab_t *slabp; + /* Get slab alloc is to come from. */ + slabs_partial = &(cachep)->slabs_partial; + entry = slabs_partial->next; + if (unlikely(entry == slabs_partial)) { + struct list_head * slabs_free; + slabs_free = &(cachep)->slabs_free; + entry = slabs_free->next; + if (unlikely(entry == slabs_free)) + break; + list_del(entry); + list_add(entry, slabs_partial); + } + + slabp = list_entry(entry, slab_t, list); + cc_entry(cc)[cc->avail++] = + kmem_cache_alloc_one_tail(cachep, slabp); + } + spin_unlock(&cachep->spinlock); + + if (cc->avail) + return cc_entry(cc)[--cc->avail]; + return NULL; +} +#endif + +static inline void * __kmem_cache_alloc (kmem_cache_t *cachep, int flags) +{ + unsigned long save_flags; + void* objp; + + kmem_cache_alloc_head(cachep, flags); +try_again: + local_irq_save(save_flags); +#ifdef CONFIG_SMP + { + cpucache_t *cc = cc_data(cachep); + + if (cc) { + if (cc->avail) { + STATS_INC_ALLOCHIT(cachep); + objp = cc_entry(cc)[--cc->avail]; + } else { + STATS_INC_ALLOCMISS(cachep); + objp = kmem_cache_alloc_batch(cachep,cc,flags); + if (!objp) + goto alloc_new_slab_nolock; + } + } else { + spin_lock(&cachep->spinlock); + objp = kmem_cache_alloc_one(cachep); + spin_unlock(&cachep->spinlock); + } + } +#else + objp = kmem_cache_alloc_one(cachep); +#endif + local_irq_restore(save_flags); + return objp; +alloc_new_slab: +#ifdef CONFIG_SMP + spin_unlock(&cachep->spinlock); +alloc_new_slab_nolock: +#endif + local_irq_restore(save_flags); + if (kmem_cache_grow(cachep, flags)) + /* Someone may have stolen our objs. Doesn't matter, we'll + * just come back here again. + */ + goto try_again; + return NULL; +} + +/* + * Release an obj back to its cache. If the obj has a constructed + * state, it should be in this state _before_ it is released. 
+ * - caller is responsible for the synchronization + */ + +#if DEBUG +# define CHECK_NR(pg) \ + do { \ + if (!VALID_PAGE(pg)) { \ + printk(KERN_ERR "kfree: out of range ptr %lxh.\n", \ + (unsigned long)objp); \ + BUG(); \ + } \ + } while (0) +# define CHECK_PAGE(page) \ + do { \ + CHECK_NR(page); \ + if (!PageSlab(page)) { \ + printk(KERN_ERR "kfree: bad ptr %lxh.\n", \ + (unsigned long)objp); \ + BUG(); \ + } \ + } while (0) + +#else +# define CHECK_PAGE(pg) do { } while (0) +#endif + +static inline void kmem_cache_free_one(kmem_cache_t *cachep, void *objp) +{ + slab_t* slabp; + + CHECK_PAGE(virt_to_page(objp)); + /* reduces memory footprint + * + if (OPTIMIZE(cachep)) + slabp = (void*)((unsigned long)objp&(~(PAGE_SIZE-1))); + else + */ + slabp = GET_PAGE_SLAB(virt_to_page(objp)); + +#if DEBUG + if (cachep->flags & SLAB_DEBUG_INITIAL) + /* Need to call the slab's constructor so the + * caller can perform a verify of its state (debugging). + * Called without the cache-lock held. + */ + cachep->ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); + + if (cachep->flags & SLAB_RED_ZONE) { + objp -= BYTES_PER_WORD; + if (xchg((unsigned long *)objp, RED_MAGIC1) != RED_MAGIC2) + /* Either write before start, or a double free. */ + BUG(); + if (xchg((unsigned long *)(objp+cachep->objsize - + BYTES_PER_WORD), RED_MAGIC1) != RED_MAGIC2) + /* Either write past end, or a double free. */ + BUG(); + } + if (cachep->flags & SLAB_POISON) + kmem_poison_obj(cachep, objp); + if (kmem_extra_free_checks(cachep, slabp, objp)) + return; +#endif + { + unsigned int objnr = (objp-slabp->s_mem)/cachep->objsize; + + slab_bufctl(slabp)[objnr] = slabp->free; + slabp->free = objnr; + } + STATS_DEC_ACTIVE(cachep); + + /* fixup slab chains */ + { + int inuse = slabp->inuse; + if (unlikely(!--slabp->inuse)) { + /* Was partial or full, now empty. */ + list_del(&slabp->list); + list_add(&slabp->list, &cachep->slabs_free); + } else if (unlikely(inuse == cachep->num)) { + /* Was full. */ + list_del(&slabp->list); + list_add(&slabp->list, &cachep->slabs_partial); + } + } +} + +#ifdef CONFIG_SMP +static inline void __free_block (kmem_cache_t* cachep, + void** objpp, int len) +{ + for ( ; len > 0; len--, objpp++) + kmem_cache_free_one(cachep, *objpp); +} + +static void free_block (kmem_cache_t* cachep, void** objpp, int len) +{ + spin_lock(&cachep->spinlock); + __free_block(cachep, objpp, len); + spin_unlock(&cachep->spinlock); +} +#endif + +/* + * __kmem_cache_free + * called with disabled ints + */ +static inline void __kmem_cache_free (kmem_cache_t *cachep, void* objp) +{ +#ifdef CONFIG_SMP + cpucache_t *cc = cc_data(cachep); + + CHECK_PAGE(virt_to_page(objp)); + if (cc) { + int batchcount; + if (cc->avail < cc->limit) { + STATS_INC_FREEHIT(cachep); + cc_entry(cc)[cc->avail++] = objp; + return; + } + STATS_INC_FREEMISS(cachep); + batchcount = cachep->batchcount; + cc->avail -= batchcount; + free_block(cachep, + &cc_entry(cc)[cc->avail],batchcount); + cc_entry(cc)[cc->avail++] = objp; + return; + } else { + free_block(cachep, &objp, 1); + } +#else + kmem_cache_free_one(cachep, objp); +#endif +} + +/** + * kmem_cache_alloc - Allocate an object + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * + * Allocate an object from this cache. The flags are only relevant + * if the cache has no available objects. + */ +void * kmem_cache_alloc (kmem_cache_t *cachep, int flags) +{ + return __kmem_cache_alloc(cachep, flags); +} + +/** + * kmalloc - allocate memory + * @size: how many bytes of memory are required. 
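On SMP the free side above is where the per-CPU head array described at the top of the file earns its keep: while cc->avail is below cc->limit a free is just a pointer store with local interrupts disabled, and only a full array makes free_block() take the cache spinlock to return one batch of objects to the slab lists. A user-space sketch of that overflow handling, with made-up limit and batchcount values:

#include <stdio.h>

#define LIMIT		4	/* stands in for cc->limit          */
#define BATCHCOUNT	2	/* stands in for cachep->batchcount */

static void *cc_entries[LIMIT];		/* stands in for cc_entry(cc) */
static unsigned int cc_avail;		/* stands in for cc->avail    */

/* stand-in for free_block(): give objects back to the slab lists */
static void flush_to_slabs(void **objpp, int len)
{
	printf("flushing %d objects back to the slab lists\n", len);
}

/* __kmem_cache_free() in miniature */
static void cache_free(void *obj)
{
	if (cc_avail < LIMIT) {
		cc_entries[cc_avail++] = obj;	/* fast path, no lock needed */
		return;
	}
	/* array full: hand one batch back, then stash the freed object */
	cc_avail -= BATCHCOUNT;
	flush_to_slabs(&cc_entries[cc_avail], BATCHCOUNT);
	cc_entries[cc_avail++] = obj;
}

int main(void)
{
	int dummy[6];
	int i;

	for (i = 0; i < 6; i++)
		cache_free(&dummy[i]);	/* the fifth free triggers one flush */
	return 0;
}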
+ * @flags: the type of memory to allocate. + * + * kmalloc is the normal method of allocating memory + * in the kernel. + * + * The @flags argument may be one of: + * + * %GFP_USER - Allocate memory on behalf of user. May sleep. + * + * %GFP_KERNEL - Allocate normal kernel ram. May sleep. + * + * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. + * + * Additionally, the %GFP_DMA flag may be set to indicate the memory + * must be suitable for DMA. This can mean different things on different + * platforms. For example, on i386, it means that the memory must come + * from the first 16MB. + */ +void * kmalloc (size_t size, int flags) +{ + cache_sizes_t *csizep = cache_sizes; + + for (; csizep->cs_size; csizep++) { + if (size > csizep->cs_size) + continue; + return __kmem_cache_alloc(flags & GFP_DMA ? + csizep->cs_dmacachep : csizep->cs_cachep, flags); + } + return NULL; +} + +/** + * kmem_cache_free - Deallocate an object + * @cachep: The cache the allocation was from. + * @objp: The previously allocated object. + * + * Free an object which was previously allocated from this + * cache. + */ +void kmem_cache_free (kmem_cache_t *cachep, void *objp) +{ + unsigned long flags; +#if DEBUG + CHECK_PAGE(virt_to_page(objp)); + if (cachep != GET_PAGE_CACHE(virt_to_page(objp))) + BUG(); +#endif + + local_irq_save(flags); + __kmem_cache_free(cachep, objp); + local_irq_restore(flags); +} + +/** + * kfree - free previously allocated memory + * @objp: pointer returned by kmalloc. + * + * Don't free memory not originally allocated by kmalloc() + * or you will run into trouble. + */ +void kfree (const void *objp) +{ + kmem_cache_t *c; + unsigned long flags; + + if (!objp) + return; + local_irq_save(flags); + CHECK_PAGE(virt_to_page(objp)); + c = GET_PAGE_CACHE(virt_to_page(objp)); + __kmem_cache_free(c, (void*)objp); + local_irq_restore(flags); +} + +unsigned int kmem_cache_size(kmem_cache_t *cachep) +{ +#if DEBUG + if (cachep->flags & SLAB_RED_ZONE) + return (cachep->objsize - 2*BYTES_PER_WORD); +#endif + return cachep->objsize; +} + +kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags) +{ + cache_sizes_t *csizep = cache_sizes; + + /* This function could be moved to the header file, and + * made inline so consumers can quickly determine what + * cache pointer they require. + */ + for ( ; csizep->cs_size; csizep++) { + if (size > csizep->cs_size) + continue; + break; + } + return (gfpflags & GFP_DMA) ? csizep->cs_dmacachep : csizep->cs_cachep; +} + +#ifdef CONFIG_SMP + +/* called with cache_chain_sem acquired. */ +static int kmem_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount) +{ + ccupdate_struct_t new; + int i; + + /* + * These are admin-provided, so we are more graceful. 
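kmalloc() is thus only a thin front end that picks the first general cache whose cs_size covers the request: a 100-byte allocation is served from the size-128 cache (size-128(DMA) when GFP_DMA is set), and anything beyond the last cache_sizes[] entry of 131072 bytes simply returns NULL. A hypothetical helper showing typical use of the kmalloc()/kfree() pair:

#include <linux/slab.h>
#include <linux/string.h>

/*
 * buf_dup - duplicate a buffer into a kmalloc()ed copy (illustrative helper,
 * not defined in this file).  A len of 100 would come out of the size-128
 * general cache, leaving 28 bytes of slack.
 */
static void *buf_dup(const void *src, size_t len, int gfp)
{
	void *p = kmalloc(len, gfp);	/* gfp is e.g. GFP_KERNEL, or GFP_ATOMIC
					 * when the caller may not sleep */

	if (p)
		memcpy(p, src, len);
	return p;	/* caller eventually does kfree(p) */
}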
+ */ + if (limit < 0) + return -EINVAL; + if (batchcount < 0) + return -EINVAL; + if (batchcount > limit) + return -EINVAL; + if (limit != 0 && !batchcount) + return -EINVAL; + + memset(&new.new,0,sizeof(new.new)); + if (limit) { + for (i = 0; i< smp_num_cpus; i++) { + cpucache_t* ccnew; + + ccnew = kmalloc(sizeof(void*)*limit+ + sizeof(cpucache_t), GFP_KERNEL); + if (!ccnew) + goto oom; + ccnew->limit = limit; + ccnew->avail = 0; + new.new[cpu_logical_map(i)] = ccnew; + } + } + new.cachep = cachep; + spin_lock_irq(&cachep->spinlock); + cachep->batchcount = batchcount; + spin_unlock_irq(&cachep->spinlock); + + smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); + + for (i = 0; i < smp_num_cpus; i++) { + cpucache_t* ccold = new.new[cpu_logical_map(i)]; + if (!ccold) + continue; + local_irq_disable(); + free_block(cachep, cc_entry(ccold), ccold->avail); + local_irq_enable(); + kfree(ccold); + } + return 0; +oom: + for (i--; i >= 0; i--) + kfree(new.new[cpu_logical_map(i)]); + return -ENOMEM; +} + +static void enable_cpucache (kmem_cache_t *cachep) +{ + int err; + int limit; + + /* FIXME: optimize */ + if (cachep->objsize > PAGE_SIZE) + return; + if (cachep->objsize > 1024) + limit = 60; + else if (cachep->objsize > 256) + limit = 124; + else + limit = 252; + + err = kmem_tune_cpucache(cachep, limit, limit/2); + if (err) + printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", + cachep->name, -err); +} + +static void enable_all_cpucaches (void) +{ + struct list_head* p; + + down(&cache_chain_sem); + + p = &cache_cache.next; + do { + kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next); + + enable_cpucache(cachep); + p = cachep->next.next; + } while (p != &cache_cache.next); + + up(&cache_chain_sem); +} +#endif + +/** + * kmem_cache_reap - Reclaim memory from caches. + * @gfp_mask: the type of memory required. + * + * Called from do_try_to_free_pages() and __alloc_pages() + */ +int fastcall kmem_cache_reap (int gfp_mask) +{ + slab_t *slabp; + kmem_cache_t *searchp; + kmem_cache_t *best_cachep; + unsigned int best_pages; + unsigned int best_len; + unsigned int scan; + int ret = 0; + + if (gfp_mask & __GFP_WAIT) + down(&cache_chain_sem); + else + if (down_trylock(&cache_chain_sem)) + return 0; + + scan = REAP_SCANLEN; + best_len = 0; + best_pages = 0; + best_cachep = NULL; + searchp = clock_searchp; + do { + unsigned int pages; + struct list_head* p; + unsigned int full_free; + + /* It's safe to test this without holding the cache-lock. */ + if (searchp->flags & SLAB_NO_REAP) + goto next; + spin_lock_irq(&searchp->spinlock); + if (searchp->growing) + goto next_unlock; + if (searchp->dflags & DFLGS_GROWN) { + searchp->dflags &= ~DFLGS_GROWN; + goto next_unlock; + } +#ifdef CONFIG_SMP + { + cpucache_t *cc = cc_data(searchp); + if (cc && cc->avail) { + __free_block(searchp, cc_entry(cc), cc->avail); + cc->avail = 0; + } + } +#endif + + full_free = 0; + p = searchp->slabs_free.next; + while (p != &searchp->slabs_free) { +#if DEBUG + slabp = list_entry(p, slab_t, list); + + if (slabp->inuse) + BUG(); +#endif + full_free++; + p = p->next; + } + + /* + * Try to avoid slabs with constructors and/or + * more than one page per slab (as it can be difficult + * to get high orders from gfp()). 
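As a worked example of that weighting: a candidate cache with 10 completely free slabs at gfporder 1 starts from 10 * 2 = 20 reclaimable pages; having a constructor scales this to (20*4+1)/5 = 16 and the non-zero order scales it again to (16*4+1)/5 = 13, so the cache competes against the other scanned caches at a weight of 13 pages. Any candidate reaching REAP_PERFECT (10) pages ends the scan immediately, and the winner then has roughly half of its free slabs destroyed.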
+ */ + pages = full_free * (1<<searchp->gfporder); + if (searchp->ctor) + pages = (pages*4+1)/5; + if (searchp->gfporder) + pages = (pages*4+1)/5; + if (pages > best_pages) { + best_cachep = searchp; + best_len = full_free; + best_pages = pages; + if (pages >= REAP_PERFECT) { + clock_searchp = list_entry(searchp->next.next, + kmem_cache_t,next); + goto perfect; + } + } +next_unlock: + spin_unlock_irq(&searchp->spinlock); +next: + searchp = list_entry(searchp->next.next,kmem_cache_t,next); + } while (--scan && searchp != clock_searchp); + + clock_searchp = searchp; + + if (!best_cachep) + /* couldn't find anything to reap */ + goto out; + + spin_lock_irq(&best_cachep->spinlock); +perfect: + /* free only 50% of the free slabs */ + best_len = (best_len + 1)/2; + for (scan = 0; scan < best_len; scan++) { + struct list_head *p; + + if (best_cachep->growing) + break; + p = best_cachep->slabs_free.prev; + if (p == &best_cachep->slabs_free) + break; + slabp = list_entry(p,slab_t,list); +#if DEBUG + if (slabp->inuse) + BUG(); +#endif + list_del(&slabp->list); + STATS_INC_REAPED(best_cachep); + + /* Safe to drop the lock. The slab is no longer linked to the + * cache. + */ + spin_unlock_irq(&best_cachep->spinlock); + kmem_slab_destroy(best_cachep, slabp); + spin_lock_irq(&best_cachep->spinlock); + } + spin_unlock_irq(&best_cachep->spinlock); + ret = scan * (1 << best_cachep->gfporder); +out: + up(&cache_chain_sem); + return ret; +} + +#ifdef CONFIG_PROC_FS + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + struct list_head *p; + + down(&cache_chain_sem); + if (!n) + return (void *)1; + p = &cache_cache.next; + while (--n) { + p = p->next; + if (p == &cache_cache.next) + return NULL; + } + return list_entry(p, kmem_cache_t, next); +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + kmem_cache_t *cachep = p; + ++*pos; + if (p == (void *)1) + return &cache_cache; + cachep = list_entry(cachep->next.next, kmem_cache_t, next); + return cachep == &cache_cache ? NULL : cachep; +} + +static void s_stop(struct seq_file *m, void *p) +{ + up(&cache_chain_sem); +} + +static int s_show(struct seq_file *m, void *p) +{ + kmem_cache_t *cachep = p; + struct list_head *q; + slab_t *slabp; + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs = 0; + unsigned long num_slabs; + const char *name; + + if (p == (void*)1) { + /* + * Output format version, so at least we can change it + * without _too_ many complaints. 
+ */ + seq_puts(m, "slabinfo - version: 1.1" +#if STATS + " (statistics)" +#endif +#ifdef CONFIG_SMP + " (SMP)" +#endif + "\n"); + return 0; + } + + spin_lock_irq(&cachep->spinlock); + active_objs = 0; + num_slabs = 0; + list_for_each(q,&cachep->slabs_full) { + slabp = list_entry(q, slab_t, list); + if (slabp->inuse != cachep->num) + BUG(); + active_objs += cachep->num; + active_slabs++; + } + list_for_each(q,&cachep->slabs_partial) { + slabp = list_entry(q, slab_t, list); + if (slabp->inuse == cachep->num || !slabp->inuse) + BUG(); + active_objs += slabp->inuse; + active_slabs++; + } + list_for_each(q,&cachep->slabs_free) { + slabp = list_entry(q, slab_t, list); + if (slabp->inuse) + BUG(); + num_slabs++; + } + num_slabs+=active_slabs; + num_objs = num_slabs*cachep->num; + + name = cachep->name; + { + char tmp; + mm_segment_t old_fs; + old_fs = get_fs(); + set_fs(KERNEL_DS); + if (__get_user(tmp, name)) + name = "broken"; + set_fs(old_fs); + } + + seq_printf(m, "%-17s %6lu %6lu %6u %4lu %4lu %4u", + name, active_objs, num_objs, cachep->objsize, + active_slabs, num_slabs, (1<<cachep->gfporder)); + +#if STATS + { + unsigned long errors = cachep->errors; + unsigned long high = cachep->high_mark; + unsigned long grown = cachep->grown; + unsigned long reaped = cachep->reaped; + unsigned long allocs = cachep->num_allocations; + + seq_printf(m, " : %6lu %7lu %5lu %4lu %4lu", + high, allocs, grown, reaped, errors); + } +#endif +#ifdef CONFIG_SMP + { + cpucache_t *cc = cc_data(cachep); + unsigned int batchcount = cachep->batchcount; + unsigned int limit; + + if (cc) + limit = cc->limit; + else + limit = 0; + seq_printf(m, " : %4u %4u", + limit, batchcount); + } +#endif +#if STATS && defined(CONFIG_SMP) + { + unsigned long allochit = atomic_read(&cachep->allochit); + unsigned long allocmiss = atomic_read(&cachep->allocmiss); + unsigned long freehit = atomic_read(&cachep->freehit); + unsigned long freemiss = atomic_read(&cachep->freemiss); + seq_printf(m, " : %6lu %6lu %6lu %6lu", + allochit, allocmiss, freehit, freemiss); + } +#endif + spin_unlock_irq(&cachep->spinlock); + seq_putc(m, '\n'); + return 0; +} + +/** + * slabinfo_op - iterator that generates /proc/slabinfo + * + * Output layout: + * cache-name + * num-active-objs + * total-objs + * object size + * num-active-slabs + * total-slabs + * num-pages-per-slab + * + further values on SMP and with statistics enabled + */ + +struct seq_operations slabinfo_op = { + start: s_start, + next: s_next, + stop: s_stop, + show: s_show +}; + +#define MAX_SLABINFO_WRITE 128 +/** + * slabinfo_write - SMP tuning for the slab allocator + * @file: unused + * @buffer: user buffer + * @count: data len + * @data: unused + */ +ssize_t slabinfo_write(struct file *file, const char *buffer, + size_t count, loff_t *ppos) +{ +#ifdef CONFIG_SMP + char kbuf[MAX_SLABINFO_WRITE+1], *tmp; + int limit, batchcount, res; + struct list_head *p; + + if (count > MAX_SLABINFO_WRITE) + return -EINVAL; + if (copy_from_user(&kbuf, buffer, count)) + return -EFAULT; + kbuf[MAX_SLABINFO_WRITE] = '\0'; + + tmp = strchr(kbuf, ' '); + if (!tmp) + return -EINVAL; + *tmp = '\0'; + tmp++; + limit = simple_strtol(tmp, &tmp, 10); + while (*tmp == ' ') + tmp++; + batchcount = simple_strtol(tmp, &tmp, 10); + + /* Find the cache in the chain of caches. 
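The accepted input is a single line of the form "cache-name limit batchcount", where limit bounds each per-CPU array and batchcount (at most limit, conventionally about half of it) is the number of objects moved per refill or flush. A user-space sketch of such a tuning write; the cache name is only an example, and on a kernel built without CONFIG_SMP the write fails with -EINVAL:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* hypothetical example: shrink the per-CPU arrays of inode_cache */
	const char *cmd = "inode_cache 124 62\n";
	FILE *f = fopen("/proc/slabinfo", "w");

	if (!f) {
		perror("/proc/slabinfo");
		return 1;
	}
	if (fwrite(cmd, 1, strlen(cmd), f) != strlen(cmd))
		perror("write");
	fclose(f);
	return 0;
}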
*/ + down(&cache_chain_sem); + res = -EINVAL; + list_for_each(p,&cache_chain) { + kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); + + if (!strcmp(cachep->name, kbuf)) { + res = kmem_tune_cpucache(cachep, limit, batchcount); + break; + } + } + up(&cache_chain_sem); + if (res >= 0) + res = count; + return res; +#else + return -EINVAL; +#endif +} +#endif diff --git a/uClinux-2.4.31-uc0/mm/swap.c b/uClinux-2.4.31-uc0/mm/swap.c new file mode 100644 index 0000000..41e8ed6 --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/swap.c @@ -0,0 +1,185 @@ +/* + * linux/mm/swap.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * This file contains the default values for the opereation of the + * Linux VM subsystem. Fine-tuning documentation can be found in + * linux/Documentation/sysctl/vm.txt. + * Started 18.12.91 + * Swap aging added 23.2.95, Stephen Tweedie. + * Buffermem limits added 12.3.98, Rik van Riel. + */ + +#include <linux/mm.h> +#include <linux/kernel_stat.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/pagemap.h> +#include <linux/init.h> + +#include <asm/dma.h> +#include <asm/uaccess.h> /* for copy_to/from_user */ +#include <asm/pgtable.h> + +/* How many pages do we try to swap or page in/out together? */ +int page_cluster; + +pager_daemon_t pager_daemon = { + 512, /* base number for calculating the number of tries */ + SWAP_CLUSTER_MAX, /* minimum number of tries */ + 8, /* do swap I/O in clusters of this size */ +}; + +/* + * Move an inactive page to the active list. + */ +static inline void activate_page_nolock(struct page * page) +{ + if (PageLRU(page) && !PageActive(page)) { + del_page_from_inactive_list(page); + add_page_to_active_list(page); + } +} + +void fastcall activate_page(struct page * page) +{ + spin_lock(&pagemap_lru_lock); + activate_page_nolock(page); + spin_unlock(&pagemap_lru_lock); +} + +/** + * lru_cache_add: add a page to the page lists + * @page: the page to add + */ +void fastcall lru_cache_add(struct page * page) +{ + if (!PageLRU(page)) { + spin_lock(&pagemap_lru_lock); + if (!TestSetPageLRU(page)) + add_page_to_inactive_list(page); + spin_unlock(&pagemap_lru_lock); + } +} + +/** + * __lru_cache_del: remove a page from the page lists + * @page: the page to add + * + * This function is for when the caller already holds + * the pagemap_lru_lock. + */ +void fastcall __lru_cache_del(struct page * page) +{ + if (TestClearPageLRU(page)) { + if (PageActive(page)) { + del_page_from_active_list(page); + } else { + del_page_from_inactive_list(page); + } + } +} + +/** + * lru_cache_del: remove a page from the page lists + * @page: the page to remove + */ +void fastcall lru_cache_del(struct page * page) +{ + spin_lock(&pagemap_lru_lock); + __lru_cache_del(page); + spin_unlock(&pagemap_lru_lock); +} + +/** + * delta_nr_active_pages: alter the number of active pages. + * + * @page: the page which is being activated/deactivated + * @delta: +1 for activation, -1 for deactivation + * + * Called under pagecache_lock + */ +void delta_nr_active_pages(struct page *page, long delta) +{ + pg_data_t *pgdat; + zone_t *classzone, *overflow; + + classzone = page_zone(page); + pgdat = classzone->zone_pgdat; + overflow = pgdat->node_zones + pgdat->nr_zones; + + while (classzone < overflow) { + classzone->nr_active_pages += delta; + classzone++; + } + nr_active_pages += delta; +} + +/** + * delta_nr_inactive_pages: alter the number of inactive pages. 
+ * + * @page: the page which is being deactivated/activated + * @delta: +1 for deactivation, -1 for activation + * + * Called under pagecache_lock + */ +void delta_nr_inactive_pages(struct page *page, long delta) +{ + pg_data_t *pgdat; + zone_t *classzone, *overflow; + + classzone = page_zone(page); + pgdat = classzone->zone_pgdat; + overflow = pgdat->node_zones + pgdat->nr_zones; + + while (classzone < overflow) { + classzone->nr_inactive_pages += delta; + classzone++; + } + nr_inactive_pages += delta; +} + +/** + * delta_nr_cache_pages: alter the number of pages in the pagecache + * + * @page: the page which is being added/removed + * @delta: +1 for addition, -1 for removal + * + * Called under pagecache_lock + */ +void delta_nr_cache_pages(struct page *page, long delta) +{ + pg_data_t *pgdat; + zone_t *classzone, *overflow; + + classzone = page_zone(page); + pgdat = classzone->zone_pgdat; + overflow = pgdat->node_zones + pgdat->nr_zones; + + while (classzone < overflow) { + classzone->nr_cache_pages += delta; + classzone++; + } + page_cache_size += delta; +} + +/* + * Perform any setup for the swap system + */ +void __init swap_setup(void) +{ + unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); + + /* Use a smaller cluster for small-memory machines */ + if (megs < 16) + page_cluster = 2; + else + page_cluster = 3; + /* + * Right now other parts of the system means that we + * _really_ don't want to cluster much more + */ +} diff --git a/uClinux-2.4.31-uc0/mm/swap_state.c b/uClinux-2.4.31-uc0/mm/swap_state.c new file mode 100644 index 0000000..6c7e86d --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/swap_state.c @@ -0,0 +1,231 @@ +/* + * linux/mm/swap_state.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * + * Rewritten to use page cache, (C) 1998 Stephen Tweedie + */ + +#include <linux/mm.h> +#include <linux/kernel_stat.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/init.h> +#include <linux/pagemap.h> +#include <linux/smp_lock.h> + +#include <asm/pgtable.h> + +/* + * We may have stale swap cache pages in memory: notice + * them here and get rid of the unnecessary final write. 
+ */ +static int swap_writepage(struct page *page) +{ + if (remove_exclusive_swap_page(page)) { + UnlockPage(page); + return 0; + } + rw_swap_page(WRITE, page); + return 0; +} + +static struct address_space_operations swap_aops = { + writepage: swap_writepage, + sync_page: block_sync_page, +}; + +struct address_space swapper_space = { + LIST_HEAD_INIT(swapper_space.clean_pages), + LIST_HEAD_INIT(swapper_space.dirty_pages), + LIST_HEAD_INIT(swapper_space.locked_pages), + 0, /* nrpages */ + &swap_aops, +}; + +#ifdef SWAP_CACHE_INFO +#define INC_CACHE_INFO(x) (swap_cache_info.x++) + +static struct { + unsigned long add_total; + unsigned long del_total; + unsigned long find_success; + unsigned long find_total; + unsigned long noent_race; + unsigned long exist_race; +} swap_cache_info; + +void show_swap_cache_info(void) +{ + printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n", + swap_cache_info.add_total, swap_cache_info.del_total, + swap_cache_info.find_success, swap_cache_info.find_total, + swap_cache_info.noent_race, swap_cache_info.exist_race); +} +#else +#define INC_CACHE_INFO(x) do { } while (0) +#endif + +int add_to_swap_cache(struct page *page, swp_entry_t entry) +{ + if (page->mapping) + BUG(); + if (!swap_duplicate(entry)) { + INC_CACHE_INFO(noent_race); + return -ENOENT; + } + if (add_to_page_cache_unique(page, &swapper_space, entry.val, + page_hash(&swapper_space, entry.val)) != 0) { + swap_free(entry); + INC_CACHE_INFO(exist_race); + return -EEXIST; + } + if (!PageLocked(page)) + BUG(); + if (!PageSwapCache(page)) + BUG(); + INC_CACHE_INFO(add_total); + return 0; +} + +/* + * This must be called only on pages that have + * been verified to be in the swap cache. + */ +void __delete_from_swap_cache(struct page *page) +{ + if (!PageLocked(page)) + BUG(); + if (!PageSwapCache(page)) + BUG(); + ClearPageDirty(page); + __remove_inode_page(page); + INC_CACHE_INFO(del_total); +} + +/* + * This must be called only on pages that have + * been verified to be in the swap cache and locked. + * It will never put the page into the free list, + * the caller has a reference on the page. + */ +void delete_from_swap_cache(struct page *page) +{ + swp_entry_t entry; + + if (!PageLocked(page)) + BUG(); + + if (unlikely(!block_flushpage(page, 0))) + BUG(); /* an anonymous page cannot have page->buffers set */ + + entry.val = page->index; + + spin_lock(&pagecache_lock); + __delete_from_swap_cache(page); + spin_unlock(&pagecache_lock); + + swap_free(entry); + page_cache_release(page); +} + +/* + * Perform a free_page(), also freeing any swap cache associated with + * this page if it is the last user of the page. Can not do a lock_page, + * as we are holding the page_table_lock spinlock. + */ +void free_page_and_swap_cache(struct page *page) +{ + /* + * If we are the only user, then try to free up the swap cache. + * + * Its ok to check for PageSwapCache without the page lock + * here because we are going to recheck again inside + * exclusive_swap_page() _with_ the lock. + * - Marcelo + */ + if (PageSwapCache(page) && !TryLockPage(page)) { + remove_exclusive_swap_page(page); + UnlockPage(page); + } + page_cache_release(page); +} + +/* + * Lookup a swap entry in the swap cache. A found page will be returned + * unlocked and with its refcount incremented - we rely on the kernel + * lock getting page table operations atomic even if we drop the page + * lock before returning. 
+ */ +struct page * lookup_swap_cache(swp_entry_t entry) +{ + struct page *found; + + found = find_get_page(&swapper_space, entry.val); + /* + * Unsafe to assert PageSwapCache and mapping on page found: + * if SMP nothing prevents swapoff from deleting this page from + * the swap cache at this moment. find_lock_page would prevent + * that, but no need to change: we _have_ got the right page. + */ + INC_CACHE_INFO(find_total); + if (found) + INC_CACHE_INFO(find_success); + return found; +} + +/* + * Locate a page of swap in physical memory, reserving swap cache space + * and reading the disk if it is not already cached. + * A failure return means that either the page allocation failed or that + * the swap entry is no longer in use. + */ +struct page * read_swap_cache_async(swp_entry_t entry) +{ + struct page *found_page, *new_page = NULL; + int err; + + do { + /* + * First check the swap cache. Since this is normally + * called after lookup_swap_cache() failed, re-calling + * that would confuse statistics: use find_get_page() + * directly. + */ + found_page = find_get_page(&swapper_space, entry.val); + if (found_page) + break; + + /* + * Get a new page to read into from swap. + */ + if (!new_page) { + new_page = alloc_page(GFP_HIGHUSER); + if (!new_page) + break; /* Out of memory */ + } + + /* + * Associate the page with swap entry in the swap cache. + * May fail (-ENOENT) if swap entry has been freed since + * our caller observed it. May fail (-EEXIST) if there + * is already a page associated with this entry in the + * swap cache: added by a racing read_swap_cache_async, + * or by try_to_swap_out (or shmem_writepage) re-using + * the just freed swap entry for an existing page. + */ + err = add_to_swap_cache(new_page, entry); + if (!err) { + /* + * Initiate read into locked page and return. + */ + rw_swap_page(READ, new_page); + return new_page; + } + } while (err != -ENOENT); + + if (new_page) + page_cache_release(new_page); + return found_page; +} diff --git a/uClinux-2.4.31-uc0/mm/swapfile.c b/uClinux-2.4.31-uc0/mm/swapfile.c new file mode 100644 index 0000000..d6eeba6 --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/swapfile.c @@ -0,0 +1,1268 @@ +/* + * linux/mm/swapfile.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + */ + +#include <linux/slab.h> +#include <linux/smp_lock.h> +#include <linux/kernel_stat.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/blkdev.h> /* for blk_size */ +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/shm.h> + +#include <asm/pgtable.h> + +spinlock_t swaplock = SPIN_LOCK_UNLOCKED; +unsigned int nr_swapfiles; +int total_swap_pages; +static int swap_overflow; + +static const char Bad_file[] = "Bad swap file entry "; +static const char Unused_file[] = "Unused swap file entry "; +static const char Bad_offset[] = "Bad swap offset entry "; +static const char Unused_offset[] = "Unused swap offset entry "; + +struct swap_list_t swap_list = {-1, -1}; + +struct swap_info_struct swap_info[MAX_SWAPFILES]; + +#define SWAPFILE_CLUSTER 256 + +static inline int scan_swap_map(struct swap_info_struct *si) +{ + unsigned long offset; + /* + * We try to cluster swap pages by allocating them + * sequentially in swap. Once we've allocated + * SWAPFILE_CLUSTER pages this way, however, we resort to + * first-free allocation, starting a new cluster. 
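+ * (A rough worked example, with SWAPFILE_CLUSTER at its value of 256
+ * above: once a fully free run of 256 slots has been found, the next
+ * 256 allocations are handed out sequentially from cluster_next,
+ * skipping any slots already in use; only when that budget is spent
+ * do we look for another free run, and only if no such run exists do
+ * we fall back to taking the first free slot above lowest_bit.)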
This + * prevents us from scattering swap pages all over the entire + * swap partition, so that we reduce overall disk seek times + * between swap pages. -- sct */ + if (si->cluster_nr) { + while (si->cluster_next <= si->highest_bit) { + offset = si->cluster_next++; + if (si->swap_map[offset]) + continue; + si->cluster_nr--; + goto got_page; + } + } + si->cluster_nr = SWAPFILE_CLUSTER; + + /* try to find an empty (even not aligned) cluster. */ + offset = si->lowest_bit; + check_next_cluster: + if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit) + { + int nr; + for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++) + if (si->swap_map[nr]) + { + offset = nr+1; + goto check_next_cluster; + } + /* We found a completly empty cluster, so start + * using it. + */ + goto got_page; + } + /* No luck, so now go finegrined as usual. -Andrea */ + for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { + if (si->swap_map[offset]) + continue; + si->lowest_bit = offset+1; + got_page: + if (offset == si->lowest_bit) + si->lowest_bit++; + if (offset == si->highest_bit) + si->highest_bit--; + if (si->lowest_bit > si->highest_bit) { + si->lowest_bit = si->max; + si->highest_bit = 0; + } + si->swap_map[offset] = 1; + nr_swap_pages--; + si->cluster_next = offset+1; + return offset; + } + si->lowest_bit = si->max; + si->highest_bit = 0; + return 0; +} + +swp_entry_t get_swap_page(void) +{ + struct swap_info_struct * p; + unsigned long offset; + swp_entry_t entry; + int type, wrapped = 0; + + entry.val = 0; /* Out of memory */ + swap_list_lock(); + type = swap_list.next; + if (type < 0) + goto out; + if (nr_swap_pages <= 0) + goto out; + + while (1) { + p = &swap_info[type]; + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + swap_device_lock(p); + offset = scan_swap_map(p); + swap_device_unlock(p); + if (offset) { + entry = SWP_ENTRY(type,offset); + type = swap_info[type].next; + if (type < 0 || + p->prio != swap_info[type].prio) { + swap_list.next = swap_list.head; + } else { + swap_list.next = type; + } + goto out; + } + } + type = p->next; + if (!wrapped) { + if (type < 0 || p->prio != swap_info[type].prio) { + type = swap_list.head; + wrapped = 1; + } + } else + if (type < 0) + goto out; /* out of swap space */ + } +out: + swap_list_unlock(); + return entry; +} + +static struct swap_info_struct * swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry.val) + goto out; + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + p = & swap_info[type]; + if (!(p->flags & SWP_USED)) + goto bad_device; + offset = SWP_OFFSET(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_free; + swap_list_lock(); + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = type; + swap_device_lock(p); + return p; + +bad_free: + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); + goto out; +bad_offset: + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); + goto out; +bad_device: + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); + goto out; +bad_nofile: + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); +out: + return NULL; +} + +static void swap_info_put(struct swap_info_struct * p) +{ + swap_device_unlock(p); + swap_list_unlock(); +} + +static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) +{ + int count = p->swap_map[offset]; + + if (count < SWAP_MAP_MAX) { + count--; + p->swap_map[offset] = count; + if (!count) { + if 
(offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + nr_swap_pages++; + } + } + return count; +} + +/* + * Caller has made sure that the swapdevice corresponding to entry + * is still around or has not been recycled. + */ +void swap_free(swp_entry_t entry) +{ + struct swap_info_struct * p; + + p = swap_info_get(entry); + if (p) { + swap_entry_free(p, SWP_OFFSET(entry)); + swap_info_put(p); + } +} + +/* + * Check if we're the only user of a swap page, + * when the page is locked. + */ +static int exclusive_swap_page(struct page *page) +{ + int retval = 0; + struct swap_info_struct * p; + swp_entry_t entry; + + entry.val = page->index; + p = swap_info_get(entry); + if (p) { + /* Is the only swap cache user the cache itself? */ + if (p->swap_map[SWP_OFFSET(entry)] == 1) { + /* Recheck the page count with the pagecache lock held.. */ + spin_lock(&pagecache_lock); + if (page_count(page) - !!page->buffers == 2) + retval = 1; + spin_unlock(&pagecache_lock); + } + swap_info_put(p); + } + return retval; +} + +/* + * We can use this swap cache entry directly + * if there are no other references to it. + * + * Here "exclusive_swap_page()" does the real + * work, but we opportunistically check whether + * we need to get all the locks first.. + */ +int fastcall can_share_swap_page(struct page *page) +{ + int retval = 0; + + if (!PageLocked(page)) + BUG(); + switch (page_count(page)) { + case 3: + if (!page->buffers) + break; + /* Fallthrough */ + case 2: + if (!PageSwapCache(page)) + break; + retval = exclusive_swap_page(page); + break; + case 1: + if (PageReserved(page)) + break; + retval = 1; + } + return retval; +} + +/* + * Work out if there are any other processes sharing this + * swap cache page. Free it if you can. Return success. + */ +int fastcall remove_exclusive_swap_page(struct page *page) +{ + int retval; + struct swap_info_struct * p; + swp_entry_t entry; + + if (!PageLocked(page)) + BUG(); + if (!PageSwapCache(page)) + return 0; + if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */ + return 0; + + entry.val = page->index; + p = swap_info_get(entry); + if (!p) + return 0; + + /* Is the only swap cache user the cache itself? */ + retval = 0; + if (p->swap_map[SWP_OFFSET(entry)] == 1) { + /* Recheck the page count with the pagecache lock held.. */ + spin_lock(&pagecache_lock); + if (page_count(page) - !!page->buffers == 2) { + __delete_from_swap_cache(page); + SetPageDirty(page); + retval = 1; + } + spin_unlock(&pagecache_lock); + } + swap_info_put(p); + + if (retval) { + block_flushpage(page, 0); + swap_free(entry); + page_cache_release(page); + } + + return retval; +} + +/* + * Free the swap entry like above, but also try to + * free the page cache entry if it is the last user. + */ +void free_swap_and_cache(swp_entry_t entry) +{ + struct swap_info_struct * p; + struct page *page = NULL; + + p = swap_info_get(entry); + if (p) { + if (swap_entry_free(p, SWP_OFFSET(entry)) == 1) + page = find_trylock_page(&swapper_space, entry.val); + swap_info_put(p); + } + if (page) { + page_cache_get(page); + /* Only cache user (+us), or swap space full? Free it! */ + if (page_count(page) - !!page->buffers == 2 || vm_swap_full()) { + delete_from_swap_cache(page); + SetPageDirty(page); + } + UnlockPage(page); + page_cache_release(page); + } +} + +/* + * The swap entry has been read in advance, and we return 1 to indicate + * that the page has been used or is no longer needed. 
+ * + * Always set the resulting pte to be nowrite (the same as COW pages + * after one process has exited). We don't know just how many PTEs will + * share this swap entry, so be cautious and let do_wp_page work out + * what to do if a write is requested later. + */ +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, + pte_t *dir, swp_entry_t entry, struct page* page) +{ + pte_t pte = *dir; + + if (likely(pte_to_swp_entry(pte).val != entry.val)) + return; + if (unlikely(pte_none(pte) || pte_present(pte))) + return; + get_page(page); + set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + swap_free(entry); + ++vma->vm_mm->rss; +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, + unsigned long address, unsigned long size, unsigned long offset, + swp_entry_t entry, struct page* page) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*dir)) + return; + if (pmd_bad(*dir)) { + pmd_ERROR(*dir); + pmd_clear(dir); + return; + } + pte = pte_offset(dir, address); + offset += address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, + unsigned long address, unsigned long size, + swp_entry_t entry, struct page* page) +{ + pmd_t * pmd; + unsigned long offset, end; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, address); + offset = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + if (address >= end) + BUG(); + do { + unuse_pmd(vma, pmd, address, end - address, offset, entry, + page); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, + swp_entry_t entry, struct page* page) +{ + unsigned long start = vma->vm_start, end = vma->vm_end; + + if (start >= end) + BUG(); + do { + unuse_pgd(vma, pgdir, start, end - start, entry, page); + start = (start + PGDIR_SIZE) & PGDIR_MASK; + pgdir++; + } while (start && (start < end)); +} + +static void unuse_process(struct mm_struct * mm, + swp_entry_t entry, struct page* page) +{ + struct vm_area_struct* vma; + + /* + * Go through process' page directory. + */ + spin_lock(&mm->page_table_lock); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + pgd_t * pgd = pgd_offset(mm, vma->vm_start); + unuse_vma(vma, pgd, entry, page); + } + spin_unlock(&mm->page_table_lock); + return; +} + +/* + * Scan swap_map from current position to next entry still in use. + * Recycle to start on reaching the end, returning 0 when empty. + */ +static int find_next_to_unuse(struct swap_info_struct *si, int prev) +{ + int max = si->max; + int i = prev; + int count; + + /* + * No need for swap_device_lock(si) here: we're just looking + * for whether an entry is in use, not modifying it; false + * hits are okay, and sys_swapoff() has already prevented new + * allocations from this area (while holding swap_list_lock()). 
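+ *
+ * The scan order, as a small worked example: with max == 1024 and
+ * prev == 700 we test offsets 701..1023 first, then wrap around and
+ * test 1..700, and only return 0 if none of them holds a live entry
+ * (non-zero and not SWAP_MAP_BAD).  Offset 0 is never handed back: it
+ * is reserved for the swap header (swap_map[0] is SWAP_MAP_BAD), and
+ * 0 doubles as the "all done" return value for the caller's loop.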
+ */ + for (;;) { + if (++i >= max) { + if (!prev) { + i = 0; + break; + } + /* + * No entries in use at top of swap_map, + * loop back to start and recheck there. + */ + max = prev + 1; + prev = 0; + i = 1; + } + count = si->swap_map[i]; + if (count && count != SWAP_MAP_BAD) + break; + } + return i; +} + +/* + * We completely avoid races by reading each swap page in advance, + * and then search for the process using it. All the necessary + * page table adjustments can then be made atomically. + */ +static int try_to_unuse(unsigned int type) +{ + struct swap_info_struct * si = &swap_info[type]; + struct mm_struct *start_mm; + unsigned short *swap_map; + unsigned short swcount; + struct page *page; + swp_entry_t entry; + int i = 0; + int retval = 0; + int reset_overflow = 0; + int shmem; + + /* + * When searching mms for an entry, a good strategy is to + * start at the first mm we freed the previous entry from + * (though actually we don't notice whether we or coincidence + * freed the entry). Initialize this start_mm with a hold. + * + * A simpler strategy would be to start at the last mm we + * freed the previous entry from; but that would take less + * advantage of mmlist ordering (now preserved by swap_out()), + * which clusters forked address spaces together, most recent + * child immediately after parent. If we race with dup_mmap(), + * we very much want to resolve parent before child, otherwise + * we may miss some entries: using last mm would invert that. + */ + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + + /* + * Keep on scanning until all entries have gone. Usually, + * one pass through swap_map is enough, but not necessarily: + * mmput() removes mm from mmlist before exit_mmap() and its + * zap_page_range(). That's not too bad, those entries are + * on their way out, and handled faster there than here. + * do_munmap() behaves similarly, taking the range out of mm's + * vma list before zap_page_range(). But unfortunately, when + * unmapping a part of a vma, it takes the whole out first, + * then reinserts what's left after (might even reschedule if + * open() method called) - so swap entries may be invisible + * to swapoff for a while, then reappear - but that is rare. + */ + while ((i = find_next_to_unuse(si, i))) { + /* + * Get a page for the entry, using the existing swap + * cache page if there is one. Otherwise, get a clean + * page and read the swap into it. + */ + swap_map = &si->swap_map[i]; + entry = SWP_ENTRY(type, i); + page = read_swap_cache_async(entry); + if (!page) { + /* + * Either swap_duplicate() failed because entry + * has been freed independently, and will not be + * reused since sys_swapoff() already disabled + * allocation from here, or alloc_page() failed. + */ + if (!*swap_map) + continue; + retval = -ENOMEM; + break; + } + + /* + * Don't hold on to start_mm if it looks like exiting. + */ + if (atomic_read(&start_mm->mm_users) == 1) { + mmput(start_mm); + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + } + + /* + * Wait for and lock page. When do_swap_page races with + * try_to_unuse, do_swap_page can handle the fault much + * faster than try_to_unuse can locate the entry. This + * apparently redundant "wait_on_page" lets try_to_unuse + * defer to do_swap_page in such a case - in some tests, + * do_swap_page and try_to_unuse repeatedly compete. + */ + wait_on_page(page); + lock_page(page); + + /* + * Remove all references to entry, without blocking. 
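+ * (As a reminder of the counting involved: each swap_map slot counts
+ * its users, one for the swap cache plus one for every pte, or shmem
+ * object, that still holds the entry, so the passes below keep
+ * searching address spaces until the count falls back to 1 and only
+ * the cache reference is left.)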
+ * Whenever we reach init_mm, there's no address space + * to search, but use it as a reminder to search shmem. + */ + shmem = 0; + swcount = *swap_map; + if (swcount > 1) { + flush_page_to_ram(page); + if (start_mm == &init_mm) + shmem = shmem_unuse(entry, page); + else + unuse_process(start_mm, entry, page); + } + if (*swap_map > 1) { + int set_start_mm = (*swap_map >= swcount); + struct list_head *p = &start_mm->mmlist; + struct mm_struct *new_start_mm = start_mm; + struct mm_struct *mm; + + spin_lock(&mmlist_lock); + while (*swap_map > 1 && + (p = p->next) != &start_mm->mmlist) { + mm = list_entry(p, struct mm_struct, mmlist); + swcount = *swap_map; + if (mm == &init_mm) { + set_start_mm = 1; + spin_unlock(&mmlist_lock); + shmem = shmem_unuse(entry, page); + spin_lock(&mmlist_lock); + } else + unuse_process(mm, entry, page); + if (set_start_mm && *swap_map < swcount) { + new_start_mm = mm; + set_start_mm = 0; + } + } + atomic_inc(&new_start_mm->mm_users); + spin_unlock(&mmlist_lock); + mmput(start_mm); + start_mm = new_start_mm; + } + + /* + * How could swap count reach 0x7fff when the maximum + * pid is 0x7fff, and there's no way to repeat a swap + * page within an mm (except in shmem, where it's the + * shared object which takes the reference count)? + * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. + * + * If that's wrong, then we should worry more about + * exit_mmap() and do_munmap() cases described above: + * we might be resetting SWAP_MAP_MAX too early here. + * We know "Undead"s can happen, they're okay, so don't + * report them; but do report if we reset SWAP_MAP_MAX. + */ + if (*swap_map == SWAP_MAP_MAX) { + swap_list_lock(); + swap_device_lock(si); + nr_swap_pages++; + *swap_map = 1; + swap_device_unlock(si); + swap_list_unlock(); + reset_overflow = 1; + } + + /* + * If a reference remains (rare), we would like to leave + * the page in the swap cache; but try_to_swap_out could + * then re-duplicate the entry once we drop page lock, + * so we might loop indefinitely; also, that page could + * not be swapped out to other storage meanwhile. So: + * delete from cache even if there's another reference, + * after ensuring that the data has been saved to disk - + * since if the reference remains (rarer), it will be + * read from disk into another page. Splitting into two + * pages would be incorrect if swap supported "shared + * private" pages, but they are handled by tmpfs files. + * + * Note shmem_unuse already deleted swappage from cache, + * unless corresponding filepage found already in cache: + * in which case it left swappage in cache, lowered its + * swap count to pass quickly through the loops above, + * and now we must reincrement count to try again later. + */ + if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { + rw_swap_page(WRITE, page); + lock_page(page); + } + if (PageSwapCache(page)) { + if (shmem) + swap_duplicate(entry); + else + delete_from_swap_cache(page); + } + + /* + * So we could skip searching mms once swap count went + * to 1, we did not mark any present ptes as dirty: must + * mark page dirty so try_to_swap_out will preserve it. + */ + SetPageDirty(page); + UnlockPage(page); + page_cache_release(page); + + /* + * Make sure that we aren't completely killing + * interactive performance. Interruptible check on + * signal_pending() would be nice, but changes the spec? 
+ */ + if (current->need_resched) + schedule(); + } + + mmput(start_mm); + if (reset_overflow) { + printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); + swap_overflow = 0; + } + return retval; +} + +asmlinkage long sys_swapoff(const char * specialfile) +{ + struct swap_info_struct * p = NULL; + unsigned short *swap_map; + struct nameidata nd; + int i, type, prev; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + err = user_path_walk(specialfile, &nd); + if (err) + goto out; + + lock_kernel(); + prev = -1; + swap_list_lock(); + for (type = swap_list.head; type >= 0; type = swap_info[type].next) { + p = swap_info + type; + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + if (p->swap_file == nd.dentry || + (S_ISBLK(nd.dentry->d_inode->i_mode) && + p->swap_device == nd.dentry->d_inode->i_rdev)) + break; + } + prev = type; + } + err = -EINVAL; + if (type < 0) { + swap_list_unlock(); + goto out_dput; + } + + if (prev < 0) { + swap_list.head = p->next; + } else { + swap_info[prev].next = p->next; + } + if (type == swap_list.next) { + /* just pick something that's safe... */ + swap_list.next = swap_list.head; + } + nr_swap_pages -= p->pages; + total_swap_pages -= p->pages; + p->flags = SWP_USED; + swap_list_unlock(); + unlock_kernel(); + err = try_to_unuse(type); + lock_kernel(); + if (err) { + /* re-insert swap space back into swap_list */ + swap_list_lock(); + for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) + if (p->prio >= swap_info[i].prio) + break; + p->next = i; + if (prev < 0) + swap_list.head = swap_list.next = p - swap_info; + else + swap_info[prev].next = p - swap_info; + nr_swap_pages += p->pages; + total_swap_pages += p->pages; + p->flags = SWP_WRITEOK; + swap_list_unlock(); + goto out_dput; + } + if (p->swap_device) + blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP); + path_release(&nd); + + swap_list_lock(); + swap_device_lock(p); + nd.mnt = p->swap_vfsmnt; + nd.dentry = p->swap_file; + p->swap_vfsmnt = NULL; + p->swap_file = NULL; + p->swap_device = 0; + p->max = 0; + swap_map = p->swap_map; + p->swap_map = NULL; + p->flags = 0; + swap_device_unlock(p); + swap_list_unlock(); + vfree(swap_map); + err = 0; + +out_dput: + unlock_kernel(); + path_release(&nd); +out: + return err; +} + +int get_swaparea_info(char *buf) +{ + char * page = (char *) __get_free_page(GFP_KERNEL); + struct swap_info_struct *ptr = swap_info; + int i, j, len = 0, usedswap; + + if (!page) + return -ENOMEM; + + len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n"); + for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { + if ((ptr->flags & SWP_USED) && ptr->swap_map) { + char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt, + page, PAGE_SIZE); + + len += sprintf(buf + len, "%-31s ", path); + + if (!ptr->swap_device) + len += sprintf(buf + len, "file\t\t"); + else + len += sprintf(buf + len, "partition\t"); + + usedswap = 0; + for (j = 0; j < ptr->max; ++j) + switch (ptr->swap_map[j]) { + case SWAP_MAP_BAD: + case 0: + continue; + default: + usedswap++; + } + len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), + usedswap << (PAGE_SHIFT - 10), ptr->prio); + } + } + free_page((unsigned long) page); + return len; +} + +int is_swap_partition(kdev_t dev) { + struct swap_info_struct *ptr = swap_info; + int i; + + for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { + if (ptr->flags & SWP_USED) + if (ptr->swap_device == dev) + return 1; + } + return 0; +} + +/* + * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
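+ *
+ * Priorities, as a caller-side illustration (the device name is an
+ * arbitrary example):
+ *
+ *	swapon("/dev/hda2", SWAP_FLAG_PREFER | (5 << SWAP_FLAG_PRIO_SHIFT));
+ *
+ * registers the area at priority 5, while a plain swapon() without
+ * SWAP_FLAG_PREFER gets the next default priority below every area
+ * added so far (least_priority counts downwards from 0).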
+ * + * The swapon system call + */ +asmlinkage long sys_swapon(const char * specialfile, int swap_flags) +{ + struct swap_info_struct * p; + struct nameidata nd; + struct inode * swap_inode; + unsigned int type; + int i, j, prev; + int error; + static int least_priority = 0; + union swap_header *swap_header = 0; + int swap_header_version; + int nr_good_pages = 0; + unsigned long maxpages = 1; + int swapfilesize; + struct block_device *bdev = NULL; + unsigned short *swap_map; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + lock_kernel(); + swap_list_lock(); + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) + if (!(p->flags & SWP_USED)) + break; + error = -EPERM; + if (type >= MAX_SWAPFILES) { + swap_list_unlock(); + goto out; + } + if (type >= nr_swapfiles) + nr_swapfiles = type+1; + p->flags = SWP_USED; + p->swap_file = NULL; + p->swap_vfsmnt = NULL; + p->swap_device = 0; + p->swap_map = NULL; + p->lowest_bit = 0; + p->highest_bit = 0; + p->cluster_nr = 0; + p->sdev_lock = SPIN_LOCK_UNLOCKED; + p->next = -1; + if (swap_flags & SWAP_FLAG_PREFER) { + p->prio = + (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; + } else { + p->prio = --least_priority; + } + swap_list_unlock(); + error = user_path_walk(specialfile, &nd); + if (error) + goto bad_swap_2; + + p->swap_file = nd.dentry; + p->swap_vfsmnt = nd.mnt; + swap_inode = nd.dentry->d_inode; + error = -EINVAL; + + if (S_ISBLK(swap_inode->i_mode)) { + kdev_t dev = swap_inode->i_rdev; + struct block_device_operations *bdops; + devfs_handle_t de; + + if (is_mounted(dev)) { + error = -EBUSY; + goto bad_swap_2; + } + + p->swap_device = dev; + set_blocksize(dev, PAGE_SIZE); + + bd_acquire(swap_inode); + bdev = swap_inode->i_bdev; + de = devfs_get_handle_from_inode(swap_inode); + bdops = devfs_get_ops(de); /* Increments module use count */ + if (bdops) bdev->bd_op = bdops; + + error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP); + devfs_put_ops(de);/*Decrement module use count now we're safe*/ + if (error) + goto bad_swap_2; + set_blocksize(dev, PAGE_SIZE); + error = -ENODEV; + if (!dev || (blk_size[MAJOR(dev)] && + !blk_size[MAJOR(dev)][MINOR(dev)])) + goto bad_swap; + swapfilesize = 0; + if (blk_size[MAJOR(dev)]) + swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)] + >> (PAGE_SHIFT - 10); + } else if (S_ISREG(swap_inode->i_mode)) + swapfilesize = swap_inode->i_size >> PAGE_SHIFT; + else + goto bad_swap; + + error = -EBUSY; + for (i = 0 ; i < nr_swapfiles ; i++) { + struct swap_info_struct *q = &swap_info[i]; + if (i == type || !q->swap_file) + continue; + if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping) + goto bad_swap; + } + + swap_header = (void *) __get_free_page(GFP_USER); + if (!swap_header) { + printk("Unable to start swapping: out of memory :-)\n"); + error = -ENOMEM; + goto bad_swap; + } + + lock_page(virt_to_page(swap_header)); + rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header); + + if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) + swap_header_version = 1; + else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) + swap_header_version = 2; + else { + printk("Unable to find swap-space signature\n"); + error = -EINVAL; + goto bad_swap; + } + + switch (swap_header_version) { + case 1: + memset(((char *) swap_header)+PAGE_SIZE-10,0,10); + j = 0; + p->lowest_bit = 0; + p->highest_bit = 0; + for (i = 1 ; i < 8*PAGE_SIZE ; i++) { + if (test_bit(i,(char *) swap_header)) { + if (!p->lowest_bit) + p->lowest_bit = i; + p->highest_bit = i; + maxpages = i+1; + j++; + } 
+ } + nr_good_pages = j; + p->swap_map = vmalloc(maxpages * sizeof(short)); + if (!p->swap_map) { + error = -ENOMEM; + goto bad_swap; + } + for (i = 1 ; i < maxpages ; i++) { + if (test_bit(i,(char *) swap_header)) + p->swap_map[i] = 0; + else + p->swap_map[i] = SWAP_MAP_BAD; + } + break; + + case 2: + /* Check the swap header's sub-version and the size of + the swap file and bad block lists */ + if (swap_header->info.version != 1) { + printk(KERN_WARNING + "Unable to handle swap header version %d\n", + swap_header->info.version); + error = -EINVAL; + goto bad_swap; + } + + p->lowest_bit = 1; + maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1; + if (maxpages > swap_header->info.last_page) + maxpages = swap_header->info.last_page; + p->highest_bit = maxpages - 1; + + error = -EINVAL; + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + goto bad_swap; + + /* OK, set up the swap map and apply the bad block list */ + if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { + error = -ENOMEM; + goto bad_swap; + } + + error = 0; + memset(p->swap_map, 0, maxpages * sizeof(short)); + for (i=0; i<swap_header->info.nr_badpages; i++) { + int page = swap_header->info.badpages[i]; + if (page <= 0 || page >= swap_header->info.last_page) + error = -EINVAL; + else + p->swap_map[page] = SWAP_MAP_BAD; + } + nr_good_pages = swap_header->info.last_page - + swap_header->info.nr_badpages - + 1 /* header page */; + if (error) + goto bad_swap; + } + + if (swapfilesize && maxpages > swapfilesize) { + printk(KERN_WARNING + "Swap area shorter than signature indicates\n"); + error = -EINVAL; + goto bad_swap; + } + if (!nr_good_pages) { + printk(KERN_WARNING "Empty swap-file\n"); + error = -EINVAL; + goto bad_swap; + } + p->swap_map[0] = SWAP_MAP_BAD; + swap_list_lock(); + swap_device_lock(p); + p->max = maxpages; + p->flags = SWP_WRITEOK; + p->pages = nr_good_pages; + nr_swap_pages += nr_good_pages; + total_swap_pages += nr_good_pages; + printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n", + nr_good_pages<<(PAGE_SHIFT-10), p->prio); + + /* insert swap space into swap_list: */ + prev = -1; + for (i = swap_list.head; i >= 0; i = swap_info[i].next) { + if (p->prio >= swap_info[i].prio) { + break; + } + prev = i; + } + p->next = i; + if (prev < 0) { + swap_list.head = swap_list.next = p - swap_info; + } else { + swap_info[prev].next = p - swap_info; + } + swap_device_unlock(p); + swap_list_unlock(); + error = 0; + goto out; +bad_swap: + if (bdev) + blkdev_put(bdev, BDEV_SWAP); +bad_swap_2: + swap_list_lock(); + swap_map = p->swap_map; + nd.mnt = p->swap_vfsmnt; + nd.dentry = p->swap_file; + p->swap_device = 0; + p->swap_file = NULL; + p->swap_vfsmnt = NULL; + p->swap_map = NULL; + p->flags = 0; + if (!(swap_flags & SWAP_FLAG_PREFER)) + ++least_priority; + swap_list_unlock(); + if (swap_map) + vfree(swap_map); + path_release(&nd); +out: + if (swap_header) + free_page((long) swap_header); + unlock_kernel(); + return error; +} + +void si_swapinfo(struct sysinfo *val) +{ + unsigned int i; + unsigned long nr_to_be_unused = 0; + + swap_list_lock(); + for (i = 0; i < nr_swapfiles; i++) { + unsigned int j; + if (swap_info[i].flags != SWP_USED) + continue; + for (j = 0; j < swap_info[i].max; ++j) { + switch (swap_info[i].swap_map[j]) { + case 0: + case SWAP_MAP_BAD: + continue; + default: + nr_to_be_unused++; + } + } + } + val->freeswap = nr_swap_pages + nr_to_be_unused; + val->totalswap = total_swap_pages + nr_to_be_unused; + swap_list_unlock(); +} + +/* + * Verify that a swap entry is valid and increment its swap 
map count. + * + * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as + * "permanent", but will be reclaimed by the next swapoff. + */ +int swap_duplicate(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + int result = 0; + + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = SWP_OFFSET(entry); + + swap_device_lock(p); + if (offset < p->max && p->swap_map[offset]) { + if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { + p->swap_map[offset]++; + result = 1; + } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { + if (swap_overflow++ < 5) + printk(KERN_WARNING "swap_dup: swap entry overflow\n"); + p->swap_map[offset] = SWAP_MAP_MAX; + result = 1; + } + } + swap_device_unlock(p); +out: + return result; + +bad_file: + printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + goto out; +} + +/* + * Prior swap_duplicate protects against swap device deletion. + */ +void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, + kdev_t *dev, struct inode **swapf) +{ + unsigned long type; + struct swap_info_struct *p; + + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val); + return; + } + + p = &swap_info[type]; + *offset = SWP_OFFSET(entry); + if (*offset >= p->max && *offset != 0) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val); + return; + } + if (p->swap_map && !p->swap_map[*offset]) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val); + return; + } + if (!(p->flags & SWP_USED)) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val); + return; + } + + if (p->swap_device) { + *dev = p->swap_device; + } else if (p->swap_file) { + *swapf = p->swap_file->d_inode; + } else { + printk(KERN_ERR "rw_swap_page: no swap file or device\n"); + } + return; +} + +/* + * swap_device_lock prevents swap_map being freed. Don't grab an extra + * reference on the swaphandle, it doesn't matter if it becomes unused. 
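+ *
+ * Readahead window, as a worked example: with the default page_cluster
+ * of 3 (set in swap_setup() for machines with 16MB or more) a fault at
+ * swap offset 21 gives toff = (21 >> 3) << 3 = 16, so up to 1 << 3 = 8
+ * slots, offsets 16..23, are offered for readahead; the count stops
+ * early at the first free or bad slot or at the end of the area, and
+ * an aligned start of 0 is bumped to 1 because slot 0 is the header.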
+ */ +int valid_swaphandles(swp_entry_t entry, unsigned long *offset) +{ + int ret = 0, i = 1 << page_cluster; + unsigned long toff; + struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info; + + if (!page_cluster) /* no readahead */ + return 0; + toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster; + if (!toff) /* first page is swap header */ + toff++, i--; + *offset = toff; + + swap_device_lock(swapdev); + do { + /* Don't read-ahead past the end of the swap area */ + if (toff >= swapdev->max) + break; + /* Don't read in free or bad pages */ + if (!swapdev->swap_map[toff]) + break; + if (swapdev->swap_map[toff] == SWAP_MAP_BAD) + break; + toff++; + ret++; + } while (--i); + swap_device_unlock(swapdev); + return ret; +} diff --git a/uClinux-2.4.31-uc0/mm/vmalloc.c b/uClinux-2.4.31-uc0/mm/vmalloc.c new file mode 100644 index 0000000..4d2a93a --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/vmalloc.c @@ -0,0 +1,384 @@ +/* + * linux/mm/vmalloc.c + * + * Copyright (C) 1993 Linus Torvalds + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 + */ + +#include <linux/config.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/spinlock.h> +#include <linux/highmem.h> +#include <linux/smp_lock.h> + +#include <asm/uaccess.h> +#include <asm/pgalloc.h> + +rwlock_t vmlist_lock = RW_LOCK_UNLOCKED; +struct vm_struct * vmlist; + +static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*pmd)) + return; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return; + } + pte = pte_offset(pmd, address); + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + pte_t page; + page = ptep_get_and_clear(pte); + address += PAGE_SIZE; + pte++; + if (pte_none(page)) + continue; + if (pte_present(page)) { + struct page *ptpage = pte_page(page); + if (VALID_PAGE(ptpage) && (!PageReserved(ptpage))) + __free_page(ptpage); + continue; + } + printk(KERN_CRIT "Whee.. 
Swapped out page in kernel page table\n"); + } while (address < end); +} + +static inline void free_area_pmd(pgd_t * dir, unsigned long address, unsigned long size) +{ + pmd_t * pmd; + unsigned long end; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, address); + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + free_area_pte(pmd, address, end - address); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); +} + +void vmfree_area_pages(unsigned long address, unsigned long size) +{ + pgd_t * dir; + unsigned long end = address + size; + + dir = pgd_offset_k(address); + flush_cache_all(); + do { + free_area_pmd(dir, address, end - address); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + flush_tlb_all(); +} + +static inline int alloc_area_pte (pte_t * pte, unsigned long address, + unsigned long size, int gfp_mask, + pgprot_t prot, struct page ***pages) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + struct page * page; + + if (!pages) { + spin_unlock(&init_mm.page_table_lock); + page = alloc_page(gfp_mask); + spin_lock(&init_mm.page_table_lock); + } else { + page = (**pages); + (*pages)++; + + /* Add a reference to the page so we can free later */ + if (page) + atomic_inc(&page->count); + + } + if (!pte_none(*pte)) + printk(KERN_ERR "alloc_area_pte: page already exists\n"); + if (!page) + return -ENOMEM; + set_pte(pte, mk_pte(page, prot)); + address += PAGE_SIZE; + pte++; + } while (address < end); + return 0; +} + +static inline int alloc_area_pmd(pmd_t * pmd, unsigned long address, + unsigned long size, int gfp_mask, + pgprot_t prot, struct page ***pages) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + pte_t * pte = pte_alloc(&init_mm, pmd, address); + if (!pte) + return -ENOMEM; + if (alloc_area_pte(pte, address, end - address, + gfp_mask, prot, pages)) + return -ENOMEM; + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); + return 0; +} + +static inline int __vmalloc_area_pages (unsigned long address, + unsigned long size, + int gfp_mask, + pgprot_t prot, + struct page ***pages) +{ + pgd_t * dir; + unsigned long start = address; + unsigned long end = address + size; + + dir = pgd_offset_k(address); + spin_lock(&init_mm.page_table_lock); + do { + pmd_t *pmd; + + pmd = pmd_alloc(&init_mm, dir, address); + if (!pmd) + goto err; + + if (alloc_area_pmd(pmd, address, end - address, gfp_mask, prot, pages)) + goto err; // The kernel NEVER reclaims pmds, so no need to undo pmd_alloc() here + + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + spin_unlock(&init_mm.page_table_lock); + flush_cache_all(); + return 0; +err: + spin_unlock(&init_mm.page_table_lock); + flush_cache_all(); + if (address > start) + vmfree_area_pages(start, address - start); + return -ENOMEM; +} + +int vmalloc_area_pages(unsigned long address, unsigned long size, + int gfp_mask, pgprot_t prot) +{ + return __vmalloc_area_pages(address, size, gfp_mask, prot, NULL); +} + +struct vm_struct * get_vm_area(unsigned long size, unsigned long flags) +{ + unsigned long addr, next; + struct vm_struct **p, *tmp, *area; + + area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); + if (!area) + 
return NULL; + + size += PAGE_SIZE; + if (!size) { + kfree (area); + return NULL; + } + + addr = VMALLOC_START; + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { + if ((size + addr) < addr) + goto out; + if (size + addr <= (unsigned long) tmp->addr) + break; + next = tmp->size + (unsigned long) tmp->addr; + if (next > addr) + addr = next; + if (addr > VMALLOC_END-size) + goto out; + } + area->flags = flags; + area->addr = (void *)addr; + area->size = size; + area->next = *p; + *p = area; + write_unlock(&vmlist_lock); + return area; + +out: + write_unlock(&vmlist_lock); + kfree(area); + return NULL; +} + +void __vfree(void * addr, int free_area_pages) +{ + struct vm_struct **p, *tmp; + + if (!addr) + return; + if ((PAGE_SIZE-1) & (unsigned long) addr) { + printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); + return; + } + write_lock(&vmlist_lock); + for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) { + if (tmp->addr == addr) { + *p = tmp->next; + if (free_area_pages) + vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size); + write_unlock(&vmlist_lock); + kfree(tmp); + return; + } + } + write_unlock(&vmlist_lock); + printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); +} + +void vfree(void * addr) +{ + __vfree(addr,1); +} + +void * __vmalloc (unsigned long size, int gfp_mask, pgprot_t prot) +{ + void * addr; + struct vm_struct *area; + + size = PAGE_ALIGN(size); + if (!size || (size >> PAGE_SHIFT) > num_physpages) + return NULL; + area = get_vm_area(size, VM_ALLOC); + if (!area) + return NULL; + addr = area->addr; + if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, gfp_mask, + prot, NULL)) { + __vfree(addr, 0); + return NULL; + } + return addr; +} + +void * vmap(struct page **pages, int count, + unsigned long flags, pgprot_t prot) +{ + void * addr; + struct vm_struct *area; + unsigned long size = count << PAGE_SHIFT; + + if (!size || size > (max_mapnr << PAGE_SHIFT)) + return NULL; + area = get_vm_area(size, flags); + if (!area) { + return NULL; + } + addr = area->addr; + if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, 0, + prot, &pages)) { + __vfree(addr, 0); + return NULL; + } + return addr; +} + +long vread(char *buf, char *addr, unsigned long count) +{ + struct vm_struct *tmp; + char *vaddr, *buf_start = buf; + unsigned long n; + + /* Don't allow overflow */ + if ((unsigned long) addr + count < count) + count = -(unsigned long) addr; + + read_lock(&vmlist_lock); + for (tmp = vmlist; tmp; tmp = tmp->next) { + vaddr = (char *) tmp->addr; + if (addr >= vaddr + tmp->size - PAGE_SIZE) + continue; + while (addr < vaddr) { + if (count == 0) + goto finished; + *buf = '\0'; + buf++; + addr++; + count--; + } + n = vaddr + tmp->size - PAGE_SIZE - addr; + do { + if (count == 0) + goto finished; + *buf = *addr; + buf++; + addr++; + count--; + } while (--n > 0); + } +finished: + read_unlock(&vmlist_lock); + return buf - buf_start; +} + +long vwrite(char *buf, char *addr, unsigned long count) +{ + struct vm_struct *tmp; + char *vaddr, *buf_start = buf; + unsigned long n; + + /* Don't allow overflow */ + if ((unsigned long) addr + count < count) + count = -(unsigned long) addr; + + read_lock(&vmlist_lock); + for (tmp = vmlist; tmp; tmp = tmp->next) { + vaddr = (char *) tmp->addr; + if (addr >= vaddr + tmp->size - PAGE_SIZE) + continue; + while (addr < vaddr) { + if (count == 0) + goto finished; + buf++; + addr++; + count--; + } + n = vaddr + tmp->size - PAGE_SIZE - addr; + do { + if (count == 0) + goto finished; + *addr = *buf; + buf++; + 
addr++; + count--; + } while (--n > 0); + } +finished: + read_unlock(&vmlist_lock); + return buf - buf_start; +} diff --git a/uClinux-2.4.31-uc0/mm/vmscan.c b/uClinux-2.4.31-uc0/mm/vmscan.c new file mode 100644 index 0000000..a517fd4 --- /dev/null +++ b/uClinux-2.4.31-uc0/mm/vmscan.c @@ -0,0 +1,858 @@ +/* + * linux/mm/vmscan.c + * + * The pageout daemon, decides which pages to evict (swap out) and + * does the actual work of freeing them. + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Swap reorganised 29.12.95, Stephen Tweedie. + * kswapd added: 7.1.96 sct + * Removed kswapd_ctl limits, and swap out as many pages as needed + * to bring the system back to freepages.high: 2.4.97, Rik van Riel. + * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). + * Multiqueue VM started 5.8.00, Rik van Riel. + */ + +#include <linux/slab.h> +#include <linux/kernel_stat.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/smp_lock.h> +#include <linux/pagemap.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/file.h> + +#include <asm/pgalloc.h> + +/* + * "vm_passes" is the number of vm passes before failing the + * memory balancing. Take into account 3 passes are needed + * for a flush/wait/free cycle and that we only scan 1/vm_cache_scan_ratio + * of the inactive list at each pass. + */ +int vm_passes = 60; + +/* + * "vm_cache_scan_ratio" is how much of the inactive LRU queue we will scan + * in one go. A value of 6 for vm_cache_scan_ratio implies that we'll + * scan 1/6 of the inactive lists during a normal aging round. + */ +int vm_cache_scan_ratio = 6; + +/* + * "vm_mapped_ratio" controls the pageout rate, the smaller, the earlier + * we'll start to pageout. + */ +int vm_mapped_ratio = 100; + +/* + * "vm_lru_balance_ratio" controls the balance between active and + * inactive cache. The bigger vm_balance is, the easier the + * active cache will grow, because we'll rotate the active list + * slowly. A value of 2 means we'll go towards a balance of + * 1/3 of the cache being inactive. + */ +int vm_lru_balance_ratio = 2; + +/* + * "vm_vfs_scan_ratio" is what proportion of the VFS queues we will scan + * in one go. A value of 6 for vm_vfs_scan_ratio implies that 1/6th of + * the unused-inode, dentry and dquot caches will be freed during a normal + * aging round. + */ +int vm_vfs_scan_ratio = 6; + +/* + * "vm_anon_lru" select if to immdiatly insert anon pages in the + * lru. Immediatly means as soon as they're allocated during the + * page faults. + * + * If this is set to 0, they're inserted only after the first + * swapout. + * + * Having anon pages immediatly inserted in the lru allows the + * VM to know better when it's worthwhile to start swapping + * anonymous ram, it will start to swap earlier and it should + * swap smoother and faster, but it will decrease scalability + * on the >16-ways of an order of magnitude. Big SMP/NUMA + * definitely can't take an hit on a global spinlock at + * every anon page allocation. So this is off by default. + * + * Low ram machines that swaps all the time want to turn + * this on (i.e. set to 1). + */ +int vm_anon_lru = 0; + +/* + * The swap-out function returns 1 if it successfully + * scanned all the pages it was asked to (`count'). + * It returns zero if it couldn't do anything, + * + * rss may decrease because pages are shared, but this + * doesn't count as having freed a page. + */ + +/* mm->page_table_lock is held. 
mmap_sem is not held */ +static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone) +{ + pte_t pte; + swp_entry_t entry; + + /* Don't look at this pte if it's been accessed recently. */ + if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) { + mark_page_accessed(page); + return 0; + } + + /* Don't bother unmapping pages that are active */ + if (PageActive(page)) + return 0; + + /* Don't bother replenishing zones not under pressure.. */ + if (!memclass(page_zone(page), classzone)) + return 0; + + if (TryLockPage(page)) + return 0; + + /* From this point on, the odds are that we're going to + * nuke this pte, so read and clear the pte. This hook + * is needed on CPUs which update the accessed and dirty + * bits in hardware. + */ + flush_cache_page(vma, address); + pte = ptep_get_and_clear(page_table); + flush_tlb_page(vma, address); + + if (pte_dirty(pte)) + set_page_dirty(page); + + /* + * Is the page already in the swap cache? If so, then + * we can just drop our reference to it without doing + * any IO - it's already up-to-date on disk. + */ + if (PageSwapCache(page)) { + entry.val = page->index; + swap_duplicate(entry); +set_swap_pte: + set_pte(page_table, swp_entry_to_pte(entry)); +drop_pte: + mm->rss--; + UnlockPage(page); + { + int freeable = page_count(page) - !!page->buffers <= 2; + page_cache_release(page); + return freeable; + } + } + + /* + * Is it a clean page? Then it must be recoverable + * by just paging it in again, and we can just drop + * it.. or if it's dirty but has backing store, + * just mark the page dirty and drop it. + * + * However, this won't actually free any real + * memory, as the page will just be in the page cache + * somewhere, and as such we should just continue + * our scan. + * + * Basically, this just makes it possible for us to do + * some real work in the future in "refill_inactive()". + */ + if (page->mapping) + goto drop_pte; + if (!PageDirty(page)) + goto drop_pte; + + /* + * Anonymous buffercache pages can be left behind by + * concurrent truncate and pagefault. + */ + if (page->buffers) + goto preserve; + + /* + * This is a dirty, swappable page. First of all, + * get a suitable swap entry for it, and make sure + * we have the swap cache set up to associate the + * page with that swap entry. + */ + for (;;) { + entry = get_swap_page(); + if (!entry.val) + break; + /* Add it to the swap cache and mark it dirty + * (adding to the page cache will clear the dirty + * and uptodate bits, so we need to do it again) + */ + if (add_to_swap_cache(page, entry) == 0) { + SetPageUptodate(page); + set_page_dirty(page); + goto set_swap_pte; + } + /* Raced with "speculative" read_swap_cache_async */ + swap_free(entry); + } + + /* No swap space left */ +preserve: + set_pte(page_table, pte); + UnlockPage(page); + return 0; +} + +/* mm->page_table_lock is held. 
mmap_sem is not held */ +static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) +{ + pte_t * pte; + unsigned long pmd_end; + + if (pmd_none(*dir)) + return count; + if (pmd_bad(*dir)) { + pmd_ERROR(*dir); + pmd_clear(dir); + return count; + } + + pte = pte_offset(dir, address); + + pmd_end = (address + PMD_SIZE) & PMD_MASK; + if (end > pmd_end) + end = pmd_end; + + do { + if (pte_present(*pte)) { + struct page *page = pte_page(*pte); + + if (VALID_PAGE(page) && !PageReserved(page)) { + count -= try_to_swap_out(mm, vma, address, pte, page, classzone); + if (!count) { + address += PAGE_SIZE; + break; + } + } + } + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); + mm->swap_address = address; + return count; +} + +/* mm->page_table_lock is held. mmap_sem is not held */ +static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) +{ + pmd_t * pmd; + unsigned long pgd_end; + + if (pgd_none(*dir)) + return count; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return count; + } + + pmd = pmd_offset(dir, address); + + pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; + if (pgd_end && (end > pgd_end)) + end = pgd_end; + + do { + count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); + if (!count) + break; + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return count; +} + +/* mm->page_table_lock is held. mmap_sem is not held */ +static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone) +{ + pgd_t *pgdir; + unsigned long end; + + /* Don't swap out areas which are reserved */ + if (vma->vm_flags & VM_RESERVED) + return count; + + pgdir = pgd_offset(mm, address); + + end = vma->vm_end; + BUG_ON(address >= end); + do { + count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); + if (!count) + break; + address = (address + PGDIR_SIZE) & PGDIR_MASK; + pgdir++; + } while (address && (address < end)); + return count; +} + +/* Placeholder for swap_out(): may be updated by fork.c:mmput() */ +struct mm_struct *swap_mm = &init_mm; + +/* + * Returns remaining count of pages to be swapped out by followup call. + */ +static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone) +{ + unsigned long address; + struct vm_area_struct* vma; + + /* + * Find the proper vm-area after freezing the vma chain + * and ptes. 
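+ * mm->swap_address is the resume cursor for this scan: it records
+ * where the previous pass over this address space stopped, and it is
+ * set to TASK_SIZE once the whole space has been covered, which is
+ * how swap_out() below knows to advance to the next mm on the list.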
+ */ + spin_lock(&mm->page_table_lock); + address = mm->swap_address; + if (address == TASK_SIZE || swap_mm != mm) { + /* We raced: don't count this mm but try again */ + ++*mmcounter; + goto out_unlock; + } + vma = find_vma(mm, address); + if (vma) { + if (address < vma->vm_start) + address = vma->vm_start; + + for (;;) { + count = swap_out_vma(mm, vma, address, count, classzone); + vma = vma->vm_next; + if (!vma) + break; + if (!count) + goto out_unlock; + address = vma->vm_start; + } + } + /* Indicate that we reached the end of address space */ + mm->swap_address = TASK_SIZE; + +out_unlock: + spin_unlock(&mm->page_table_lock); + return count; +} + +static int FASTCALL(swap_out(zone_t * classzone)); +static int fastcall swap_out(zone_t * classzone) +{ + int counter, nr_pages = SWAP_CLUSTER_MAX; + struct mm_struct *mm; + + counter = mmlist_nr << 1; + do { + if (unlikely(current->need_resched)) { + __set_current_state(TASK_RUNNING); + schedule(); + } + + spin_lock(&mmlist_lock); + mm = swap_mm; + while (mm->swap_address == TASK_SIZE || mm == &init_mm) { + mm->swap_address = 0; + mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); + if (mm == swap_mm) + goto empty; + swap_mm = mm; + } + + /* Make sure the mm doesn't disappear when we drop the lock.. */ + atomic_inc(&mm->mm_users); + spin_unlock(&mmlist_lock); + + nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone); + + mmput(mm); + + if (!nr_pages) + return 1; + } while (--counter >= 0); + + return 0; + +empty: + spin_unlock(&mmlist_lock); + return 0; +} + +static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone)); +static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout)); +static int fastcall shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout) +{ + struct list_head * entry; + int max_scan = (classzone->nr_inactive_pages + classzone->nr_active_pages) / vm_cache_scan_ratio; + int max_mapped = vm_mapped_ratio * nr_pages; + + while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) { + struct page * page; + + if (unlikely(current->need_resched)) { + spin_unlock(&pagemap_lru_lock); + __set_current_state(TASK_RUNNING); + schedule(); + spin_lock(&pagemap_lru_lock); + continue; + } + + page = list_entry(entry, struct page, lru); + + BUG_ON(!PageLRU(page)); + BUG_ON(PageActive(page)); + + list_del(entry); + list_add(entry, &inactive_list); + + /* + * Zero page counts can happen because we unlink the pages + * _after_ decrementing the usage count.. + */ + if (unlikely(!page_count(page))) + continue; + + if (!memclass(page_zone(page), classzone)) + continue; + + max_scan--; + + /* Racy check to avoid trylocking when not worthwhile */ + if (!page->buffers && (page_count(page) != 1 || !page->mapping)) + goto page_mapped; + + /* + * The page is locked. IO in progress? + * Move it to the back of the list. 
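+ * In other words: if TryLockPage() below fails the page is busy,
+ * typically under I/O, and since the list_del/list_add a few lines up
+ * already rotated it away from the end of the inactive list being
+ * scanned, we simply move on; we only wait for it when it is a
+ * PG_launder page and the caller may block on the filesystem
+ * (__GFP_FS).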
+ */ + if (unlikely(TryLockPage(page))) { + if (PageLaunder(page) && (gfp_mask & __GFP_FS)) { + page_cache_get(page); + spin_unlock(&pagemap_lru_lock); + wait_on_page(page); + page_cache_release(page); + spin_lock(&pagemap_lru_lock); + } + continue; + } + + if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) { + /* + * It is not critical here to write it only if + * the page is unmapped beause any direct writer + * like O_DIRECT would set the PG_dirty bitflag + * on the phisical page after having successfully + * pinned it and after the I/O to the page is finished, + * so the direct writes to the page cannot get lost. + */ + int (*writepage)(struct page *); + + writepage = page->mapping->a_ops->writepage; + if ((gfp_mask & __GFP_FS) && writepage) { + ClearPageDirty(page); + SetPageLaunder(page); + page_cache_get(page); + spin_unlock(&pagemap_lru_lock); + + writepage(page); + page_cache_release(page); + + spin_lock(&pagemap_lru_lock); + continue; + } + } + + /* + * If the page has buffers, try to free the buffer mappings + * associated with this page. If we succeed we try to free + * the page as well. + */ + if (page->buffers) { + spin_unlock(&pagemap_lru_lock); + + /* avoid to free a locked page */ + page_cache_get(page); + + if (try_to_release_page(page, gfp_mask)) { + if (!page->mapping) { + /* + * We must not allow an anon page + * with no buffers to be visible on + * the LRU, so we unlock the page after + * taking the lru lock + */ + spin_lock(&pagemap_lru_lock); + UnlockPage(page); + __lru_cache_del(page); + + /* effectively free the page here */ + page_cache_release(page); + + if (--nr_pages) + continue; + break; + } else { + /* + * The page is still in pagecache so undo the stuff + * before the try_to_release_page since we've not + * finished and we can now try the next step. + */ + page_cache_release(page); + + spin_lock(&pagemap_lru_lock); + } + } else { + /* failed to drop the buffers so stop here */ + UnlockPage(page); + page_cache_release(page); + + spin_lock(&pagemap_lru_lock); + continue; + } + } + + spin_lock(&pagecache_lock); + + /* + * This is the non-racy check for busy page. + * It is critical to check PageDirty _after_ we made sure + * the page is freeable so not in use by anybody. + * At this point we're guaranteed that page->buffers is NULL, + * nobody can refill page->buffers under us because we still + * hold the page lock. 
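+ * Concretely: a freeable pagecache page has page_count() == 1 at this
+ * point, just the page cache's own reference (the LRU list itself does
+ * not pin a count here).  Anything higher, or a NULL ->mapping, means
+ * some other user grabbed the page or it was truncated while the lru
+ * lock was dropped, so it is left alone and accounted against
+ * max_mapped on the page_mapped path.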
+ */ + if (!page->mapping || page_count(page) > 1) { + spin_unlock(&pagecache_lock); + UnlockPage(page); +page_mapped: + if (--max_mapped < 0) { + spin_unlock(&pagemap_lru_lock); + + nr_pages -= kmem_cache_reap(gfp_mask); + if (nr_pages <= 0) + goto out; + + shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask); + shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask); +#ifdef CONFIG_QUOTA + shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask); +#endif + + if (!*failed_swapout) + *failed_swapout = !swap_out(classzone); + + max_mapped = nr_pages * vm_mapped_ratio; + + spin_lock(&pagemap_lru_lock); + refill_inactive(nr_pages, classzone); + } + continue; + + } + if (PageDirty(page)) { + spin_unlock(&pagecache_lock); + UnlockPage(page); + continue; + } + + __lru_cache_del(page); + + /* point of no return */ + if (likely(!PageSwapCache(page))) { + __remove_inode_page(page); + spin_unlock(&pagecache_lock); + } else { + swp_entry_t swap; + swap.val = page->index; + __delete_from_swap_cache(page); + spin_unlock(&pagecache_lock); + swap_free(swap); + } + + UnlockPage(page); + + /* effectively free the page here */ + page_cache_release(page); + + if (--nr_pages) + continue; + break; + } + spin_unlock(&pagemap_lru_lock); + + out: + return nr_pages; +} + +/* + * This moves pages from the active list to + * the inactive list. + * + * We move them the other way when we see the + * reference bit on the page. + */ +static void fastcall refill_inactive(int nr_pages, zone_t * classzone) +{ + struct list_head * entry; + unsigned long ratio; + + ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_lru_balance_ratio) + 1); + + entry = active_list.prev; + while (ratio && entry != &active_list) { + struct page * page; + + page = list_entry(entry, struct page, lru); + entry = entry->prev; + if (PageTestandClearReferenced(page)) { + list_del(&page->lru); + list_add(&page->lru, &active_list); + continue; + } + + ratio--; + + del_page_from_active_list(page); + add_page_to_inactive_list(page); + SetPageReferenced(page); + } + + if (entry != &active_list) { + list_del(&active_list); + list_add(&active_list, entry); + } +} + +static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout)); +static int fastcall shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout) +{ + nr_pages -= kmem_cache_reap(gfp_mask); + if (nr_pages <= 0) + goto out; + + spin_lock(&pagemap_lru_lock); + refill_inactive(nr_pages, classzone); + + nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, failed_swapout); + +out: + return nr_pages; +} + +static int check_classzone_need_balance(zone_t * classzone); + +int fastcall try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask) +{ + gfp_mask = pf_gfp_mask(gfp_mask); + + for (;;) { + int tries = vm_passes; + int failed_swapout = !(gfp_mask & __GFP_IO); + int nr_pages = SWAP_CLUSTER_MAX; + + do { + nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &failed_swapout); + if (nr_pages <= 0) + return 1; + shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask); + shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask); +#ifdef CONFIG_QUOTA + shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask); +#endif + if (!failed_swapout) + failed_swapout = !swap_out(classzone); + } while (--tries); + +#ifdef CONFIG_OOM_KILLER + out_of_memory(); +#else + if (likely(current->pid != 1)) + break; + if (!check_classzone_need_balance(classzone)) + break; + + 
__set_current_state(TASK_RUNNING); + yield(); +#endif + } + + return 0; +} + +int fastcall try_to_free_pages(unsigned int gfp_mask) +{ + pg_data_t *pgdat; + zonelist_t *zonelist; + unsigned long pf_free_pages; + int error = 0; + + pf_free_pages = current->flags & PF_FREE_PAGES; + current->flags &= ~PF_FREE_PAGES; + + for_each_pgdat(pgdat) { + zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK); + error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask); + } + + current->flags |= pf_free_pages; + return error; +} + +DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); + +static int check_classzone_need_balance(zone_t * classzone) +{ + zone_t * first_zone; + int class_idx = zone_idx(classzone); + + first_zone = classzone->zone_pgdat->node_zones; + while (classzone >= first_zone) { + if (classzone->free_pages > classzone->watermarks[class_idx].high) + return 0; + classzone--; + } + return 1; +} + +static int kswapd_balance_pgdat(pg_data_t * pgdat) +{ + int need_more_balance = 0, i; + zone_t * zone; + + for (i = pgdat->nr_zones-1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (unlikely(current->need_resched)) + schedule(); + if (!zone->need_balance || !zone->size) + continue; + if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) { + zone->need_balance = 0; + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ*5); + continue; + } + if (check_classzone_need_balance(zone)) + need_more_balance = 1; + else + zone->need_balance = 0; + } + + return need_more_balance; +} + +static void kswapd_balance(void) +{ + int need_more_balance; + pg_data_t * pgdat; + + do { + need_more_balance = 0; + + for_each_pgdat(pgdat) + need_more_balance |= kswapd_balance_pgdat(pgdat); + } while (need_more_balance); +} + +static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) +{ + zone_t * zone; + int i; + + for (i = pgdat->nr_zones-1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!zone->need_balance || !zone->size) + continue; + return 0; + } + + return 1; +} + +static int kswapd_can_sleep(void) +{ + pg_data_t * pgdat; + + for_each_pgdat(pgdat) { + if (!kswapd_can_sleep_pgdat(pgdat)) + return 0; + } + + return 1; +} + +/* + * The background pageout daemon, started as a kernel thread + * from the init process. + * + * This basically trickles out pages so that we have _some_ + * free memory available even if there is no other activity + * that frees anything up. This is needed for things like routing + * etc, where we otherwise might have all activity going on in + * asynchronous contexts that cannot page things out. + * + * If there are applications that are active memory-allocators + * (most normal use), this basically shouldn't matter. + */ +int kswapd(void *unused) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + daemonize(); + strcpy(tsk->comm, "kswapd"); + sigfillset(&tsk->blocked); + + /* + * Tell the memory management that we're a "memory allocator", + * and that if we need more memory we should get access to it + * regardless (see "__alloc_pages()"). "kswapd" should + * never get caught in the normal page freeing logic. + * + * (Kswapd normally doesn't need memory anyway, but sometimes + * you need a small amount of memory in order to be able to + * page out something else, and this flag essentially protects + * us from recursively trying to free more memory as we're + * trying to free the first piece of memory in the first place). + */ + tsk->flags |= PF_MEMALLOC; + + /* + * Kswapd main loop. 
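+ *
+ * Sleep on kswapd_wait until a task that needs memory wakes
+ * us, then rebalance every node's zones and kick the disk
+ * task queue so any writeback we scheduled gets submitted.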
+ */
+ for (;;) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kswapd_wait, &wait);
+
+ mb();
+ if (kswapd_can_sleep())
+ schedule();
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&kswapd_wait, &wait);
+
+ /*
+ * If we actually get into a low-memory situation,
+ * the processes needing more memory will wake us
+ * up on a more timely basis.
+ */
+ kswapd_balance();
+ run_task_queue(&tq_disk);
+ }
+}
+
+static int __init kswapd_init(void)
+{
+ printk("Starting kswapd\n");
+ swap_setup();
+ kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+ return 0;
+}
+
+module_init(kswapd_init)
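Editorial note (not part of the commit): the functions above are only the reclaim half of the picture. In the 2.4 design the page allocator in mm/page_alloc.c (included elsewhere in this commit) wakes kswapd through kswapd_wait when a zone drops below its watermarks, and calls try_to_free_pages()/try_to_free_pages_zone() directly when the allocating task is allowed to sleep. The fragment below is a minimal sketch of that caller side under those assumptions; kswapd_wait, try_to_free_pages_zone() and the zone_t need_balance field come from this file and the 2.4 headers, while grab_from_freelist() and the simplified retry logic are hypothetical stand-ins, not the actual page_alloc.c code.

/*
 * Sketch only: a condensed view of how an allocation path is expected
 * to drive the reclaim entry points defined above.
 * grab_from_freelist() is a hypothetical stand-in for the real
 * free-list code in mm/page_alloc.c.
 */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/sched.h>

extern struct page *grab_from_freelist(zone_t *zone, unsigned int order);

struct page *alloc_with_reclaim_sketch(zone_t *classzone,
				       unsigned int gfp_mask,
				       unsigned int order)
{
	struct page *page = grab_from_freelist(classzone, order);

	if (page)
		return page;

	/* Low on memory: flag the zone and wake the background daemon. */
	classzone->need_balance = 1;
	wake_up_interruptible(&kswapd_wait);

	/* Sleepable callers also reclaim synchronously, then retry once. */
	if (gfp_mask & __GFP_WAIT) {
		if (try_to_free_pages_zone(classzone, gfp_mask))
			page = grab_from_freelist(classzone, order);
	}

	/*
	 * May still be NULL: the real allocator keeps looping and relies
	 * on the OOM handling inside try_to_free_pages_zone() above.
	 */
	return page;
}

This split keeps routine balancing in kswapd, which runs with PF_MEMALLOC and never recurses into reclaim, while direct reclaim only happens when an allocation would otherwise fail; that matches the behaviour the kswapd comment above describes.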