slab.c - mm/slab.c - Linux source code 2.1.87pre1 - Bootlin Elixir Cross Referencer

/*
 * linux/mm/slab.c
 * Written by Mark Hemment, 1996/97.
 * (markhe@nextd.demon.co.uk)
 *
 * 11 April '97.  Started multi-threading - markhe
 *	The global cache-chain is protected by the semaphore 'cache_chain_sem'.
 *	The sem is only needed when accessing/extending the cache-chain, which
 *	can never happen inside an interrupt (kmem_cache_create(),
 *	kmem_cache_shrink() and kmem_cache_reap()).
 *	This is a medium-term exclusion lock.
 *
 *	Each cache has its own lock; 'c_spinlock'.  This lock is needed only
 *	when accessing non-constant members of a cache-struct.
 *	Note: 'constant members' are assigned a value in kmem_cache_create() before
 *	the cache is linked into the cache-chain.  The values never change, so not
 *	even a multi-reader lock is needed for these members.
 *	The c_spinlock is only ever held for a few cycles.
 *
 *	To prevent kmem_cache_shrink() trying to shrink a 'growing' cache (which
 *	may be sleeping and therefore not holding the semaphore/lock), the
 *	c_growing field is used.  This also prevents reaping from a cache.
 *
 *	Note, caches can _never_ be destroyed.  When a sub-system (eg module) has
 *	finished with a cache, it can only be shrunk.  This leaves the cache empty,
 *	but already enabled for re-use, eg. during a module re-load.
 *
 *	Notes:
 *		o Constructors/deconstructors are called while the cache-lock
 *		  is _not_ held.  Therefore they _must_ be threaded.
 *		o Constructors must not attempt to allocate memory from the
 *		  same cache that they are a constructor for - infinite loop!
 *		  (There is no easy way to trap this.)
 *		o The per-cache locks must be obtained with local-interrupts disabled.
 *		o When compiled with debug support, and an object-verify (upon release)
 *		  is requested for a cache, the verify-function is called with the cache
 *		  lock held.  This helps debugging.
 *		o The functions called from try_to_free_page() must not attempt
 *		  to allocate memory from a cache which is being grown.
 *		  The buffer sub-system might try to allocate memory, via buffer_cachep.
 *		  As this pri is passed to the SLAB, and then (if necessary) onto the
 *		  gfp() funcs (which avoid calling try_to_free_page()), no deadlock
 *		  should happen.
 *
 *	The positioning of the per-cache lock is tricky.  If the lock is
 *	placed on the same h/w cache line as commonly accessed members
 *	the number of L1 cache-line faults is reduced.  However, this can
 *	lead to the cache-line ping-ponging between processors when the
 *	lock is in contention (and the common members are being accessed).
 *	Decided to keep it away from common members.
 *
 *	More fine-graining is possible, with per-slab locks...but this might be
 *	taking fine graining too far, but would have the advantage;
 *		During most allocs/frees no writes occur to the cache-struct.
 *		Therefore a multi-reader/one writer lock could be used (the writer
 *		needed when the slab chain is being link/unlinked).
 *		As we would not have an exclusion lock for the cache-structure, one
 *		would be needed per-slab (for updating s_free ptr, and/or the contents
 *		of s_index).
 *	The above locking would allow parallel operations to different slabs within
 *	the same cache with reduced spinning.
 *
 *	Per-engine slab caches, backed by a global cache (as in Mach's Zone allocator),
 *	would allow most allocations from the same cache to execute in parallel.
 *
 *	At present, each engine can be growing a cache.  This should be blocked.
 *
 *	It is not currently 100% safe to examine the page_struct outside of a kernel
 *	or global cli lock.  The risk is v. small, and non-fatal.
 *
 *	Calls to printk() are not 100% safe (the function is not threaded).  However,
 *	printk() is only used under an error condition, and the risk is v. small (not
 *	sure if the console write functions 'enjoy' executing multiple contexts in
 *	parallel.  I guess they don't...).
 *	Note, for most calls to printk() any held cache-lock is dropped.  This is not
 *	always done for text size reasons - having *_unlock() everywhere is bloat.
 */

/*
 * An implementation of the Slab Allocator as described in outline in;
 *	UNIX Internals: The New Frontiers by Uresh Vahalia
 *	Pub: Prentice Hall	ISBN 0-13-101908-2
 * or with a little more detail in;
 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
 *	Jeff Bonwick (Sun Microsystems).
 *	Presented at: USENIX Summer 1994 Technical Conference
 */

/*
 * This implementation deviates from Bonwick's paper as it
 * does not use a hash-table for large objects, but rather a per slab
 * index to hold the bufctls.  This allows the bufctl structure to
 * be small (one word), but limits the number of objects a slab (not
 * a cache) can contain when off-slab bufctls are used.  The limit is the
 * size of the largest general-cache that does not use off-slab bufctls,
 * divided by the size of a bufctl.  For 32bit archs, this is 256/4 = 64.
 * This is not serious, as it is only for large objects, when it is unwise
 * to have too many per slab.
 * Note: This limit can be raised by introducing a general-cache whose size
 * is less than 512 (PAGE_SIZE>>3), but greater than 256.
 */

#include	<linux/__KEEPIDENTS__B.h>
#include	<linux/__KEEPIDENTS__C.h>
#include	<linux/__KEEPIDENTS__D.h>
#include	<linux/__KEEPIDENTS__E.h>
#include	<linux/__KEEPIDENTS__F.h>
#include	<linux/__KEEPIDENTS__G.h>

#include	<asm/system.h>
#include	<asm/atomic.h>
#include	<asm/smp_lock.h>
#include	<asm/spinlock.h>

/* If there is a different PAGE_SIZE around, and it works with this allocator,
 * then change the following.
 */
#if	(PAGE_SIZE != 8192 && PAGE_SIZE != 4096)
#error	Your page size is probably not correctly supported - please check
#endif

/* SLAB_MGMT_CHECKS	- 1 to enable extra checks in kmem_cache_create().
 *			  0 if you wish to reduce memory usage.
 *
 * SLAB_DEBUG_SUPPORT	- 1 for kmem_cache_create() to honour; SLAB_DEBUG_FREE,
 *			  SLAB_DEBUG_INITIAL, SLAB_RED_ZONE & SLAB_POISON.
 *			  0 for faster, smaller, code (especially in the critical paths).
 *
 * SLAB_STATS		- 1 to collect stats for /proc/slabinfo.
 *			  0 for faster, smaller, code (especially in the critical paths).
 *
 * SLAB_SELFTEST	- 1 to perform a few tests, mainly for development.
 */
#define		SLAB_MGMT_CHECKS	1
#define		SLAB_DEBUG_SUPPORT	0
#define		SLAB_STATS		0
#define		SLAB_SELFTEST		0

/* Shouldn't this be in a header file somewhere? */
#define	BYTES_PER_WORD		sizeof(void *)

/* Legal flag mask for kmem_cache_create(). */
#if	SLAB_DEBUG_SUPPORT
#if	0
#define	SLAB_C_MASK		(SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_RED_ZONE| \
				 SLAB_POISON|SLAB_HWCACHE_ALIGN|SLAB_NO_REAP| \
				 SLAB_HIGH_PACK)
#endif
#define	SLAB_C_MASK		(SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_RED_ZONE| \
				 SLAB_POISON|SLAB_HWCACHE_ALIGN|SLAB_NO_REAP)
#else
#if	0
#define	SLAB_C_MASK		(SLAB_HWCACHE_ALIGN|SLAB_NO_REAP|SLAB_HIGH_PACK)
#endif
#define	SLAB_C_MASK		(SLAB_HWCACHE_ALIGN|SLAB_NO_REAP)
#endif	/* SLAB_DEBUG_SUPPORT */

/* Slab management struct.
 * Manages the objs in a slab.  Placed either at the end of mem allocated
 * for a slab, or from an internal obj cache (cache_slabp).
 * Slabs are chained into a partially ordered list; fully used first, partial
 * next, and then fully free slabs.
 * The first 4 members are referenced during an alloc/free operation, and
 * should always appear on the same cache line.
 * Note: The offset between some members _must_ match offsets within
 * the kmem_cache_t - see kmem_cache_init() for the checks. */

#define	SLAB_OFFSET_BITS	16	/* could make this larger for 64bit archs */

/* Per-slab management record.
 * Member order is significant: kmem_cache_init() panics if s_magic, s_inuse,
 * s_nextp and s_prevp do not sit at the same relative offsets as c_magic,
 * c_inuse, c_firstp and c_lastp in kmem_cache_t.
 */
typedef struct kmem_slab_s {
	struct kmem_bufctl_s	*s_freep;  /* ptr to first inactive obj in slab */
	struct kmem_bufctl_s	*s_index;  /* off-slab bufctl index, freed to c_index_cachep; NULL if bufctls are on-slab */
	unsigned long		 s_magic;  /* SLAB_MAGIC_ALLOC or SLAB_MAGIC_DESTROYED */
	unsigned long		 s_inuse;  /* num of objs active in slab */

	struct kmem_slab_s	*s_nextp;  /* next slab in the cache's chain */
	struct kmem_slab_s	*s_prevp;  /* previous slab in the cache's chain */
	void			*s_mem;	   /* addr of first obj in slab */
	unsigned long		 s_offset:SLAB_OFFSET_BITS, /* s_mem - s_offset is the addr handed back to kmem_freepages() */
				 s_dma:1;  /* presumably set when the slab's pages are DMAable - confirm at the point of assignment */
} kmem_slab_t;

/* When the slab mgmt is on-slab, this gives the size to use. */
#define	slab_align_size		(L1_CACHE_ALIGN(sizeof(kmem_slab_t)))

/* Test for end of slab chain.
 * Yields a pseudo-slab pointer aimed at the cache's c_offset member.  Viewed
 * through that pointer, s_magic/s_inuse/s_nextp/s_prevp overlay the cache's
 * c_magic/c_inuse/c_firstp/c_lastp (kmem_cache_init() verifies the offsets),
 * so the cache struct itself acts as the head/tail sentinel of its slab chain.
 */
#define	kmem_slab_end(x)	((kmem_slab_t*)&((x)->c_offset))

/* s_magic */
#define	SLAB_MAGIC_ALLOC	0xA5C32F2BUL	/* slab is alive */
#define	SLAB_MAGIC_DESTROYED	0xB2F23C5AUL	/* slab has been destoryed */

/* Bufctl's are used for linking objs within a slab, identifying what slab an obj
 * is in, and the address of the associated obj (for sanity checking with off-slab
 * bufctls).  What a bufctl contains depends upon the state of the obj and
 * the organisation of the cache.
 */
/* One word per object; which union member is meaningful depends on the
 * object's state and on how the cache is organised (see the comment above).
 */
typedef struct kmem_bufctl_s {
	union {
		struct kmem_bufctl_s	*buf_nextp;	/* link used to chain objs within a slab */
		kmem_slab_t		*buf_slabp;	/* slab for obj */
		void *			 buf_objp;	/* addr of the associated obj (sanity checks with off-slab bufctls) */
	} u;
} kmem_bufctl_t;

/* ...shorthand... */
#define	buf_nextp	u.buf_nextp
#define	buf_slabp	u.buf_slabp
#define	buf_objp	u.buf_objp

#if	SLAB_DEBUG_SUPPORT
/* Magic nums for obj red zoning.
 * Placed in the first word before and the first word after an obj.
 */
#define	SLAB_RED_MAGIC1		0x5A2CF071UL	/* when obj is active */
#define	SLAB_RED_MAGIC2		0x170FC2A5UL	/* when obj is inactive */

/* ...and for poisoning */
#define	SLAB_POISON_BYTE	0x5a		/* byte value for poisoning */
#define	SLAB_POISON_END	0xa5		/* end-byte of poisoning */

#endif	/* SLAB_DEBUG_SUPPORT */

/* Cache struct - manages a cache.
 * First four members are commonly referenced during an alloc/free operation.
 */
/* Member order from c_offset through c_lastp is significant: kmem_slab_end()
 * overlays a kmem_slab_t on that region and kmem_cache_init() panics if the
 * offsets stop matching.
 */
struct kmem_cache_s {
	kmem_slab_t		 *c_freep;	/* first slab w. free objs */
	unsigned long	 	  c_flags;	/* constant flags */
	unsigned long		  c_offset;	/* obj size; excludes the bufctl when bufctls are on-slab */
	unsigned long		  c_num;	/* # of objs per slab */

	unsigned long		  c_magic;	/* SLAB_C_MAGIC; overlays s_magic of the pseudo end-slab */
	unsigned long		  c_inuse;	/* kept at zero */
	kmem_slab_t		 *c_firstp;	/* first slab in chain */
	kmem_slab_t		 *c_lastp;	/* last slab in chain */

	spinlock_t		  c_spinlock;	/* guards the non-constant members */
	unsigned long		  c_growing;	/* blocks shrinking/reaping while the cache is being grown */
	unsigned long		  c_dflags;	/* dynamic flags */
	size_t 			  c_org_size;	/* word-aligned obj size before red-zone/bufctl/alignment padding */
	unsigned long		  c_gfporder;	/* order of pgs per slab (2^n) */
	void (*c_ctor)(void *, kmem_cache_t *, unsigned long); /* constructor func */
	void (*c_dtor)(void *, kmem_cache_t *, unsigned long); /* de-constructor func */
	unsigned long		  c_align;	/* alignment of objs */
	size_t			  c_colour;	/* cache colouring range */
	size_t			  c_colour_next;/* cache colouring */
	unsigned long		  c_failures;	/* presumably a count of failed grow attempts - confirm at its users */
	const char		 *c_name;	/* caller's string; pointed to, not copied */
	struct kmem_cache_s	 *c_nextp;	/* next cache in the circular cache-chain */
	kmem_cache_t		 *c_index_cachep; /* general cache supplying off-slab bufctl indexes */
#if	SLAB_STATS
	unsigned long		  c_num_active;
	unsigned long		  c_num_allocations;
	unsigned long		  c_high_mark;	/* highest c_num_active seen (SLAB_STATS_SET_HIGH) */
	unsigned long		  c_grown;
	unsigned long		  c_reaped;
	atomic_t 		  c_errors;
#endif	/* SLAB_STATS */
};

/* internal c_flags */
#define	SLAB_CFLGS_OFF_SLAB	0x010000UL	/* slab mgmt in own cache */
#define	SLAB_CFLGS_BUFCTL	0x020000UL	/* bufctls in own cache */
#define	SLAB_CFLGS_GENERAL	0x080000UL	/* a general-cache */

/* c_dflags (dynamic flags).  Need to hold the spinlock to access this member */
#define	SLAB_CFLGS_GROWN	0x000002UL	/* don't reap a recently grown */

#define	SLAB_OFF_SLAB(x)	((x) & SLAB_CFLGS_OFF_SLAB)
#define	SLAB_BUFCTL(x)		((x) & SLAB_CFLGS_BUFCTL)
#define	SLAB_GROWN(x)		((x) & SLAB_CFLGS_GROWN)

#if	SLAB_STATS
#define	SLAB_STATS_INC_ACTIVE(x)	((x)->c_num_active++)
#define	SLAB_STATS_DEC_ACTIVE(x)	((x)->c_num_active--)
#define	SLAB_STATS_INC_ALLOCED(x)	((x)->c_num_allocations++)
#define	SLAB_STATS_INC_GROWN(x)		((x)->c_grown++)
#define	SLAB_STATS_INC_REAPED(x)	((x)->c_reaped++)
#define	SLAB_STATS_SET_HIGH(x)		do { if ((x)->c_num_active > (x)->c_high_mark) \
						(x)->c_high_mark = (x)->c_num_active; \
					} while (0)
#define	SLAB_STATS_INC_ERR(x)		(atomic_inc(&(x)->c_errors))
#else
#define	SLAB_STATS_INC_ACTIVE(x)
#define	SLAB_STATS_DEC_ACTIVE(x)
#define	SLAB_STATS_INC_ALLOCED(x)
#define	SLAB_STATS_INC_GROWN(x)
#define	SLAB_STATS_INC_REAPED(x)
#define	SLAB_STATS_SET_HIGH(x)
#define	SLAB_STATS_INC_ERR(x)
#endif	/* SLAB_STATS */

#if	SLAB_SELFTEST
#if	!SLAB_DEBUG_SUPPORT
#error	Debug support needed for self-test
#endif
static void kmem_self_test(void);
#endif	/* SLAB_SELFTEST */

/* c_magic - used to detect 'out of slabs' in __kmem_cache_alloc() */
#define	SLAB_C_MAGIC		0x4F17A36DUL

/* maximum size of an obj (in 2^order pages) */
#define	SLAB_OBJ_MAX_ORDER	5	/* 32 pages */

/* maximum num of pages for a slab (prevents large requests to the VM layer) */
#define	SLAB_MAX_GFP_ORDER	5	/* 32 pages */

/* the 'prefered' minimum num of objs per slab - maybe less for large objs */
#define	SLAB_MIN_OBJS_PER_SLAB	4

/* If the num of objs per slab is <= SLAB_MIN_OBJS_PER_SLAB,
 * then the page order must be less than this before trying the next order.
 */
#define	SLAB_BREAK_GFP_ORDER	2

/* Macros for storing/retrieving the cachep and or slab from the
 * global 'mem_map'.  With off-slab bufctls, these are used to find the
 * slab an obj belongs to.  With kmalloc(), and kfree(), these are used
 * to find the cache which an obj belongs to.
 */
#define	SLAB_SET_PAGE_CACHE(pg, x)	((pg)->next = (struct page *)(x))
#define	SLAB_GET_PAGE_CACHE(pg)		((kmem_cache_t *)(pg)->next)
#define	SLAB_SET_PAGE_SLAB(pg, x)	((pg)->prev = (struct page *)(x))
#define	SLAB_GET_PAGE_SLAB(pg)		((kmem_slab_t *)(pg)->prev)

/* Size description struct for general-caches. */
/* One entry per general-cache: the object size it serves and the cache
 * created for that size by kmem_cache_sizes_init().
 */
typedef struct cache_sizes {
	size_t		 cs_size;	/* obj size in bytes; 0 terminates the table */
	kmem_cache_t	*cs_cachep;	/* filled in by kmem_cache_sizes_init() */
} cache_sizes_t;

/* Table of general-cache sizes, ascending, terminated by a zero cs_size.
 * Must stay entry-for-entry parallel with cache_sizes_name[]:
 * kmem_cache_sizes_init() walks both tables in lockstep.
 */
static cache_sizes_t cache_sizes[] = {
#if	PAGE_SIZE == 4096
	{  32,		NULL},
#endif
	{  64,		NULL},
	{ 128,		NULL},
	{ 256,		NULL},
	{ 512,		NULL},
	{1024,		NULL},
	{2048,		NULL},
	{4096,		NULL},
	{8192,		NULL},
	{16384,		NULL},
	{32768,		NULL},
	{65536,		NULL},
	{131072,	NULL},
	{0,		NULL}
};

/* Names for the general-caches.  Not placed into the sizes struct for
 * a good reason; the string ptr is not needed while searching in kmalloc(),
 * and would 'get-in-the-way' in the h/w cache.
 */
/* One name per non-terminator entry of cache_sizes[], in the same order.
 * There is no terminating entry here; the walk in kmem_cache_sizes_init()
 * is bounded by the zero cs_size in cache_sizes[].
 */
static char *cache_sizes_name[] = {
#if	PAGE_SIZE == 4096
	"size-32",
#endif
	"size-64",
	"size-128",
	"size-256",
	"size-512",
	"size-1024",
	"size-2048",
	"size-4096",
	"size-8192",
	"size-16384",
	"size-32768",
	"size-65536",
	"size-131072"
};

/* internal cache of cache description objs */
/* Statically built cache from which all other kmem_cache_t objs are allocated.
 * The initialiser is positional, so it must track the member order of
 * struct kmem_cache_s.  The slab chain starts empty (all three slab ptrs
 * are the end marker) and the cache-chain starts as a ring of one.
 * c_offset, c_num, c_colour and c_colour_next are given their working
 * values by kmem_cache_init().
 */
static	kmem_cache_t	cache_cache = {
/* freep, flags */		kmem_slab_end(&cache_cache), SLAB_NO_REAP,
/* offset, num */		sizeof(kmem_cache_t),	0,
/* c_magic, c_inuse */		SLAB_C_MAGIC, 0,
/* firstp, lastp */		kmem_slab_end(&cache_cache), kmem_slab_end(&cache_cache),
/* spinlock */			SPIN_LOCK_UNLOCKED,
/* growing */			0,
/* dflags */			0,
/* org_size, gfp */		0, 0,
/* ctor, dtor, align */		NULL, NULL, L1_CACHE_BYTES,
/* colour, colour_next */	0, 0,
/* failures */			0,
/* name */			"kmem_cache",
/* nextp */			&cache_cache,
/* index */			NULL,
};

/* Guard access to the cache-chain.  Set to MUTEX by kmem_cache_init(). */
static struct semaphore	cache_chain_sem;

/* Place maintainer for reaping.  Starts at the head of the cache-chain. */
static	kmem_cache_t	*clock_searchp = &cache_cache;

/* Internal slab mgmt cache, for when slab mgmt is off-slab.
 * Created by kmem_cache_sizes_init(); NULL until then.
 */
static kmem_cache_t	*cache_slabp = NULL;

/* Max number of objs-per-slab for caches which use bufctl's.
 * Needed to avoid a possible looping condition in kmem_cache_grow().
 * Raised by kmem_cache_sizes_init() as the general-caches are created.
 */
static unsigned long bufctl_limit = 0;

/* Initialisation - setup the `cache' cache.
 * Verifies the struct layouts that kmem_slab_end() depends on, initialises
 * the cache-chain semaphore, and computes the obj size, objs-per-slab and
 * colouring range for cache_cache.
 * 'start' is returned unchanged; 'end' is not used.
 */
__initfunc(long kmem_cache_init(long start, long end))
{
	size_t size, i;

	/* Hand-rolled offsetof()/offset-difference helpers, local to this func. */
#define	kmem_slab_offset(x)  ((unsigned long)&((kmem_slab_t *)0)->x)
#define kmem_slab_diff(a,b)  (kmem_slab_offset(a) - kmem_slab_offset(b))
#define	kmem_cache_offset(x) ((unsigned long)&((kmem_cache_t *)0)->x)
#define kmem_cache_diff(a,b) (kmem_cache_offset(a) - kmem_cache_offset(b))

	/* Sanity checks...
	 * A kmem_slab_t overlaid at kmem_slab_end(cachep) must find its
	 * s_magic/s_inuse/s_nextp/s_prevp on top of the cache's
	 * c_magic/c_inuse/c_firstp/c_lastp.
	 */
	if (kmem_cache_diff(c_firstp, c_magic) != kmem_slab_diff(s_nextp, s_magic) ||
	    kmem_cache_diff(c_firstp, c_inuse) != kmem_slab_diff(s_nextp, s_inuse) ||
	    ((kmem_cache_offset(c_lastp) -
	      ((unsigned long) kmem_slab_end((kmem_cache_t*)NULL))) !=
	     kmem_slab_offset(s_prevp)) ||
	    kmem_cache_diff(c_lastp, c_firstp) != kmem_slab_diff(s_prevp, s_nextp)) {
		/* Offsets to the magic are incorrect, either the structures have
		 * been incorrectly changed, or adjustments are needed for your
		 * architecture.
		 */
		panic("kmem_cache_init(): Offsets are wrong - I've been messed with!");
		/* NOTREACHED */
	}
#undef	kmem_cache_offset
#undef	kmem_cache_diff
#undef	kmem_slab_offset
#undef	kmem_slab_diff

	cache_chain_sem = MUTEX;

	/* Round (obj + on-slab bufctl) up to a whole number of L1 lines;
	 * c_offset is stored without the bufctl.
	 */
	size = cache_cache.c_offset + sizeof(kmem_bufctl_t);
	size += (L1_CACHE_BYTES-1);
	size &= ~(L1_CACHE_BYTES-1);
	cache_cache.c_offset = size-sizeof(kmem_bufctl_t);
	
	/* Space left for objs once the on-slab mgmt struct is taken out.
	 * c_gfporder is 0 in the static initialiser, ie. one page per slab.
	 */
	i = (PAGE_SIZE<<cache_cache.c_gfporder)-slab_align_size;
	cache_cache.c_num = i / size;	/* num of objs per slab */

	/* Cache colouring.  The range is the leftover space in L1 lines. */
	cache_cache.c_colour = (i-(cache_cache.c_num*size))/L1_CACHE_BYTES;
	cache_cache.c_colour_next = cache_cache.c_colour;

	return start;
}

/* Initialisation - setup remaining internal and general caches.
 * Called after the gfp() functions have been enabled, and before smp_init().
 */
__initfunc(void kmem_cache_sizes_init(void))
{
	unsigned int	found = 0;

	cache_slabp = kmem_cache_create("slab_cache", sizeof(kmem_slab_t),
					0, SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (cache_slabp) {
		char **names = cache_sizes_name;
		cache_sizes_t *sizes = cache_sizes;
		do {
			/* For performance, all the general-caches are L1 aligned.
			 * This should be particularly beneficial on SMP boxes, as it
			 * elimantes "false sharing".
			 * Note for systems short on memory removing the alignment will
			 * allow tighter packing of the smaller caches. */
			if (!(sizes->cs_cachep =
			      kmem_cache_create(*names++, sizes->cs_size,
						0, SLAB_HWCACHE_ALIGN, NULL, NULL)))
				goto panic_time;
			if (!found) {
				/* Inc off-slab bufctl limit until the ceiling is hit. */
				if (SLAB_BUFCTL(sizes->cs_cachep->c_flags))
					found++;
				else
					bufctl_limit =
						(sizes->cs_size/sizeof(kmem_bufctl_t));
			}
			sizes->cs_cachep->c_flags |= SLAB_CFLGS_GENERAL;
			sizes++;
		} while (sizes->cs_size);
#if	SLAB_SELFTEST
		kmem_self_test();
#endif	/* SLAB_SELFTEST */
		return;
	}
panic_time:
	panic("kmem_cache_sizes_init: Error creating caches");
	/* NOTREACHED */
}

/* Interface to system's page allocator.  No need to hold the cache-lock.
 * Returns the address from __get_free_pages() (NULL on failure).
 * On return *dma is non-zero if SLAB_DMA was requested, or if every page
 * obtained turned out to be DMAable; otherwise it is zero.
 */
static inline void *
kmem_getpages(kmem_cache_t *cachep, unsigned long flags, unsigned int *dma)
{
	void		*addr;
	struct page	*pg;
	unsigned int	 left;

	*dma = flags & SLAB_DMA;
	addr = (void*) __get_free_pages(flags, cachep->c_gfporder);

	/* Nothing to probe if DMA was asked for, or the allocation failed. */
	if (*dma || !addr)
		return addr;

	/* We own the pages now, so nobody else may legally alter their
	 * 'struct page's.  vm_scan() might still peek at them to test for a
	 * named-page or buffer-page, but not at anything of interest here.
	 */
	pg = mem_map + MAP_NR(addr);
	for (left = 1<<cachep->c_gfporder; left; left--, pg++) {
		if (!PageDMA(pg)) {
			/* One non-DMA page spoils the whole slab. */
			*dma = 0;
			return addr;
		}
	}

	/* Every page is DMAable - report with an all-ones value. */
	*dma = ~0U;
	return addr;
}

/* Interface to system's page release.
 * Clears the slab type bit on each of the 2^c_gfporder pages starting at
 * addr, then returns them with free_pages().
 */
static inline void
kmem_freepages(kmem_cache_t *cachep, void *addr)
{
	struct page	*pg = &mem_map[MAP_NR(addr)];
	struct page	*stop = pg + (1<<cachep->c_gfporder);

	/* free_pages() leaves the type bit alone, so it is cleared here.
	 * The pages are already unlinked from their cache-slab; vm_scan()
	 * may still look at the 'struct page's, which shouldn't matter.
	 */
	for (; pg != stop; pg++)
		PageClearSlab(pg);

	free_pages((unsigned long)addr, cachep->c_gfporder);
}

#if	SLAB_DEBUG_SUPPORT
/* Fill an obj with the poison byte and mark its last byte with the
 * end-of-poison value.
 */
static inline void
kmem_poison_obj(kmem_cache_t *cachep, void *addr)
{
	unsigned char	*bytes = addr;
	size_t		 len = cachep->c_org_size;

	memset(bytes, SLAB_POISON_BYTE, len);
	bytes[len-1] = SLAB_POISON_END;
}

/* Check an obj's poisoning.  Returns 0 when the first end-of-poison byte
 * found is the obj's very last byte, and 1 otherwise.
 */
static inline int
kmem_check_poison_obj(kmem_cache_t *cachep, void *addr)
{
	size_t	len = cachep->c_org_size;
	void	*expected = (unsigned char *)addr + len - 1;
	void	*found = memchr(addr, SLAB_POISON_END, len);

	return (found != expected) ? 1 : 0;
}
#endif	/* SLAB_DEBUG_SUPPORT */

/* Three slab chain funcs - all called with ints disabled and the appropriate
 * cache-lock held.
 */
/* Take a slab out of its chain by joining its two neighbours together.
 * The slab's own link ptrs are left untouched.
 */
static inline void
kmem_slab_unlink(kmem_slab_t *slabp)
{
	kmem_slab_t	*before = slabp->s_prevp,
			*after = slabp->s_nextp;

	before->s_nextp = after;
	after->s_prevp = before;
}

/* Append a slab to the tail of a cache's chain. */
static inline void 
kmem_slab_link_end(kmem_cache_t *cachep, kmem_slab_t *slabp)
{
	kmem_slab_t	*tail = cachep->c_lastp;

	/* The new slab sits between the old tail and the end marker. */
	slabp->s_nextp = kmem_slab_end(cachep);
	slabp->s_prevp = tail;

	cachep->c_lastp = slabp;
	tail->s_nextp = slabp;
}

/* Insert a slab immediately in front of the cache's c_freep slab.
 * c_freep itself is not modified.
 */
static inline void
kmem_slab_link_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
{
	kmem_slab_t	*succ = cachep->c_freep;
	kmem_slab_t	*pred = succ->s_prevp;

	slabp->s_nextp = succ;
	slabp->s_prevp = pred;

	succ->s_prevp = slabp;
	/* Reached via the slab's own back ptr, which was set just above. */
	slabp->s_prevp->s_nextp = slabp;
}

/* Destroy all the objs in a slab, and release the mem back to the system.
 * Before calling the slab must have been unlinked from the cache.
 * The cache-lock is not held/needed.
 *
 * cachep - cache the slab belongs to (supplies dtor, flags, obj geometry).
 * slabp  - the slab to destroy; must not be referenced after this returns.
 *
 * Every obj is passed to the cache's de-constructor (if there is one).
 * With debug support the red-zones and poisoning are verified on the way,
 * and any damage is reported with printk().
 */
static void
kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp)
{
	/* The obj walk is only needed if there is a dtor to call, or debug
	 * state to verify.  Note the flags are combined with a bitwise OR;
	 * a logical OR would collapse the mask to the constant 1.
	 */
	if (cachep->c_dtor
#if	SLAB_DEBUG_SUPPORT
		|| cachep->c_flags & (SLAB_POISON|SLAB_RED_ZONE)
#endif	/*SLAB_DEBUG_SUPPORT*/
	) {
		/* Doesn't use the bufctl ptrs to find objs. */
		unsigned long num = cachep->c_num;
		void *objp = slabp->s_mem;
		do {
#if	SLAB_DEBUG_SUPPORT
			if (cachep->c_flags & SLAB_RED_ZONE) {
				if (*((unsigned long*)(objp)) != SLAB_RED_MAGIC1)
					printk(KERN_ERR "kmem_slab_destroy: "
					       "Bad front redzone - %s\n",
					       cachep->c_name);
				/* Step over the front red-zone word to the obj proper. */
				objp += BYTES_PER_WORD;
				if (*((unsigned long*)(objp+cachep->c_org_size)) !=
				    SLAB_RED_MAGIC1)
					printk(KERN_ERR "kmem_slab_destroy: "
					       "Bad rear redzone - %s\n",
					       cachep->c_name);
			}
			if (cachep->c_dtor)
#endif	/*SLAB_DEBUG_SUPPORT*/
				(cachep->c_dtor)(objp, cachep, 0);
#if	SLAB_DEBUG_SUPPORT
			else if (cachep->c_flags & SLAB_POISON) {
				if (kmem_check_poison_obj(cachep, objp))
					printk(KERN_ERR "kmem_slab_destroy: "
					       "Bad poison - %s\n", cachep->c_name);
			}
			/* Undo the red-zone step before moving to the next obj. */
			if (cachep->c_flags & SLAB_RED_ZONE)
				objp -= BYTES_PER_WORD;
#endif	/* SLAB_DEBUG_SUPPORT */
			objp += cachep->c_offset;
			/* c_offset excludes an on-slab bufctl - skip it too. */
			if (!slabp->s_index)
				objp += sizeof(kmem_bufctl_t);
		} while (--num);
	}

	slabp->s_magic = SLAB_MAGIC_DESTROYED;

	/* Release the off-slab bufctl index _before_ the pages: when the
	 * slab mgmt is on-slab, slabp lives inside those pages and must not
	 * be read once they have gone back to the system.
	 */
	if (slabp->s_index)
		kmem_cache_free(cachep->c_index_cachep, slabp->s_index);
	kmem_freepages(cachep, slabp->s_mem-slabp->s_offset);
	/* An off-slab mgmt struct is a separate obj and is still valid here. */
	if (SLAB_OFF_SLAB(cachep->c_flags))
		kmem_cache_free(cache_slabp, slabp);
}

/* Work out how a slab of 2^gfporder pages divides into objs of 'size' bytes.
 * On-slab mgmt (when not SLAB_OFF_SLAB) is taken out of the slab first.
 *   *num       - set to the number of objs that fit.
 *   *left_over - set to the bytes remaining after those objs.
 * Returns the total wastage: left over bytes + on-slab mgmt size +
 * 'extra' bytes for each obj.
 */
static inline size_t
kmem_cache_cal_waste(unsigned long gfporder, size_t size, size_t extra,
		     unsigned long flags, size_t *left_over, unsigned long *num)
{
	unsigned long	mgmt = SLAB_OFF_SLAB(flags) ? 0 : slab_align_size;
	size_t		avail = PAGE_SIZE<<gfporder;
	size_t		spare;

	avail -= mgmt;
	*num = avail / size;
	spare = avail - (*num * size);
	*left_over = spare;

	return (spare + mgmt + (extra * *num));
}

/* Create a cache:
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within a int, but can be interrupted.
 * NOTE: The 'name' is assumed to be memory that is _not_  going to disappear.
 *
 *	name	- identifies the cache; only the ptr is stored, not a copy.
 *	size	- obj size in bytes.  Rounded up to a multiple of the word size.
 *	offset	- requested colour offset; rounded up to the obj alignment and
 *		  possibly overridden, depending on the space left in a slab.
 *	flags	- SLAB_* flags; bits outside SLAB_C_MASK are reported and dropped.
 *	ctor	- obj constructor, may be NULL.
 *	dtor	- obj de-constructor, may be NULL.
 *
 * With SLAB_MGMT_CHECKS, NULL is returned for a NULL name, a call from
 * interrupt context, an over-large size, or a dtor given without a ctor.
 * NULL is also returned if the cache description obj cannot be allocated.
 * A duplicate name is only reported - the new cache is still created and
 * linked into the cache-chain.
 */
kmem_cache_t *
kmem_cache_create(const char *name, size_t size, size_t offset,
	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
	void (*dtor)(void*, kmem_cache_t *, unsigned long))
{
	const char *func_nm= KERN_ERR "kmem_create: ";
	kmem_cache_t	*searchp;
	kmem_cache_t	*cachep=NULL;	/* stays NULL on every early exit via 'opps' */
	size_t		extra;
	size_t		left_over;
	size_t		align;

	/* Sanity checks... */
#if	SLAB_MGMT_CHECKS
	if (!name) {
		printk("%sNULL ptr\n", func_nm);
		goto opps;
	}
	if (in_interrupt()) {
		printk("%sCalled during int - %s\n", func_nm, name);
		goto opps;
	}

	/* Too small is not fatal - the size is simply raised. */
	if (size < BYTES_PER_WORD) {
		printk("%sSize too small %d - %s\n", func_nm, (int) size, name);
		size = BYTES_PER_WORD;
	}

	if (size > ((1<<SLAB_OBJ_MAX_ORDER)*PAGE_SIZE)) {
		printk("%sSize too large %d - %s\n", func_nm, (int) size, name);
		goto opps;
	}

	if (dtor && !ctor) {
		/* Decon, but no con - doesn't make sense */
		printk("%sDecon but no con - %s\n", func_nm, name);
		goto opps;
	}

	/* NOTE(review): offset is a size_t, so 'offset < 0' can never be true;
	 * only the upper bound is effective.  A bad offset is reset, not fatal.
	 */
	if (offset < 0 || offset > size) {
		printk("%sOffset weired %d - %s\n", func_nm, (int) offset, name);
		offset = 0;
	}

#if	SLAB_DEBUG_SUPPORT
	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
		/* No constructor, but inital state check requested */
		printk("%sNo con, but init state check requested - %s\n", func_nm, name);
		flags &= ~SLAB_DEBUG_INITIAL;
	}

	if ((flags & SLAB_POISON) && ctor) {
		/* request for poisoning, but we can't do that with a constructor */
		printk("%sPoisoning requested, but con given - %s\n", func_nm, name);
		flags &= ~SLAB_POISON;
	}
#if	0
	if ((flags & SLAB_HIGH_PACK) && ctor) {
		printk("%sHigh pack requested, but con given - %s\n", func_nm, name);
		flags &= ~SLAB_HIGH_PACK;
	}
	if ((flags & SLAB_HIGH_PACK) && (flags & (SLAB_POISON|SLAB_RED_ZONE))) {
		printk("%sHigh pack requested, but with poisoning/red-zoning - %s\n",
		       func_nm, name);
		flags &= ~SLAB_HIGH_PACK;
	}
#endif
#endif	/* SLAB_DEBUG_SUPPORT */
#endif	/* SLAB_MGMT_CHECKS */

	/* Always checks flags, a caller might be expecting debug
	 * support which isn't available.
	 */
	if (flags & ~SLAB_C_MASK) {
		printk("%sIllgl flg %lX - %s\n", func_nm, flags, name);
		flags &= SLAB_C_MASK;
	}

	/* Get cache's description obj. */
	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
	if (!cachep)
		goto opps;
	/* Every member starts at zero - notably c_gfporder, c_num and c_colour. */
	memset(cachep, 0, sizeof(kmem_cache_t));

	/* Check that size is in terms of words.  This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	if (size & (BYTES_PER_WORD-1)) {
		size += (BYTES_PER_WORD-1);
		size &= ~(BYTES_PER_WORD-1);
		printk("%sForcing size word alignment - %s\n", func_nm, name);
	}

	/* Remember the word-aligned size before any padding is added below. */
	cachep->c_org_size = size;
#if	SLAB_DEBUG_SUPPORT
	if (flags & SLAB_RED_ZONE) {
		/* There is no point trying to honour cache alignment when redzoning. */
		flags &= ~SLAB_HWCACHE_ALIGN;
		size += 2*BYTES_PER_WORD;		/* words for redzone */
	}
#endif	/* SLAB_DEBUG_SUPPORT */

	align = BYTES_PER_WORD;
	if (flags & SLAB_HWCACHE_ALIGN)
		align = L1_CACHE_BYTES;

	/* Determine if the slab mgmt and/or bufclts are 'on' or 'off' slab. */
	extra = sizeof(kmem_bufctl_t);
	if (size < (PAGE_SIZE>>3)) {
		/* Size is small(ish).  Use packing where bufctl size per
		 * obj is low, and slab mngmnt is on-slab.
		 */
#if	0
		if ((flags & SLAB_HIGH_PACK)) {
			/* Special high packing for small objects
			 * (mainly for vm_mapping structs, but
			 * others can use it).
			 */
			if (size == (L1_CACHE_BYTES/4) || size == (L1_CACHE_BYTES/2) ||
			    size == L1_CACHE_BYTES) {
				/* The bufctl is stored with the object. */
				extra = 0;
			} else
				flags &= ~SLAB_HIGH_PACK;
		}
#endif
	} else {
		/* Size is large, assume best to place the slab mngmnt obj
		 * off-slab (should allow better packing of objs).
		 */
		flags |= SLAB_CFLGS_OFF_SLAB;
		if (!(size & ~PAGE_MASK) || size == (PAGE_SIZE/2)
		    || size == (PAGE_SIZE/4) || size == (PAGE_SIZE/8)) {
			/* To avoid waste the bufctls are off-slab... */
			flags |= SLAB_CFLGS_BUFCTL;
			extra = 0;
		} /* else slab mngmnt is off-slab, but freelist ptrs are on. */
	}
	/* From here on, size is the per-obj stride (obj + any on-slab bufctl). */
	size += extra;

	if (flags & SLAB_HWCACHE_ALIGN) {
		/* Need to adjust size so that objs are cache aligned. */
		if (size > (L1_CACHE_BYTES/2)) {
			size_t words = size % L1_CACHE_BYTES;
			if (words)
				size += (L1_CACHE_BYTES-words);
		} else {
			/* Small obj size, can get at least two per cache line. */
			int num_per_line = L1_CACHE_BYTES/size;
			left_over = L1_CACHE_BYTES - (num_per_line*size);
			if (left_over) {
				/* Need to adjust size so objs cache align. */
				if (left_over%num_per_line) {
					/* Odd num of objs per line - fixup. */
					num_per_line--;
					left_over += size;
				}
				size += (left_over/num_per_line);
			}
		}
	} else if (!(size%L1_CACHE_BYTES)) {
		/* Size happens to cache align... */
		flags |= SLAB_HWCACHE_ALIGN;
		align = L1_CACHE_BYTES;
	}

	/* Cal size (in pages) of slabs, and the num of objs per slab.
	 * This could be made much more intelligent.  For now, try to avoid
	 * using high page-orders for slabs.  When the gfp() funcs are more
	 * friendly towards high-order requests, this should be changed.
	 * The search starts at order 0 (from the memset above) and moves up
	 * one order per iteration until one of the break conditions is met.
	 */
	do {
		size_t wastage;
		unsigned int break_flag = 0;
cal_wastage:
		wastage = kmem_cache_cal_waste(cachep->c_gfporder, size, extra,
					       flags, &left_over, &cachep->c_num);
		/* Not even one obj fits at this order - try the next. */
		if (!cachep->c_num)
			goto next;
		/* Set below, after stepping back an order: accept the recalculation. */
		if (break_flag)
			break;
		if (SLAB_BUFCTL(flags) && cachep->c_num > bufctl_limit) {
			/* Oops, this num of objs will cause problems. */
			cachep->c_gfporder--;
			break_flag++;
			goto cal_wastage;
		}
		if (cachep->c_gfporder == SLAB_MAX_GFP_ORDER)
			break;

		/* Large num of objs is good, but v. large slabs are currently
		 * bad for the gfp()s.
		 */
		if (cachep->c_num <= SLAB_MIN_OBJS_PER_SLAB) {
			if (cachep->c_gfporder < SLAB_BREAK_GFP_ORDER)
				goto next;
		}

		/* Stop caches with small objs having a large num of pages. */
		if (left_over <= slab_align_size)
			break;
		if ((wastage*8) <= (PAGE_SIZE<<cachep->c_gfporder))
			break;	/* Acceptable internal fragmentation. */
next:
		cachep->c_gfporder++;
	} while (1);

	/* If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab.  This is at the expense of any extra colouring.
	 */
	if ((flags & SLAB_CFLGS_OFF_SLAB) && !SLAB_BUFCTL(flags) &&
	    left_over >= slab_align_size) {
		flags &= ~SLAB_CFLGS_OFF_SLAB;
		left_over -= slab_align_size;
	}

	/* Offset must be a factor of the alignment. */
	offset += (align-1);
	offset &= ~(align-1);

	/* Mess around with the offset alignment. */
	if (!left_over) {
		offset = 0;
	} else if (left_over < offset) {
		offset = align;
		if (flags & SLAB_HWCACHE_ALIGN) {
			if (left_over < offset)
				offset = 0;
		} else {
			/* Offset is BYTES_PER_WORD, and left_over is at
			 * least BYTES_PER_WORD.
			 */
			if (left_over >= (BYTES_PER_WORD*2)) {
				offset >>= 1;
				if (left_over >= (BYTES_PER_WORD*4))
					offset >>= 1;
			}
		}
	} else if (!offset) {
		/* No offset requested, but space enough - give one. */
		offset = left_over/align;
		if (flags & SLAB_HWCACHE_ALIGN) {
			if (offset >= 8) {
				/* A large number of colours - use a larger alignment. */
				align <<= 1;
			}
		} else {
			if (offset >= 10) {
				align <<= 1;
				if (offset >= 16)
					align <<= 1;
			}
		}
		offset = align;
	}

#if	0
printk("%s: Left_over:%d Align:%d Size:%d\n", name, left_over, offset, size);
#endif

	/* A zero offset means no colouring: c_colour stays 0 from the memset. */
	if ((cachep->c_align = (unsigned long) offset))
		cachep->c_colour = (left_over/offset);
	cachep->c_colour_next = cachep->c_colour;

	/* If the bufctl's are on-slab, c_offset does not include the size of bufctl. */
	if (!SLAB_BUFCTL(flags))
		size -= sizeof(kmem_bufctl_t);
	else
		cachep->c_index_cachep =
			kmem_find_general_cachep(cachep->c_num*sizeof(kmem_bufctl_t));
	cachep->c_offset = (unsigned long) size;
	/* An empty slab chain: all three ptrs are the end marker. */
	cachep->c_freep = kmem_slab_end(cachep);
	cachep->c_firstp = kmem_slab_end(cachep);
	cachep->c_lastp = kmem_slab_end(cachep);
	cachep->c_flags = flags;
	cachep->c_ctor = ctor;
	cachep->c_dtor = dtor;
	cachep->c_magic = SLAB_C_MAGIC;
	cachep->c_name = name;		/* Simply point to the name. */
	spin_lock_init(&cachep->c_spinlock);

	/* Need the semaphore to access the chain. */
	down(&cache_chain_sem);
	searchp = &cache_cache;
	do {
		/* The name field is constant - no lock needed. */
		if (!strcmp(searchp->c_name, name)) {
			/* Only a warning - the cache is still linked in below. */
			printk("%sDup name - %s\n", func_nm, name);
			break;
		}
		searchp = searchp->c_nextp;
	} while (searchp != &cache_cache);

	/* There is no reason to lock our new cache before we
	 * link it in - no one knows about it yet...
	 * It goes in directly after cache_cache in the circular chain.
	 */
	cachep->c_nextp = cache_cache.c_nextp;
	cache_cache.c_nextp = cachep;
	up(&cache_chain_sem);
opps:
	return cachep;
}

/* Shrink a cache.  Releases as many slabs as possible for a cache.
 * It is expected this function will be called by a module when it is
 * unloaded.  The cache is _not_ removed, this creates too many problems and
 * the cache-structure does not take up much room.  A module should keep its
 * cache pointer(s) in unloaded memory, so when reloaded it knows the cache
 * is available.
 *
 * Takes the cache-chain semaphore, so it must not be called from an
 * interrupt (this is checked).
 *
 * Returns:
 *	0 - every slab was released (the cache is now empty),
 *	1 - slabs remain (objs still active, or the cache started growing),
 *	2 - bad call: NULL ptr, called during an interrupt, or the cache
 *	    was not found on the cache-chain.
 */
int
kmem_cache_shrink(kmem_cache_t *cachep)
{
	kmem_cache_t	*searchp;
	kmem_slab_t	*slabp;
	int	ret;

	if (!cachep) {
		printk(KERN_ERR "kmem_shrink: NULL ptr\n");
		return 2;
	}
	if (in_interrupt()) {
		printk(KERN_ERR "kmem_shrink: Called during int - %s\n", cachep->c_name);
		return 2;
	}

	/* Find the cache in the chain of caches. */
	down(&cache_chain_sem);		/* Semaphore is needed. */
	searchp = &cache_cache;
	for (;searchp->c_nextp != &cache_cache; searchp = searchp->c_nextp) {
		if (searchp->c_nextp != cachep)
			continue;

		/* Accessing clock_searchp is safe - we hold the mutex. */
		if (cachep == clock_searchp)
			clock_searchp = cachep->c_nextp;
		goto found;
	}
	up(&cache_chain_sem);
	printk(KERN_ERR "kmem_shrink: Invalid cache addr %p\n", cachep);
	return 2;
found:
	/* Release the semaphore before getting the cache-lock.  This could
	 * mean multiple engines are shrinking the cache, but so what...
	 */
	up(&cache_chain_sem);
	spin_lock_irq(&cachep->c_spinlock);

	/* If the cache is growing, stop shrinking. */
	while (!cachep->c_growing) {
		slabp = cachep->c_lastp;
		if (slabp->s_inuse || slabp == kmem_slab_end(cachep))
			break;
		/* slabp is the last slab on the chain.  If it is also the
		 * slab allocations currently come from (c_freep), nothing
		 * follows it, so point c_freep at the end-marker.  Without
		 * this, c_freep would be left referencing a destroyed slab.
		 */
		if (cachep->c_freep == slabp)
			cachep->c_freep = kmem_slab_end(cachep);
		kmem_slab_unlink(slabp);
		/* The slab is off the chain; destroy it without the lock. */
		spin_unlock_irq(&cachep->c_spinlock);
		kmem_slab_destroy(cachep, slabp);
		spin_lock_irq(&cachep->c_spinlock);
	}
	ret = 1;
	if (cachep->c_lastp == kmem_slab_end(cachep))
		ret--;		/* Cache is empty. */
	spin_unlock_irq(&cachep->c_spinlock);
	return ret;
}

/* Obtain and pre-initialise the management structure for a new slab.
 *
 * For caches with off-slab management the structure is allocated from
 * cache_slabp (using local_flags); otherwise it is carved out of the slab
 * memory itself, after the last obj, L1-cache aligned.
 * objp is the (coloured) address of the first obj in the slab.
 * Returns NULL if no management structure could be obtained.
 */
static inline kmem_slab_t *
kmem_cache_slabmgmt(kmem_cache_t *cachep, void *objp, int local_flags)
{
	kmem_slab_t	*mgmt;

	if (!SLAB_OFF_SLAB(cachep->c_flags)) {
		/* On-slab: step over all the objs (and, when the bufctls
		 * live next to the objs, over those too) to find the tail.
		 */
		void	*tail = objp + (cachep->c_num * cachep->c_offset);

		if (!SLAB_BUFCTL(cachep->c_flags))
			tail += (cachep->c_num * sizeof(kmem_bufctl_t));
		mgmt = (kmem_slab_t *) L1_CACHE_ALIGN((unsigned long)tail);
	} else {
		/* Off-slab: comes from the internal slab-mgmt cache. */
		mgmt = kmem_cache_alloc(cache_slabp, local_flags);
	}

	if (!mgmt)
		return NULL;

	/* Start with a clean state. */
	mgmt->s_inuse = 0;
	mgmt->s_dma = 0;
	mgmt->s_index = NULL;
	return mgmt;
}

/* Initialise every obj in a freshly allocated slab and build the slab's
 * freelist (slabp->s_freep).
 *
 *	cachep     - cache the slab belongs to.
 *	slabp      - management structure of the new slab.
 *	objp       - address of the first obj in the slab.
 *	ctor_flags - flags handed to the cache's constructor.
 *
 * The loop body runs c_num times (num counts down from c_num-1 to 0).
 * Called from kmem_cache_grow() without the cache-lock held.
 */
static inline void
kmem_cache_init_objs(kmem_cache_t * cachep, kmem_slab_t * slabp, void *objp,
				unsigned long ctor_flags)
{
	/* bufpp always addresses the link that must receive the next bufctl. */
	kmem_bufctl_t	**bufpp = &slabp->s_freep;
	unsigned long	num = cachep->c_num-1;

	do {
#if	SLAB_DEBUG_SUPPORT
		if (cachep->c_flags & SLAB_RED_ZONE) {
			/* Write the front and rear red-zone words, and step
			 * objp over the front word for the constructor.
			 */
			*((unsigned long*)(objp)) = SLAB_RED_MAGIC1;
			objp += BYTES_PER_WORD;
			*((unsigned long*)(objp+cachep->c_org_size)) = SLAB_RED_MAGIC1;
		}
#endif	/* SLAB_DEBUG_SUPPORT */

		/* Constructors are not allowed to allocate memory from the same cache
		 * which they are a constructor for.  Otherwise, deadlock.
		 * They must also be threaded.
		 */
		if (cachep->c_ctor)
			cachep->c_ctor(objp, cachep, ctor_flags);
#if	SLAB_DEBUG_SUPPORT
		else if (cachep->c_flags & SLAB_POISON) {
			/* need to poison the objs */
			kmem_poison_obj(cachep, objp);
		}

		if (cachep->c_flags & SLAB_RED_ZONE) {
			/* Verify the constructor left both red-zone words
			 * intact; repair and report if it did not.
			 */
			if (*((unsigned long*)(objp+cachep->c_org_size)) !=
			    SLAB_RED_MAGIC1) {
				*((unsigned long*)(objp+cachep->c_org_size)) =
					SLAB_RED_MAGIC1;
				printk(KERN_ERR "kmem_init_obj: Bad rear redzone "
				       "after constructor - %s\n", cachep->c_name);
			}
			/* Step back onto the front red-zone word. */
			objp -= BYTES_PER_WORD;
			if (*((unsigned long*)(objp)) != SLAB_RED_MAGIC1) {
				*((unsigned long*)(objp)) = SLAB_RED_MAGIC1;
				printk(KERN_ERR "kmem_init_obj: Bad front redzone "
				       "after constructor - %s\n", cachep->c_name);
			}
		}
#endif	/* SLAB_DEBUG_SUPPORT */

		objp += cachep->c_offset;
		if (!slabp->s_index) {
			/* No bufctl index array: the bufctl sits c_offset
			 * bytes after the start of its obj; the next obj
			 * follows the bufctl.
			 */
			*bufpp = objp;
			objp += sizeof(kmem_bufctl_t);
		} else
			/* Bufctl comes from the slab's index array; they are
			 * linked from s_index[c_num-1] down to s_index[0].
			 */
			*bufpp = &slabp->s_index[num];
		bufpp = &(*bufpp)->buf_nextp;
	} while (num--);

	/* Terminate the freelist. */
	*bufpp = NULL;
}

/* Grow (by 1) the number of slabs within a cache.  This is called by
 * kmem_cache_alloc() when there are no active objs left in a cache.
 *
 *	cachep - cache to grow.
 *	flags  - allocation flags from the caller; anything outside
 *		 SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW is reported and masked off.
 *
 * Returns 1 if a new slab was linked into the cache, 0 otherwise
 * (SLAB_NO_GROW was given, or one of the allocations failed).
 *
 * The cache-lock is dropped while memory is allocated and the objs are
 * initialised; c_growing is raised for that period.
 */
static int
kmem_cache_grow(kmem_cache_t * cachep, int flags)
{
	kmem_slab_t	*slabp;
	struct page	*page;
	void		*objp;
	size_t		 offset;
	unsigned int	 dma, local_flags;
	unsigned long	 ctor_flags;
	unsigned long	 save_flags;

	/* Be lazy and only check for valid flags here,
	 * keeping it out of the critical path in kmem_cache_alloc().
	 */
	if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) {
		printk(KERN_WARNING "kmem_grow: Illegal flgs %X (correcting) - %s\n",
		       flags, cachep->c_name);
		flags &= (SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW);
	}

	if (flags & SLAB_NO_GROW)
		return 0;

	/* The test for missing atomic flag is performed here, rather than
	 * the more obvious place, simply to reduce the critical path length
	 * in kmem_cache_alloc().  If a caller is slightly mis-behaving they
	 * will eventually be caught here (where it matters).
	 */
	if (in_interrupt() && (flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC) {
		printk(KERN_ERR "kmem_grow: Called nonatomically from int - %s\n",
		       cachep->c_name);
		flags &= ~SLAB_LEVEL_MASK;
		flags |= SLAB_ATOMIC;
	}
	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
	local_flags = (flags & SLAB_LEVEL_MASK);
	if (local_flags == SLAB_ATOMIC) {
		/* Not allowed to sleep.  Need to tell a constructor about
		 * this - it might need to know...
		 */
		ctor_flags |= SLAB_CTOR_ATOMIC;
	}

	/* About to mess with non-constant members - lock. */
	spin_lock_irqsave(&cachep->c_spinlock, save_flags);

	/* Get colour for the slab, and cal the next value.
	 * c_colour_next counts down; once it reaches zero it is reset
	 * to c_colour.
	 */
	if (!(offset = cachep->c_colour_next--))
		cachep->c_colour_next = cachep->c_colour;
	offset *= cachep->c_align;
	cachep->c_dflags = SLAB_CFLGS_GROWN;

	cachep->c_growing++;
re_try:
	/* Also reached (with the lock held) from the failure path below. */
	spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);

	/* A series of memory allocations for a new slab.
	 * Neither the cache-chain semaphore, or cache-lock, are
	 * held, but the incrementing c_growing prevents this
	 * this cache from being reaped or shrunk.
	 * Note: The cache could be selected in for reaping in
	 * kmem_cache_reap(), but when the final test is made the
	 * growing value will be seen.
	 */

	/* Get mem for the objs. */
	if (!(objp = kmem_getpages(cachep, flags, &dma)))
		goto failed;

	/* Get slab mgmt. */
	if (!(slabp = kmem_cache_slabmgmt(cachep, objp+offset, local_flags)))
		goto opps1;
	if (dma)
		slabp->s_dma = 1;
	if (SLAB_BUFCTL(cachep->c_flags)) {
		/* Bufctls are kept in a separately allocated index array. */
		slabp->s_index = kmem_cache_alloc(cachep->c_index_cachep, local_flags);
		if (!slabp->s_index)
			goto opps2;
	}

	/* Nasty!!!!!!  I hope this is OK.
	 * 'dma' is re-used here as a count of the pages in the slab; each
	 * page is tagged with its cache and slab.
	 */
	dma = 1 << cachep->c_gfporder;
	page = &mem_map[MAP_NR(objp)];
	do {
		SLAB_SET_PAGE_CACHE(page, cachep);
		SLAB_SET_PAGE_SLAB(page, slabp);
		PageSetSlab(page);
		page++;
	} while (--dma);

	slabp->s_offset = offset;	/* It will fit... */
	objp += offset;		/* Address of first object. */
	slabp->s_mem = objp;

	/* For on-slab bufctls, c_offset is the distance between the start of
	 * an obj and its related bufctl.  For off-slab bufctls, c_offset is
	 * the distance between objs in the slab.
	 */
	kmem_cache_init_objs(cachep, slabp, objp, ctor_flags);

	/* Re-take the lock; the interrupt state saved on entry (save_flags)
	 * is restored by the unlock below.
	 */
	spin_lock_irq(&cachep->c_spinlock);

	/* Make slab active. */
	slabp->s_magic = SLAB_MAGIC_ALLOC;
	kmem_slab_link_end(cachep, slabp);
	if (cachep->c_freep == kmem_slab_end(cachep))
		cachep->c_freep = slabp;
	SLAB_STATS_INC_GROWN(cachep);
	cachep->c_failures = 0;
	cachep->c_growing--;

	spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
	return 1;
opps2:
	/* Index array allocation failed: give back an off-slab mgmt struct. */
	if (SLAB_OFF_SLAB(cachep->c_flags))
		kmem_cache_free(cache_slabp, slabp);
opps1:
	/* objp has not been advanced by the colour offset at this point. */
	kmem_freepages(cachep, objp); 
failed:
	spin_lock_irq(&cachep->c_spinlock);
	if (local_flags != SLAB_ATOMIC && cachep->c_gfporder) {
		/* For large order (>0) slabs, we try again.
		 * Needed because the gfp() functions are not good at giving
		 * out contiguous pages unless pushed (but do not push too hard).
		 * The retry is only made while the cache still has no slab
		 * to allocate from.
		 */
		if (cachep->c_failures++ < 4 && cachep->c_freep == kmem_slab_end(cachep))
			goto re_try;
		cachep->c_failures = 1;	/* Memory is low, don't try as hard next time. */
	}
	cachep->c_growing--;
	spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
	return 0;
}

/* Log an allocation-side error.  When the cache is known its error
 * statistic is bumped and its name is printed; otherwise "unknown" is used.
 */
static void
kmem_report_alloc_err(const char *str, kmem_cache_t * cachep)
{
	const char	*cache_name = "unknown";

	if (cachep) {
		SLAB_STATS_INC_ERR(cachep);	/* this is atomic */
		cache_name = cachep->c_name;
	}
	printk(KERN_ERR "kmem_alloc: %s (name=%s)\n", str, cache_name);
}

/* Log a free-side error for the given obj address.  When the cache is
 * known its error statistic is bumped and its name is printed; otherwise
 * "unknown" is used.
 */
static void
kmem_report_free_err(const char *str, const void *objp, kmem_cache_t * cachep)
{
	const char	*cache_name = "unknown";

	if (cachep) {
		SLAB_STATS_INC_ERR(cachep);
		cache_name = cachep->c_name;
	}
	printk(KERN_ERR "kmem_free: %s (objp=%p, name=%s)\n",
	       str, objp, cache_name);
}

/* Look beyond the current free slab for one whose memory is DMA-able.
 * If one is found it is re-linked via kmem_slab_link_free(), made the
 * cache's c_freep, and returned.  If none is found, the end-of-chain
 * marker (kmem_slab_end(cachep)) is returned and nothing is changed.
 * Note: since the first free slab was tested (in __kmem_cache_alloc()),
 * ints must not have been enabled, nor the cache-lock released!
 */
static inline kmem_slab_t *
kmem_cache_search_dma(kmem_cache_t * cachep)
{
	kmem_slab_t	*candidate = cachep->c_freep->s_nextp;

	/* Skip every slab that is not marked as DMA memory. */
	while (candidate != kmem_slab_end(cachep) && !(candidate->s_dma))
		candidate = candidate->s_nextp;

	if (candidate != kmem_slab_end(cachep)) {
		kmem_slab_unlink(candidate);
		kmem_slab_link_free(cachep, candidate);
		cachep->c_freep = candidate;
	}
	return candidate;
}

#if	SLAB_DEBUG_SUPPORT
/* Extra checking on free: detect an obj that is already on its slab's
 * freelist.  Only done for caches whose bufctls live within the slab;
 * caches using bufctls from the internal cache are passed straight
 * through (they get a reasonable check when the address is looked up).
 * Called with the cache-lock held.
 *
 * Returns objp if the obj may be freed, or NULL if bufp was found on the
 * freelist starting at search_bufp (i.e. a double free).
 */
static void *
kmem_extra_free_checks(kmem_cache_t * cachep, kmem_bufctl_t *search_bufp,
		       kmem_bufctl_t *bufp, void * objp)
{
	if (!SLAB_BUFCTL(cachep->c_flags)) {
		kmem_bufctl_t	*cur;

		/* Walk the slab's freelist looking for this bufctl. */
		for (cur = search_bufp; cur != NULL; cur = cur->buf_nextp) {
			if (cur == bufp)
				return NULL;
		}
	}
	return objp;
}
#endif	/* SLAB_DEBUG_SUPPORT */

/* A slab has just had its last active obj released.  If the slab after it
 * still has active objs, move this slab to the end of the chain (first
 * stepping c_freep past it, should c_freep be pointing at it).
 * Called with cache lock held.
 */
static inline void
kmem_cache_full_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
{
	kmem_slab_t	*following = slabp->s_nextp;

	if (!following->s_inuse)
		return;		/* Already at the correct position. */

	if (cachep->c_freep == slabp)
		cachep->c_freep = following;
	kmem_slab_unlink(slabp);
	kmem_slab_link_end(cachep, slabp);
}

/* A slab which had no free objs has just had one released.  If the slab
 * after it is completely in use (s_inuse == c_num), re-link this slab via
 * kmem_slab_link_free().  In every case the slab becomes the cache's
 * c_freep.
 * Called with cache lock held.
 */
static inline void
kmem_cache_one_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
{
	kmem_slab_t	*following = slabp->s_nextp;

	if (following->s_inuse == cachep->c_num) {
		kmem_slab_unlink(slabp);
		kmem_slab_link_free(cachep, slabp);
	}
	cachep->c_freep = slabp;
}

/* Returns a ptr to an obj in the given cache.
 *
 *	cachep - cache to allocate from (a NULL ptr is reported and fails).
 *	flags  - allocation flags; SLAB_DMA selects a DMA-able slab, and the
 *		 flags are handed on to kmem_cache_grow() if a slab is needed.
 *
 * Returns the obj, or NULL if cachep is NULL, the cache could not be
 * grown, or the slab at c_freep fails its magic-number check.
 *
 * The cache-lock is taken here (saving the caller's interrupt state) and
 * released before returning.
 */
static inline void *
__kmem_cache_alloc(kmem_cache_t *cachep, int flags)
{
	kmem_slab_t	*slabp;
	kmem_bufctl_t	*bufp;
	void		*objp;
	unsigned long	save_flags;

	/* Sanity check. */
	if (!cachep)
		goto nul_ptr;
	spin_lock_irqsave(&cachep->c_spinlock, save_flags);
try_again:
	/* Get slab alloc is to come from. */
	slabp = cachep->c_freep;

	/* Magic is a sanity check _and_ says if we need a new slab. */
	if (slabp->s_magic != SLAB_MAGIC_ALLOC)
		goto alloc_new_slab;
	/* DMA requests are 'rare' - keep out of the critical path. */
	if (flags & SLAB_DMA)
		goto search_dma;
try_again_dma:
	SLAB_STATS_INC_ALLOCED(cachep);
	SLAB_STATS_INC_ACTIVE(cachep);
	SLAB_STATS_SET_HIGH(cachep);
	/* Take the first bufctl off the slab's freelist. */
	slabp->s_inuse++;
	bufp = slabp->s_freep;
	slabp->s_freep = bufp->buf_nextp;
	if (slabp->s_freep) {
ret_obj:
		if (!slabp->s_index) {
			/* No index array: the bufctl sits c_offset bytes after
			 * its obj.  Record the owning slab in the bufctl so
			 * the free path can find it.
			 */
			bufp->buf_slabp = slabp;
			objp = ((void*)bufp) - cachep->c_offset;
finished:
			/* The lock is not needed by the red-zone or poison ops, and the
			 * obj has been removed from the slab.  Should be safe to drop
			 * the lock here.
			 */
			spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
#if	SLAB_DEBUG_SUPPORT
			if (cachep->c_flags & SLAB_RED_ZONE)
				goto red_zone;
ret_red:
			if ((cachep->c_flags & SLAB_POISON) && kmem_check_poison_obj(cachep, objp))
				kmem_report_alloc_err("Bad poison", cachep);
#endif	/* SLAB_DEBUG_SUPPORT */
			return objp;
		}
		/* Update index ptr.  The obj address is derived from the
		 * bufctl's position in the slab's index array, and stored in
		 * the bufctl for checking when the obj is freed.
		 */
		objp = ((bufp-slabp->s_index)*cachep->c_offset) + slabp->s_mem;
		bufp->buf_objp = objp;
		goto finished;
	}
	/* That was the slab's last free obj - move c_freep to the next slab. */
	cachep->c_freep = slabp->s_nextp;
	goto ret_obj;

#if	SLAB_DEBUG_SUPPORT
red_zone:
	/* Set alloc red-zone, and check old one.  objp is stepped over the
	 * front red-zone word, so the caller gets the obj proper.
	 */
	if (xchg((unsigned long *)objp, SLAB_RED_MAGIC2) != SLAB_RED_MAGIC1)
		kmem_report_alloc_err("Bad front redzone", cachep);
	objp += BYTES_PER_WORD;
	if (xchg((unsigned long *)(objp+cachep->c_org_size), SLAB_RED_MAGIC2) != SLAB_RED_MAGIC1)
		kmem_report_alloc_err("Bad rear redzone", cachep);
	goto ret_red;
#endif	/* SLAB_DEBUG_SUPPORT */

search_dma:
	/* Use the current slab if it is DMA-able, else look for one that is.
	 * If none is found slabp is the end-marker and we fall through to
	 * grow the cache.
	 */
	if (slabp->s_dma || (slabp = kmem_cache_search_dma(cachep))!=kmem_slab_end(cachep))
		goto try_again_dma;
alloc_new_slab:
	/* Either out of slabs, or magic number corruption. */
	if (slabp == kmem_slab_end(cachep)) {
		/* Need a new slab.  Release the lock before calling kmem_cache_grow().
		 * This allows objs to be released back into the cache while growing.
		 */
		spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
		if (kmem_cache_grow(cachep, flags)) {
			/* Someone may have stolen our objs.  Doesn't matter, we'll
			 * just come back here again.
			 */
			spin_lock_irq(&cachep->c_spinlock);
			goto try_again;
		}
		/* Couldn't grow, but some objs may have been freed. */
		spin_lock_irq(&cachep->c_spinlock);
		if (cachep->c_freep != kmem_slab_end(cachep))
			goto try_again;
	} else {
		/* Very serious error - maybe panic() here? */
		kmem_report_alloc_err("Bad slab magic (corrupt)", cachep);
	}
	spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
err_exit:
	return NULL;
nul_ptr:
	kmem_report_alloc_err("NULL ptr", NULL);
	goto err_exit;
}

/* Release an obj back to its cache.  If the obj has a constructed state,
 * it should be in this state _before_ it is released.
 *
 *	cachep - cache the obj was allocated from.
 *	objp   - the obj, as returned by the allocation function.
 *
 * Nothing is returned; problems (NULL ptrs, bad address, double free,
 * damaged red-zones, bad slab magic) are reported via
 * kmem_report_free_err().  A bad slab magic additionally forces a kernel
 * fault (see the end of the function).
 */
static inline void
__kmem_cache_free(kmem_cache_t *cachep, const void *objp)
{
	kmem_slab_t	*slabp;
	kmem_bufctl_t	*bufp;
	unsigned long	save_flags;

	/* Basic sanity checks. */
	if (!cachep || !objp)
		goto null_addr;

#if	SLAB_DEBUG_SUPPORT
	/* A verify func is called without the cache-lock held. */
	if (cachep->c_flags & SLAB_DEBUG_INITIAL)
		goto init_state_check;
finished_initial:

	/* Note: the red-zone code moves objp back onto the front red-zone word. */
	if (cachep->c_flags & SLAB_RED_ZONE)
		goto red_zone;
return_red:
#endif	/* SLAB_DEBUG_SUPPORT */

	spin_lock_irqsave(&cachep->c_spinlock, save_flags);

	if (SLAB_BUFCTL(cachep->c_flags))
		goto bufctl;
	/* The bufctl sits c_offset bytes after the start of its obj. */
	bufp = (kmem_bufctl_t *)(objp+cachep->c_offset);

	/* Get slab for the object. */
#if	0
	/* _NASTY_IF/ELSE_, but avoids a 'distant' memory ref for some objects.
	 * Is this worth while? XXX
	 */
	if (cachep->c_flags & SLAB_HIGH_PACK)
		slabp = SLAB_GET_PAGE_SLAB(&mem_map[MAP_NR(bufp)]);
	else
#endif
		slabp = bufp->buf_slabp;

check_magic:
	if (slabp->s_magic != SLAB_MAGIC_ALLOC)		/* Sanity check. */
		goto bad_slab;

#if	SLAB_DEBUG_SUPPORT
	if (cachep->c_flags & SLAB_DEBUG_FREE)
		goto extra_checks;
passed_extra:
#endif	/* SLAB_DEBUG_SUPPORT */

	if (slabp->s_inuse) {		/* Sanity check. */
		SLAB_STATS_DEC_ACTIVE(cachep);
		/* Push the bufctl onto the front of the slab's freelist. */
		slabp->s_inuse--;
		bufp->buf_nextp = slabp->s_freep;
		slabp->s_freep = bufp;
		if (bufp->buf_nextp) {
			if (slabp->s_inuse) {
				/* (hopefully) The most common case. */
finished:
#if	SLAB_DEBUG_SUPPORT
				if (cachep->c_flags & SLAB_POISON) {
					/* Step over the front red-zone word
					 * before poisoning.
					 */
					if (cachep->c_flags & SLAB_RED_ZONE)
						objp += BYTES_PER_WORD;
					kmem_poison_obj(cachep, objp);
				}
#endif	/* SLAB_DEBUG_SUPPORT */
				spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
				return;
			}
			/* The slab no longer has any active objs. */
			kmem_cache_full_free(cachep, slabp);
			goto finished;
		}
		/* The freelist was empty before this free (slab was full). */
		kmem_cache_one_free(cachep, slabp);
		goto finished;
	}

	/* Don't add to freelist. */
	spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
	kmem_report_free_err("free with no active objs", objp, cachep);
	return;
bufctl:
	/* No 'extra' checks are performed for objs stored this way, finding
	 * the obj is check enough.
	 * The slab is found from the obj's page; the bufctl from the obj's
	 * position within the slab.  The bufctl must record this very obj.
	 */
	slabp = SLAB_GET_PAGE_SLAB(&mem_map[MAP_NR(objp)]);
	bufp =	&slabp->s_index[(objp - slabp->s_mem)/cachep->c_offset];
	if (bufp->buf_objp == objp)
		goto check_magic;
	spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
	kmem_report_free_err("Either bad obj addr or double free", objp, cachep);
	return;
#if	SLAB_DEBUG_SUPPORT
init_state_check:
	/* Need to call the slab's constructor so the
	 * caller can perform a verify of its state (debugging).
	 */
	cachep->c_ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
	goto finished_initial;
extra_checks:
	if (!kmem_extra_free_checks(cachep, slabp->s_freep, bufp, objp)) {
		spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
		kmem_report_free_err("Double free detected during checks", objp, cachep);
		return;
	}
	goto passed_extra;
red_zone:
	/* We do not hold the cache-lock while checking the red-zone.
	 * Both red-zone words are switched back to the 'free' magic; the
	 * old values must have been the 'allocated' magic.
	 */
	objp -= BYTES_PER_WORD;
	if (xchg((unsigned long *)objp, SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) {
		/* Either write before start of obj, or a double free. */
		kmem_report_free_err("Bad front redzone", objp, cachep);
	}
	if (xchg((unsigned long *)(objp+cachep->c_org_size+BYTES_PER_WORD), SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) {
		/* Either write past end of obj, or a double free. */
		kmem_report_free_err("Bad rear redzone", objp, cachep);
	}
	goto return_red;
#endif	/* SLAB_DEBUG_SUPPORT */

bad_slab:
	/* Slab doesn't contain the correct magic num. */
	if (slabp->s_magic == SLAB_MAGIC_DESTROYED) {
		/* Magic num says this is a destroyed slab. */
		kmem_report_free_err("free from inactive slab", objp, cachep);
	} else
		kmem_report_free_err("Bad obj addr", objp, cachep);
	spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);

#if 1
/* Deliberately write through a NULL ptr to force a kernel dump, so the
 * call chain of the offending free can be seen.
 */
*(int *) 0 = 0;
#endif

	return;
null_addr:
	kmem_report_free_err("NULL ptr", objp, cachep);
	return;
}

/* Public entry point: allocate one obj from the given cache.
 * Simply forwards to __kmem_cache_alloc() and returns its result
 * (a ptr to the obj, or NULL on failure).
 */
void *
kmem_cache_alloc(kmem_cache_t *cachep, int flags)
{
	return __kmem_cache_alloc(cachep, flags);
}

/* Public entry point: release an obj back to the given cache.
 * Simply forwards to __kmem_cache_free(), which reports any problem
 * it detects.
 */
void
kmem_cache_free(kmem_cache_t *cachep, void *objp)
{
	__kmem_cache_free(cachep, objp);
}

/* Allocate 'size' bytes from the general caches.
 * The cache_sizes table is scanned (it ends with an entry whose cs_size is
 * zero) and the obj comes from the first cache whose cs_size is at least
 * 'size'.  Returns NULL, after logging an error, when no cache is large
 * enough; otherwise returns whatever __kmem_cache_alloc() returns.
 */
void *
kmalloc(size_t size, int flags)
{
	cache_sizes_t	*entry;

	for (entry = cache_sizes; entry->cs_size != 0; entry++) {
		if (size <= entry->cs_size)
			return __kmem_cache_alloc(entry->cs_cachep, flags);
	}

	printk(KERN_ERR "kmalloc: Size (%lu) too large\n", (unsigned long) size);
	return NULL;
}

void
kfree(const void *objp)
{
	struct page *page;
	int	nr;

	if (!objp)
		goto null_ptr;
	nr = MAP_NR(objp);
	if (nr >= max_mapnr)
		goto bad_ptr;

	/* Assume we own the page structure - hence no locking.
	 * If someone is misbehaving (eg. someone calling us with a bad
	 * address), then access to the page structure can race with the
	 * kmem_slab_destory() code.  Need to add a spin_lock to each page
	 * structure, which would be useful in threading the gfp() functions....
	 */
	page = &mem_map[nr];
	if (PageSlab(page)) {
		kmem_cache_t	*cachep;

		/* Here, we (again) assume the obj address is good.
		 * If it isn't, and happens to map onto another
		 * general-cache page which has no active objs, then
		 * we race....
		 */
		cachep = SLAB_GET_PAGE_CACHE(page);
		if (cachep && (cachep->c_flags & SLAB_CFLGS_GENERAL)) {
			__kmem_cache_free(cachep, objp);
			return;
		}
	}
bad_ptr:
	printk(KERN_ERR "kfree: Bad obj %p\n", objp);

#if 1
/* FORCE A KERNEL DUMP WHEN THIS HAPPENS. SPEAK IN ALL CAPS. GET THE CALL CHAIN. */
*(int *) 0 = 0;
#endif

null_ptr:
	return;
}

/* Release an obj obtained from kmalloc(), with the caller stating the
 * size it asked for.
 *
 *	objp - obj to release; a NULL ptr is silently ignored (as in kfree()).
 *	size - size the caller believes the obj to be; the obj is only freed
 *	       if this is no larger than the obj size of the cache it maps to.
 *
 * An address which is out of range, not on a slab page of a general
 * cache, or which fails the size check is reported as a bad obj.
 */
void
kfree_s(const void *objp, size_t size)
{
	struct page *page;
	int	nr;

	if (!objp)
		goto null_ptr;
	nr = MAP_NR(objp);
	if (nr >= max_mapnr)
		goto bad_ptr;
	/* See comment in kfree() */
	page = &mem_map[nr];
	if (PageSlab(page)) {
		kmem_cache_t	*cachep;
		/* See comment in kfree() */
		cachep = SLAB_GET_PAGE_CACHE(page);
		if (cachep && (cachep->c_flags & SLAB_CFLGS_GENERAL)) {
			if (size <= cachep->c_org_size) {	/* XXX better check */
				__kmem_cache_free(cachep, objp);
				return;
			}
		}
	}
bad_ptr:
	printk(KERN_ERR "kfree_s: Bad obj %p\n", objp);
null_ptr:
	/* Freeing a NULL ptr is a no-op, and is not an error. */
	return;
}

/* Map a size onto the general cache that kmalloc() would use for it:
 * the first entry in cache_sizes whose cs_size is at least 'size'.
 * If no entry is large enough, the cs_cachep of the terminating entry
 * (the one with a zero cs_size) is returned.
 *
 * This function could be moved to the header-file, and made inline so
 * consumers can quickly determine what cache-ptr they require.
 */
kmem_cache_t *
kmem_find_general_cachep(size_t size)
{
	cache_sizes_t	*entry = cache_sizes;

	while (entry->cs_size && size > entry->cs_size)
		entry++;
	return entry->cs_cachep;
}


/* Called from try_to_free_page().
 * This function _cannot_ be called within a int, but it
 * can be interrupted.
 *
 *	gfp_mask - if GFP_DMA is set, only a slab of DMA-able memory is
 *		   released.
 *
 * Up to 10 caches are examined, starting at clock_searchp, and the one
 * with the most completely free slabs is chosen (a cache with 10 or more
 * ends the search at once).  At most one slab is released per call.
 * Caches flagged SLAB_NO_REAP, caches which are growing, and caches which
 * have grown since they were last looked at, are passed over.
 */
void
kmem_cache_reap(int gfp_mask)
{
	kmem_slab_t	*slabp;
	kmem_cache_t	*searchp;
	kmem_cache_t	*best_cachep;
	unsigned int	 scan;
	unsigned int	 reap_level;

	if (in_interrupt()) {
		printk("kmem_cache_reap() called within int!\n");
		return;
	}

	/* We really need a test semaphore op so we can avoid sleeping when
	 * !wait is true.
	 */
	down(&cache_chain_sem);

	scan = 10;
	reap_level = 0;

	best_cachep = NULL;
	searchp = clock_searchp;
	do {
		unsigned int	full_free;
		unsigned int	dma_flag;

		/* It's safe to test this without holding the cache-lock. */
		if (searchp->c_flags & SLAB_NO_REAP)
			goto next;
		spin_lock_irq(&searchp->c_spinlock);
		if (searchp->c_growing)
			goto next_unlock;
		if (searchp->c_dflags & SLAB_CFLGS_GROWN) {
			/* Grown since the last visit - clear the mark and
			 * leave the cache alone this time round.
			 */
			searchp->c_dflags &= ~SLAB_CFLGS_GROWN;
			goto next_unlock;
		}
		/* Sanity check for corruption of static values. */
		if (searchp->c_inuse || searchp->c_magic != SLAB_C_MAGIC) {
			spin_unlock_irq(&searchp->c_spinlock);
			printk(KERN_ERR "kmem_reap: Corrupted cache struct for %s\n", searchp->c_name);
			goto next;
		}
		dma_flag = 0;
		full_free = 0;

		/* Count num of fully free slabs.  Hopefully there are not many,
		 * we are holding the cache lock....
		 */
		slabp = searchp->c_lastp;
		while (!slabp->s_inuse && slabp != kmem_slab_end(searchp)) {
			/* Test the DMA flag of the slab being counted, i.e.
			 * before stepping back to its predecessor (which may
			 * be in use, or be the end-marker).
			 */
			if (slabp->s_dma)
				dma_flag++;
			full_free++;
			slabp = slabp->s_prevp;
		}
		spin_unlock_irq(&searchp->c_spinlock);

		if ((gfp_mask & GFP_DMA) && !dma_flag)
			goto next;

		if (full_free) {
			if (full_free >= 10) {
				best_cachep = searchp;
				break;
			}

			/* Remember the cache with the most free slabs seen
			 * so far (a later cache wins a tie).  Note: no
			 * allowance is made here for caches with constructors
			 * or with more than one page per slab.
			 */
			if (full_free >= reap_level) {
				reap_level = full_free;
				best_cachep = searchp;
			}
		}
		goto next;
next_unlock:
		spin_unlock_irq(&searchp->c_spinlock);
next:
		searchp = searchp->c_nextp;
	} while (--scan && searchp != clock_searchp);

	/* The next call carries on from where this scan stopped. */
	clock_searchp = searchp;
	up(&cache_chain_sem);

	if (!best_cachep) {
		/* couldn't find anything to reap */
		return;
	}

	/* The cache-lock was dropped after counting, so everything is
	 * tested again before a slab is taken.
	 */
	spin_lock_irq(&best_cachep->c_spinlock);
	if (!best_cachep->c_growing && !(slabp = best_cachep->c_lastp)->s_inuse && slabp != kmem_slab_end(best_cachep)) {
		if (gfp_mask & GFP_DMA) {
			do {
				if (slabp->s_dma)
					goto good_dma;
				slabp = slabp->s_prevp;
			} while (!slabp->s_inuse && slabp != kmem_slab_end(best_cachep));

			/* Didn't find a DMA slab (there was a free one -
			 * it must have become active).
			 */
			goto dma_fail;
good_dma:
			;
		}
		if (slabp == best_cachep->c_freep)
			best_cachep->c_freep = slabp->s_nextp;
		kmem_slab_unlink(slabp);
		SLAB_STATS_INC_REAPED(best_cachep);

		/* Safe to drop the lock.  The slab is no longer linked to the
		 * cache.
		 */
		spin_unlock_irq(&best_cachep->c_spinlock);
		kmem_slab_destroy(best_cachep, slabp);
		return;
	}
dma_fail:
	spin_unlock_irq(&best_cachep->c_spinlock);
	return;
}

#if	SLAB_SELFTEST
/* A few v. simple tests of the debug support.
 * Creates a red-zoned, poisoned cache of 16 byte objs, then deliberately
 * writes just outside an obj, and into objs after they have been freed, so
 * the red-zone and poison checks have something to report.
 */
static void
kmem_self_test(void)
{
	kmem_cache_t	*test_cachep;

	printk(KERN_INFO "kmem_test() - start\n");
	test_cachep = kmem_cache_create("test-cachep", 16, 0, SLAB_RED_ZONE|SLAB_POISON, NULL, NULL);
	if (test_cachep) {
		char *objp = kmem_cache_alloc(test_cachep, SLAB_KERNEL);
		if (objp) {
			/* Write in front and past end, red-zone test. */
			*(objp-1) = 1;
			*(objp+16) = 1;
			kmem_cache_free(test_cachep, objp);

			/* Mess up poisoning. */
			*objp = 10;
			objp = kmem_cache_alloc(test_cachep, SLAB_KERNEL);
			/* This alloc can fail too - do not write through a
			 * NULL ptr if it did.
			 */
			if (objp) {
				kmem_cache_free(test_cachep, objp);

				/* Mess up poisoning (again). */
				*objp = 10;
			}
			kmem_cache_shrink(test_cachep);
		}
	}
	printk(KERN_INFO "kmem_test() - finished\n");
}
#endif	/* SLAB_SELFTEST */

#if	defined(CONFIG_PROC_FS)
/* /proc/slabinfo
 * cache-name num-active-objs total-objs num-active-slabs total-slabs num-pages-per-slab
 *
 * Writes a version line, then one line per cache on the cache-chain, into
 * buf.  Without SLAB_STATS only the name, active objs and total objs are
 * printed; with SLAB_STATS the slab/page counts and the per-cache
 * statistics (high mark, allocations, grown, reaped, errors) follow.
 *
 * Returns the number of bytes written to buf.
 *
 * NOTE(review): output is produced with sprintf() and no length check -
 * assumes buf is large enough for every cache; confirm against the caller.
 */
int
get_slabinfo(char *buf)
{
	kmem_cache_t	*cachep;
	kmem_slab_t	*slabp;
	unsigned long	active_objs;
	unsigned long	save_flags;
	unsigned long	num_slabs;
	unsigned long	num_objs;
	int		len=0;
#if	SLAB_STATS
	unsigned long	active_slabs;
#endif	/* SLAB_STATS */

	/* Remember the interrupt state on entry; it is restored each time a
	 * cache-lock (taken with spin_lock_irq) is released below.
	 */
	__save_flags(save_flags);

	/* Output format version, so at least we can change it without _too_
	 * many complaints.
	 */
#if	SLAB_STATS
	len = sprintf(buf, "slabinfo - version: 1.0 (statistics)\n");
#else
	len = sprintf(buf, "slabinfo - version: 1.0\n");
#endif	/* SLAB_STATS */
	/* The semaphore is needed to walk the cache-chain. */
	down(&cache_chain_sem);
	cachep = &cache_cache;
	do {
#if	SLAB_STATS
		active_slabs = 0;
#endif	/* SLAB_STATS */
		num_slabs = active_objs = 0;
		/* Walk the cache's slabs, with its lock held, to total them up. */
		spin_lock_irq(&cachep->c_spinlock);
		for (slabp = cachep->c_firstp; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) {
			active_objs += slabp->s_inuse;
			num_slabs++;
#if	SLAB_STATS
			if (slabp->s_inuse)
				active_slabs++;
#endif	/* SLAB_STATS */
		}
		num_objs = cachep->c_num*num_slabs;
#if	SLAB_STATS
		{
		/* Copy the statistics while the lock is held; the formatting
		 * is done after it has been released.
		 */
		unsigned long errors;
		unsigned long high = cachep->c_high_mark;
		unsigned long grown = cachep->c_grown;
		unsigned long reaped = cachep->c_reaped;
		unsigned long allocs = cachep->c_num_allocations;
		errors = (unsigned long) atomic_read(&cachep->c_errors);
		spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
		len += sprintf(buf+len, "%-16s %6lu %6lu %4lu %4lu %4lu %6lu %7lu %5lu %4lu %4lu\n",
				cachep->c_name, active_objs, num_objs, active_slabs, num_slabs,
				(1<<cachep->c_gfporder)*num_slabs,
				high, allocs, grown, reaped, errors);
		}
#else
		spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
		len += sprintf(buf+len, "%-17s %6lu %6lu\n", cachep->c_name, active_objs, num_objs);
#endif	/* SLAB_STATS */
	} while ((cachep = cachep->c_nextp) != &cache_cache);
	up(&cache_chain_sem);

	return len;
}
#endif	/* CONFIG_PROC_FS */