/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2001 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 24 Sep 2002	Rewrite to use asynchronous RPCs, port to 2.5  --cel
 *
 */

#include <linux/config.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/errno.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <asm/system.h>
#include <asm/uaccess.h>

#define NFSDBG_FACILITY		(NFSDBG_PAGECACHE | NFSDBG_VFS)
#define VERF_SIZE		(2 * sizeof(__u32))
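
/*
 * Illustrative example (an editor's sketch, not from the original
 * sources): on Linux, an application typically requests uncached I/O
 * by opening the file with O_DIRECT and issuing suitably aligned
 * transfers, for instance:
 *
 *	int fd = open("/mnt/nfs/data", O_RDWR | O_DIRECT);
 *	read(fd, buf, 8192);
 *
 * With this file in place, such reads and writes bypass the client's
 * page cache and go straight to the NFS server.  The path and buffer
 * names above are hypothetical.
 */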

/**
 * nfs_get_user_pages - find and pin the pages underlying a user buffer
 * @addr: user-space address of target buffer
 * @size: total size in bytes of target buffer
 * @pages: returned array of page struct pointers underlying target buffer
 * @rw: whether the buffer is the target of a read or a write operation
 */
static inline int
nfs_get_user_pages(unsigned long addr, size_t size, struct page ***pages,
		int rw)
{
	int result = -ENOMEM;
	/* count every page the buffer touches, including a partial
	 * first and last page */
	unsigned page_count = ((addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT) -
				(addr >> PAGE_SHIFT);
	unsigned array_size = page_count * sizeof(struct page *);

	*pages = (struct page **) kmalloc(array_size, GFP_KERNEL);
	if (*pages) {
		down_read(&current->mm->mmap_sem);
		result = get_user_pages(current, current->mm, addr,
					page_count, (rw == WRITE), 0,
					*pages, NULL);
		up_read(&current->mm->mmap_sem);
		if (result < 0) {
			printk(KERN_ERR "%s: get_user_pages result %d\n",
					__FUNCTION__, result);
			kfree(*pages);
			*pages = NULL;
		}
	}
	return result;
}

/**
 * nfs_free_user_pages - tear down page struct array
 * @pages: array of page struct pointers underlying target buffer
 * @count: number of page references to drop before freeing the array
 */
static inline void
nfs_free_user_pages(struct page **pages, unsigned count)
{
	unsigned page = 0;

	while (count--)
		page_cache_release(pages[page++]);
	kfree(pages);
}
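
/*
 * Usage sketch (illustrative only): callers pin the buffer, queue the
 * pinned pages for I/O, then free only the pointer array; the page
 * references themselves are dropped at I/O completion.
 * nfs_iov2pagelist below follows exactly this pattern:
 *
 *	struct page **pages;
 *	int count = nfs_get_user_pages(user_addr, len, &pages, rw);
 *	if (count < 0)
 *		return count;
 *	... build page requests for pages[0..count-1] ...
 *	nfs_free_user_pages(pages, 0);
 */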

/**
 * nfs_iov2pagelist - convert an array of iovecs to a list of page requests
 * @rw: direction (read or write)
 * @inode: inode of target file
 * @cred: credentials of user who requested I/O
 * @iov: array of vectors that define I/O buffer
 * @offset: where in the file to begin the I/O
 * @nr_segs: size of iovec array
 * @requests: append new page requests to this list head
 */
static int
nfs_iov2pagelist(int rw, const struct inode *inode,
		const struct rpc_cred *cred,
		const struct iovec *iov, loff_t offset,
		unsigned long nr_segs, struct list_head *requests)
{
	unsigned seg;
	int tot_bytes = 0;
	struct page **pages;

	/* for each iovec in the array... */
	for (seg = 0; seg < nr_segs; seg++) {
		const unsigned long user_addr =
			(unsigned long) iov[seg].iov_base;
		size_t bytes = iov[seg].iov_len;
		unsigned int pg_offset = (user_addr & ~PAGE_MASK);
		int page_count, page = 0;

		page_count = nfs_get_user_pages(user_addr, bytes, &pages, rw);
		if (page_count < 0) {
			nfs_release_list(requests);
			return page_count;
		}

		/* ...build as many page requests as required */
		while (bytes > 0) {
			struct nfs_page *new;
			/* a page request may not cross a page boundary */
			unsigned int pg_bytes = PAGE_SIZE - pg_offset;
			if (pg_bytes > bytes)
				pg_bytes = bytes;

			new = nfs_create_request((struct rpc_cred *) cred,
						 (struct inode *) inode,
						 pages[page],
						 pg_offset, pg_bytes);
			if (IS_ERR(new)) {
				nfs_free_user_pages(pages, page_count);
				nfs_release_list(requests);
				return PTR_ERR(new);
			}
			new->wb_index = offset;
			nfs_list_add_request(new, requests);

			/* after the first page, requests are page-aligned */
			pg_offset = 0;
			offset += pg_bytes;
			tot_bytes += pg_bytes;
			bytes -= pg_bytes;
			page++;
		}

		/* don't release pages here -- I/O completion will do that */
		nfs_free_user_pages(pages, 0);
	}
	return tot_bytes;
}

/**
 * do_nfs_direct_IO - Read or write data without caching
 * @rw: direction (read or write)
 * @inode: inode of target file
 * @cred: credentials of user who requested I/O
 * @iov: array of vectors that define I/O buffer
 * @offset: where in the file to begin the I/O
 * @nr_segs: size of iovec array
 *
 * Break the passed-in iovec into a series of page-sized or smaller
 * requests, where each page is mapped for direct user-land I/O.
 *
 * For each of these pages, create an NFS page request and
 * append it to an automatic list of page requests.
 *
 * When all page requests have been queued, start the I/O on the
 * whole list.  The underlying routines coalesce the pages on the
 * list into a bunch of asynchronous "rsize" or "wsize" network
 * requests.
 *
 * I/O completion automatically unmaps and releases the pages.
 */
static int
do_nfs_direct_IO(int rw, const struct inode *inode,
		const struct rpc_cred *cred, const struct iovec *iov,
		loff_t offset, unsigned long nr_segs)
{
	LIST_HEAD(requests);
	int result, tot_bytes;

	result = nfs_iov2pagelist(rw, inode, cred, iov, offset,
					nr_segs, &requests);
	if (result < 0)
		return result;
	tot_bytes = result;

	switch (rw) {
	case READ:
		if (IS_SYNC(inode) || (NFS_SERVER(inode)->rsize < PAGE_SIZE)) {
			/* the queued page requests are unused on the
			 * synchronous path */
			nfs_release_list(&requests);
			result = nfs_direct_read_sync(inode, cred, iov,
							offset, nr_segs);
			break;
		}
		result = nfs_pagein_list(&requests, NFS_SERVER(inode)->rpages);
		nfs_wait_for_reads(&requests);
		break;
	case WRITE:
		if (IS_SYNC(inode) || (NFS_SERVER(inode)->wsize < PAGE_SIZE)) {
			nfs_release_list(&requests);
			result = nfs_direct_write_sync(inode, cred, iov,
							offset, nr_segs);
		} else
			result = nfs_flush_list(&requests,
					NFS_SERVER(inode)->wpages, FLUSH_WAIT);

		/* invalidate cache so non-direct readers pick up changes */
		invalidate_inode_pages((struct inode *) inode);
		break;
	default:
		result = -EINVAL;
		break;
	}

	if (result < 0)
		return result;
	return tot_bytes;
}

/**
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
 * @iocb: target I/O control block
 * @iov: array of vectors that define I/O buffer
 * @offset: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * The inode's i_sem is no longer held by the VFS layer before it calls
 * this function to do a write.
 */
int
nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
		loff_t offset, unsigned long nr_segs)
{
	/* None of this works yet, so prevent it from compiling. */
#if 0
	int result;
	struct file *file = iocb->ki_filp;
	struct dentry *dentry = file->f_dentry;
	const struct inode *inode = dentry->d_inode->i_mapping->host;
	const struct rpc_cred *cred = nfs_file_cred(file);

	dfprintk(VFS, "NFS: direct_IO(%s) (%s/%s) off/no(%Lu/%lu)\n",
			((rw == READ) ? "READ" : "WRITE"),
			dentry->d_parent->d_name.name,
			dentry->d_name.name, offset, nr_segs);

	result = do_nfs_direct_IO(rw, inode, cred, iov, offset, nr_segs);

	dfprintk(VFS, "NFS: direct_IO result = %d\n", result);

	return result;
#endif
	return -EINVAL;
}
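
/*
 * For context (a sketch under the 2.5 VFS conventions, not from this
 * file): once the code above works, nfs_direct_IO would be hooked up
 * through the NFS address_space_operations table in fs/nfs/file.c,
 * along the lines of:
 *
 *	struct address_space_operations nfs_file_aops = {
 *		.readpage	= nfs_readpage,
 *		.writepage	= nfs_writepage,
 *		...
 *		.direct_IO	= nfs_direct_IO,
 *	};
 *
 * The VFS then invokes ->direct_IO on behalf of O_DIRECT reads and
 * writes instead of going through the page cache.
 */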