Patchset: 2.6.26-ext4-7

URL: http://kernel.org/pub/linux/kernel/people/tytso/ext4-patches/2.6.26-ext4-7/2.6.26-ext4-7.bz2

This patch was created by combining the ext4-pushed-post-2.6.27-rc1.gz
patches with the stable patches in the 2.6.27-rc3-ext4-1 series.

The line numbers were adjusted so that the patch applies cleanly to the
colinux kernel tree.

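For reference, one typical way to fetch and apply a combined patchset like
this against a matching source tree (the tree path used below and the -p1
strip level are assumptions; adjust them for your local layout):

  # wget http://kernel.org/pub/linux/kernel/people/tytso/ext4-patches/2.6.26-ext4-7/2.6.26-ext4-7.bz2
  # cd linux-2.6.26
  # bzcat ../2.6.26-ext4-7.bz2 | patch -p1 --dry-run
  # bzcat ../2.6.26-ext4-7.bz2 | patch -p1

The --dry-run pass reports any rejected hunks before the tree is modified.
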
Documentation/filesystems/ext4.txt | 131 ++-
fs/buffer.c | 19 +-
fs/ext4/acl.c | 188 ++--
fs/ext4/balloc.c | 221 +++--
fs/ext4/dir.c | 37 +-
fs/ext4/ext4.h | 64 +-
fs/ext4/ext4_extents.h | 5 +-
fs/ext4/ext4_i.h | 10 +-
fs/ext4/ext4_jbd2.h | 29 +-
fs/ext4/ext4_sb.h | 5 +-
fs/ext4/extents.c | 277 +++---
fs/ext4/file.c | 20 +-
fs/ext4/fsync.c | 4 +
fs/ext4/group.h | 2 +-
fs/ext4/ialloc.c | 169 +++-
fs/ext4/inode.c | 1931 ++++++++++++++++++++++++++++++------
fs/ext4/mballoc.c | 744 +++++++++++----
fs/ext4/mballoc.h | 10 +-
fs/ext4/migrate.c | 3 +-
fs/ext4/namei.c | 45 +-
fs/ext4/resize.c | 134 ++-
fs/ext4/super.c | 451 ++++++---
fs/ext4/xattr.c | 4 +-
fs/ext4/xattr_trusted.c | 4 +-
fs/ext4/xattr_user.c | 4 +-
fs/jbd2/checkpoint.c | 1 -
fs/jbd2/commit.c | 308 +++----
fs/jbd2/journal.c | 54 +-
fs/jbd2/transaction.c | 365 +++----
fs/mpage.c | 14 +-
include/linux/fs.h | 2 +
include/linux/jbd2.h | 73 +-
include/linux/mpage.h | 10 +
include/linux/percpu_counter.h | 12 +-
include/linux/writeback.h | 1 +
lib/percpu_counter.c | 7 +-
mm/filemap.c | 3 +-
mm/page-writeback.c | 3 +
38 files changed, 3822 insertions(+), 1542 deletions(-)

diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 0c5086d..0d53949 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -13,72 +13,99 @@ Mailing list: linux-ext4@vger.kernel.org
1. Quick usage instructions:
===========================

- - Grab updated e2fsprogs from
- ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/
- This is a patchset on top of e2fsprogs-1.39, which can be found at
+ - Compile and install the latest version of e2fsprogs (as of this
+ writing version 1.41) from:
+
+ http://sourceforge.net/project/showfiles.php?group_id=2406
+
+ or
+
ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/

- - It's still mke2fs -j /dev/hda1
+ or grab the latest git repository from:
+
+ git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
+
+ - Note that it is highly important to install the mke2fs.conf file
+ that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If
+ you have edited the /etc/mke2fs.conf file installed on your system,
+ you will need to merge your changes with the version from e2fsprogs
+ 1.41.x.
+
+ - Create a new filesystem using the ext4dev filesystem type:
+
+ # mke2fs -t ext4dev /dev/hda1
+
+ Or configure an existing ext3 filesystem to support extents and set
+ the test_fs flag to indicate that it's ok for an in-development
+ filesystem to touch this filesystem:

- - mount /dev/hda1 /wherever -t ext4dev
+ # tune2fs -O extents -E test_fs /dev/hda1

- - To enable extents,
+ If the filesystem was created with 128 byte inodes, it can be
+ converted to use 256 byte for greater efficiency via:

- mount /dev/hda1 /wherever -t ext4dev -o extents
+ # tune2fs -I 256 /dev/hda1

- - The filesystem is compatible with the ext3 driver until you add a file
- which has extents (ie: `mount -o extents', then create a file).
+ (Note: we currently do not have tools to convert an ext4dev
+ filesystem back to ext3; so please do not do try this on production
+ filesystems.)

- NOTE: The "extents" mount flag is temporary. It will soon go away and
- extents will be enabled by the "-o extents" flag to mke2fs or tune2fs
+ - Mounting:
+
+ # mount -t ext4dev /dev/hda1 /wherever

- When comparing performance with other filesystems, remember that
- ext3/4 by default offers higher data integrity guarantees than most. So
- when comparing with a metadata-only journalling filesystem, use `mount -o
- data=writeback'. And you might as well use `mount -o nobh' too along
- with it. Making the journal larger than the mke2fs default often helps
- performance with metadata-intensive workloads.
+ ext3/4 by default offers higher data integrity guarantees than most.
+ So when comparing with a metadata-only journalling filesystem, such
+ as ext3, use `mount -o data=writeback'. And you might as well use
+ `mount -o nobh' too along with it. Making the journal larger than
+ the mke2fs default often helps performance with metadata-intensive
+ workloads.

2. Features
===========

2.1 Currently available

-* ability to use filesystems > 16TB
+* ability to use filesystems > 16TB (e2fsprogs support not available yet)
* extent format reduces metadata overhead (RAM, IO for access, transactions)
* extent format more robust in face of on-disk corruption due to magics,
* internal redunancy in tree
-
-2.1 Previously available, soon to be enabled by default by "mkefs.ext4":
-
-* dir_index and resize inode will be on by default
-* large inodes will be used by default for fast EAs, nsec timestamps, etc
+* improved file allocation (multi-block alloc)
+* fix 32000 subdirectory limit
+* nsec timestamps for mtime, atime, ctime, create time
+* inode version field on disk (NFSv4, Lustre)
+* reduced e2fsck time via uninit_bg feature
+* journal checksumming for robustness, performance
+* persistent file preallocation (e.g for streaming media, databases)
+* ability to pack bitmaps and inode tables into larger virtual groups via the
+ flex_bg feature
+* large file support
+* Inode allocation using large virtual block groups via flex_bg
+* delayed allocation
+* large block (up to pagesize) support
+* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
+ the ordering)

2.2 Candidate features for future inclusion

-There are several under discussion, whether they all make it in is
-partly a function of how much time everyone has to work on them:
+* Online defrag (patches available but not well tested)
+* reduced mke2fs time via lazy itable initialization in conjuction with
+ the uninit_bg feature (capability to do this is available in e2fsprogs
+ but a kernel thread to do lazy zeroing of unused inode table blocks
+ after filesystem is first mounted is required for safety)

-* improved file allocation (multi-block alloc, delayed alloc; basically done)
-* fix 32000 subdirectory limit (patch exists, needs some e2fsck work)
-* nsec timestamps for mtime, atime, ctime, create time (patch exists,
- needs some e2fsck work)
-* inode version field on disk (NFSv4, Lustre; prototype exists)
-* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists)
-* journal checksumming for robustness, performance (prototype exists)
-* persistent file preallocation (e.g for streaming media, databases)
+There are several others under discussion, whether they all make it in is
+partly a function of how much time everyone has to work on them. Features like
+metadata checksumming have been discussed and planned for a bit but no patches
+exist yet so I'm not sure they're in the near-term roadmap.

-Features like metadata checksumming have been discussed and planned for
-a bit but no patches exist yet so I'm not sure they're in the near-term
-roadmap.
+The big performance win will come with mballoc, delalloc and flex_bg
+grouping of bitmaps and inode tables. Some test results available here:

-The big performance win will come with mballoc and delalloc. CFS has
-been using mballoc for a few years already with Lustre, and IBM + Bull
-did a lot of benchmarking on it. The reason it isn't in the first set of
-patches is partly a manageability issue, and partly because it doesn't
-directly affect the on-disk format (outside of much better allocation)
-so it isn't critical to get into the first round of changes. I believe
-Alex is working on a new set of patches right now.
+ - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
+ - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html

3. Options
==========
@@ -222,9 +249,11 @@ stripe=n Number of filesystem blocks that mballoc will try
to use for allocation size and alignment. For RAID5/6
systems this should be the number of data
disks * RAID chunk size in file system blocks.
-
+delalloc (*) Deferring block allocation until write-out time.
+nodelalloc Disable delayed allocation. Blocks are allocation
+ when data is copied from user to page cache.
Data Mode
----------
+=========
There are 3 different data modes:

* writeback mode
@@ -236,10 +265,10 @@ typically provide the best ext4 performance.

* ordered mode
In data=ordered mode, ext4 only officially journals metadata, but it logically
-groups metadata and data blocks into a single unit called a transaction. When
-it's time to write the new metadata out to disk, the associated data blocks
-are written first. In general, this mode performs slightly slower than
-writeback but significantly faster than journal mode.
+groups metadata information related to data changes with the data blocks into a
+single unit called a transaction. When it's time to write the new metadata
+out to disk, the associated data blocks are written first. In general,
+this mode performs slightly slower than writeback but significantly faster than journal mode.

* journal mode
data=journal mode provides full data and metadata journaling. All new data is
@@ -247,7 +276,8 @@ written to the journal first, and then to its final location.
In the event of a crash, the journal can be replayed, bringing both data and
metadata into a consistent state. This mode is the slowest except when data
needs to be read from and written to disk at the same time where it
-outperforms all others modes.
+outperforms all others modes. Curently ext4 does not have delayed
+allocation support if this data journalling mode is selected.

References
==========
@@ -256,7 +286,8 @@ kernel source: <file:fs/ext4/>
<file:fs/jbd2/>

programs: http://e2fsprogs.sourceforge.net/
- http://ext2resize.sourceforge.net

useful links: http://fedoraproject.org/wiki/ext3-devel
http://www.bullopensource.org/ext4/
+ http://ext4.wiki.kernel.org/index.php/Main_Page
+ http://fedoraproject.org/wiki/Features/Ext4
diff --git a/fs/buffer.c b/fs/buffer.c
|
|
index 0f51c0f..5fa1512 100644
|
|
--- a/fs/buffer.c
|
|
+++ b/fs/buffer.c
|
|
@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
|
|
*/
|
|
clear_buffer_dirty(bh);
|
|
set_buffer_uptodate(bh);
|
|
- } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
|
|
+ } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
|
|
+ buffer_dirty(bh)) {
|
|
WARN_ON(bh->b_size != blocksize);
|
|
err = get_block(inode, block, bh, 1);
|
|
if (err)
|
|
goto recover;
|
|
+ clear_buffer_delay(bh);
|
|
if (buffer_new(bh)) {
|
|
/* blockdev mappings never come here */
|
|
clear_buffer_new(bh);
|
|
@@ -1774,7 +1776,8 @@ recover:
|
|
bh = head;
|
|
/* Recovery: lock and submit the mapped buffers */
|
|
do {
|
|
- if (buffer_mapped(bh) && buffer_dirty(bh)) {
|
|
+ if (buffer_mapped(bh) && buffer_dirty(bh) &&
|
|
+ !buffer_delay(bh)) {
|
|
lock_buffer(bh);
|
|
mark_buffer_async_write(bh);
|
|
} else {
|
|
@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
|
|
struct page *page, void *fsdata)
|
|
{
|
|
struct inode *inode = mapping->host;
|
|
+ int i_size_changed = 0;
|
|
|
|
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
|
|
|
|
@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
|
|
*/
|
|
if (pos+copied > inode->i_size) {
|
|
i_size_write(inode, pos+copied);
|
|
- mark_inode_dirty(inode);
|
|
+ i_size_changed = 1;
|
|
}
|
|
|
|
unlock_page(page);
|
|
page_cache_release(page);
|
|
|
|
+ /*
|
|
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
|
|
+ * makes the holding time of page lock longer. Second, it forces lock
|
|
+ * ordering of page lock and transaction start for journaling
|
|
+ * filesystems.
|
|
+ */
|
|
+ if (i_size_changed)
|
|
+ mark_inode_dirty(inode);
|
|
+
|
|
return copied;
|
|
}
|
|
EXPORT_SYMBOL(generic_write_end);
|
|
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
|
|
index 3c8dab8..a234b54 100644
|
|
--- a/fs/ext4/acl.c
|
|
+++ b/fs/ext4/acl.c
|
|
@@ -40,34 +40,35 @@ ext4_acl_from_disk(const void *value, size_t size)
|
|
acl = posix_acl_alloc(count, GFP_NOFS);
|
|
if (!acl)
|
|
return ERR_PTR(-ENOMEM);
|
|
- for (n=0; n < count; n++) {
|
|
+ for (n = 0; n < count; n++) {
|
|
ext4_acl_entry *entry =
|
|
(ext4_acl_entry *)value;
|
|
if ((char *)value + sizeof(ext4_acl_entry_short) > end)
|
|
goto fail;
|
|
acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
|
|
acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
|
|
- switch(acl->a_entries[n].e_tag) {
|
|
- case ACL_USER_OBJ:
|
|
- case ACL_GROUP_OBJ:
|
|
- case ACL_MASK:
|
|
- case ACL_OTHER:
|
|
- value = (char *)value +
|
|
- sizeof(ext4_acl_entry_short);
|
|
- acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
|
|
- break;
|
|
-
|
|
- case ACL_USER:
|
|
- case ACL_GROUP:
|
|
- value = (char *)value + sizeof(ext4_acl_entry);
|
|
- if ((char *)value > end)
|
|
- goto fail;
|
|
- acl->a_entries[n].e_id =
|
|
- le32_to_cpu(entry->e_id);
|
|
- break;
|
|
-
|
|
- default:
|
|
+
|
|
+ switch (acl->a_entries[n].e_tag) {
|
|
+ case ACL_USER_OBJ:
|
|
+ case ACL_GROUP_OBJ:
|
|
+ case ACL_MASK:
|
|
+ case ACL_OTHER:
|
|
+ value = (char *)value +
|
|
+ sizeof(ext4_acl_entry_short);
|
|
+ acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
|
|
+ break;
|
|
+
|
|
+ case ACL_USER:
|
|
+ case ACL_GROUP:
|
|
+ value = (char *)value + sizeof(ext4_acl_entry);
|
|
+ if ((char *)value > end)
|
|
goto fail;
|
|
+ acl->a_entries[n].e_id =
|
|
+ le32_to_cpu(entry->e_id);
|
|
+ break;
|
|
+
|
|
+ default:
|
|
+ goto fail;
|
|
}
|
|
}
|
|
if (value != end)
|
|
@@ -96,27 +97,26 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
|
|
return ERR_PTR(-ENOMEM);
|
|
ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
|
|
e = (char *)ext_acl + sizeof(ext4_acl_header);
|
|
- for (n=0; n < acl->a_count; n++) {
|
|
+ for (n = 0; n < acl->a_count; n++) {
|
|
ext4_acl_entry *entry = (ext4_acl_entry *)e;
|
|
entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
|
|
entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
|
|
- switch(acl->a_entries[n].e_tag) {
|
|
- case ACL_USER:
|
|
- case ACL_GROUP:
|
|
- entry->e_id =
|
|
- cpu_to_le32(acl->a_entries[n].e_id);
|
|
- e += sizeof(ext4_acl_entry);
|
|
- break;
|
|
-
|
|
- case ACL_USER_OBJ:
|
|
- case ACL_GROUP_OBJ:
|
|
- case ACL_MASK:
|
|
- case ACL_OTHER:
|
|
- e += sizeof(ext4_acl_entry_short);
|
|
- break;
|
|
-
|
|
- default:
|
|
- goto fail;
|
|
+ switch (acl->a_entries[n].e_tag) {
|
|
+ case ACL_USER:
|
|
+ case ACL_GROUP:
|
|
+ entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
|
|
+ e += sizeof(ext4_acl_entry);
|
|
+ break;
|
|
+
|
|
+ case ACL_USER_OBJ:
|
|
+ case ACL_GROUP_OBJ:
|
|
+ case ACL_MASK:
|
|
+ case ACL_OTHER:
|
|
+ e += sizeof(ext4_acl_entry_short);
|
|
+ break;
|
|
+
|
|
+ default:
|
|
+ goto fail;
|
|
}
|
|
}
|
|
return (char *)ext_acl;
|
|
@@ -167,23 +167,23 @@ ext4_get_acl(struct inode *inode, int type)
|
|
if (!test_opt(inode->i_sb, POSIX_ACL))
|
|
return NULL;
|
|
|
|
- switch(type) {
|
|
- case ACL_TYPE_ACCESS:
|
|
- acl = ext4_iget_acl(inode, &ei->i_acl);
|
|
- if (acl != EXT4_ACL_NOT_CACHED)
|
|
- return acl;
|
|
- name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
|
|
- break;
|
|
-
|
|
- case ACL_TYPE_DEFAULT:
|
|
- acl = ext4_iget_acl(inode, &ei->i_default_acl);
|
|
- if (acl != EXT4_ACL_NOT_CACHED)
|
|
- return acl;
|
|
- name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
|
|
- break;
|
|
-
|
|
- default:
|
|
- return ERR_PTR(-EINVAL);
|
|
+ switch (type) {
|
|
+ case ACL_TYPE_ACCESS:
|
|
+ acl = ext4_iget_acl(inode, &ei->i_acl);
|
|
+ if (acl != EXT4_ACL_NOT_CACHED)
|
|
+ return acl;
|
|
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
|
|
+ break;
|
|
+
|
|
+ case ACL_TYPE_DEFAULT:
|
|
+ acl = ext4_iget_acl(inode, &ei->i_default_acl);
|
|
+ if (acl != EXT4_ACL_NOT_CACHED)
|
|
+ return acl;
|
|
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
|
|
+ break;
|
|
+
|
|
+ default:
|
|
+ return ERR_PTR(-EINVAL);
|
|
}
|
|
retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
|
|
if (retval > 0) {
|
|
@@ -201,14 +201,14 @@ ext4_get_acl(struct inode *inode, int type)
|
|
kfree(value);
|
|
|
|
if (!IS_ERR(acl)) {
|
|
- switch(type) {
|
|
- case ACL_TYPE_ACCESS:
|
|
- ext4_iset_acl(inode, &ei->i_acl, acl);
|
|
- break;
|
|
-
|
|
- case ACL_TYPE_DEFAULT:
|
|
- ext4_iset_acl(inode, &ei->i_default_acl, acl);
|
|
- break;
|
|
+ switch (type) {
|
|
+ case ACL_TYPE_ACCESS:
|
|
+ ext4_iset_acl(inode, &ei->i_acl, acl);
|
|
+ break;
|
|
+
|
|
+ case ACL_TYPE_DEFAULT:
|
|
+ ext4_iset_acl(inode, &ei->i_default_acl, acl);
|
|
+ break;
|
|
}
|
|
}
|
|
return acl;
|
|
@@ -232,31 +232,31 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
|
|
if (S_ISLNK(inode->i_mode))
|
|
return -EOPNOTSUPP;
|
|
|
|
- switch(type) {
|
|
- case ACL_TYPE_ACCESS:
|
|
- name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
|
|
- if (acl) {
|
|
- mode_t mode = inode->i_mode;
|
|
- error = posix_acl_equiv_mode(acl, &mode);
|
|
- if (error < 0)
|
|
- return error;
|
|
- else {
|
|
- inode->i_mode = mode;
|
|
- ext4_mark_inode_dirty(handle, inode);
|
|
- if (error == 0)
|
|
- acl = NULL;
|
|
- }
|
|
+ switch (type) {
|
|
+ case ACL_TYPE_ACCESS:
|
|
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
|
|
+ if (acl) {
|
|
+ mode_t mode = inode->i_mode;
|
|
+ error = posix_acl_equiv_mode(acl, &mode);
|
|
+ if (error < 0)
|
|
+ return error;
|
|
+ else {
|
|
+ inode->i_mode = mode;
|
|
+ ext4_mark_inode_dirty(handle, inode);
|
|
+ if (error == 0)
|
|
+ acl = NULL;
|
|
}
|
|
- break;
|
|
+ }
|
|
+ break;
|
|
|
|
- case ACL_TYPE_DEFAULT:
|
|
- name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
|
|
- if (!S_ISDIR(inode->i_mode))
|
|
- return acl ? -EACCES : 0;
|
|
- break;
|
|
+ case ACL_TYPE_DEFAULT:
|
|
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
|
|
+ if (!S_ISDIR(inode->i_mode))
|
|
+ return acl ? -EACCES : 0;
|
|
+ break;
|
|
|
|
- default:
|
|
- return -EINVAL;
|
|
+ default:
|
|
+ return -EINVAL;
|
|
}
|
|
if (acl) {
|
|
value = ext4_acl_to_disk(acl, &size);
|
|
@@ -269,14 +269,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
|
|
|
|
kfree(value);
|
|
if (!error) {
|
|
- switch(type) {
|
|
- case ACL_TYPE_ACCESS:
|
|
- ext4_iset_acl(inode, &ei->i_acl, acl);
|
|
- break;
|
|
-
|
|
- case ACL_TYPE_DEFAULT:
|
|
- ext4_iset_acl(inode, &ei->i_default_acl, acl);
|
|
- break;
|
|
+ switch (type) {
|
|
+ case ACL_TYPE_ACCESS:
|
|
+ ext4_iset_acl(inode, &ei->i_acl, acl);
|
|
+ break;
|
|
+
|
|
+ case ACL_TYPE_DEFAULT:
|
|
+ ext4_iset_acl(inode, &ei->i_default_acl, acl);
|
|
+ break;
|
|
}
|
|
}
|
|
return error;
|
|
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
|
|
index 9cc80b9..e9fa960 100644
|
|
--- a/fs/ext4/balloc.c
|
|
+++ b/fs/ext4/balloc.c
|
|
@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
|
|
ext4_group_t block_group)
|
|
{
|
|
ext4_group_t actual_group;
|
|
- ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
|
|
+ ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
|
|
if (actual_group == block_group)
|
|
return 1;
|
|
return 0;
|
|
@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
|
|
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
|
|
}
|
|
} else { /* For META_BG_BLOCK_GROUPS */
|
|
- int group_rel = (block_group -
|
|
- le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
|
|
- EXT4_DESC_PER_BLOCK(sb);
|
|
- if (group_rel == 0 || group_rel == 1 ||
|
|
- (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
|
|
- bit_max += 1;
|
|
+ bit_max += ext4_bg_num_gdb(sb, block_group);
|
|
}
|
|
|
|
if (block_group == sbi->s_groups_count - 1) {
|
|
@@ -295,7 +290,7 @@ err_out:
|
|
return 0;
|
|
}
|
|
/**
|
|
- * read_block_bitmap()
|
|
+ * ext4_read_block_bitmap()
|
|
* @sb: super block
|
|
* @block_group: given block group
|
|
*
|
|
@@ -305,7 +300,7 @@ err_out:
|
|
* Return buffer_head on success or NULL in case of failure.
|
|
*/
|
|
struct buffer_head *
|
|
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
|
|
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
|
|
{
|
|
struct ext4_group_desc * desc;
|
|
struct buffer_head * bh = NULL;
|
|
@@ -319,25 +314,28 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
|
|
if (unlikely(!bh)) {
|
|
ext4_error(sb, __func__,
|
|
"Cannot read block bitmap - "
|
|
- "block_group = %d, block_bitmap = %llu",
|
|
- (int)block_group, (unsigned long long)bitmap_blk);
|
|
+ "block_group = %lu, block_bitmap = %llu",
|
|
+ block_group, bitmap_blk);
|
|
return NULL;
|
|
}
|
|
if (bh_uptodate_or_lock(bh))
|
|
return bh;
|
|
|
|
+ spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
|
|
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
|
|
ext4_init_block_bitmap(sb, bh, block_group, desc);
|
|
set_buffer_uptodate(bh);
|
|
unlock_buffer(bh);
|
|
+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
|
|
return bh;
|
|
}
|
|
+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
|
|
if (bh_submit_read(bh) < 0) {
|
|
put_bh(bh);
|
|
ext4_error(sb, __func__,
|
|
"Cannot read block bitmap - "
|
|
- "block_group = %d, block_bitmap = %llu",
|
|
- (int)block_group, (unsigned long long)bitmap_blk);
|
|
+ "block_group = %lu, block_bitmap = %llu",
|
|
+ block_group, bitmap_blk);
|
|
return NULL;
|
|
}
|
|
ext4_valid_block_bitmap(sb, desc, block_group, bh);
|
|
@@ -409,8 +407,7 @@ restart:
|
|
prev = rsv;
|
|
}
|
|
printk("Window map complete.\n");
|
|
- if (bad)
|
|
- BUG();
|
|
+ BUG_ON(bad);
|
|
}
|
|
#define rsv_window_dump(root, verbose) \
|
|
__rsv_window_dump((root), (verbose), __func__)
|
|
@@ -694,7 +691,7 @@ do_more:
|
|
count -= overflow;
|
|
}
|
|
brelse(bitmap_bh);
|
|
- bitmap_bh = read_block_bitmap(sb, block_group);
|
|
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
|
|
if (!bitmap_bh)
|
|
goto error_return;
|
|
desc = ext4_get_group_desc (sb, block_group, &gd_bh);
|
|
@@ -810,6 +807,13 @@ do_more:
|
|
spin_unlock(sb_bgl_lock(sbi, block_group));
|
|
percpu_counter_add(&sbi->s_freeblocks_counter, count);
|
|
|
|
+ if (sbi->s_log_groups_per_flex) {
|
|
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
|
|
+ spin_lock(sb_bgl_lock(sbi, flex_group));
|
|
+ sbi->s_flex_groups[flex_group].free_blocks += count;
|
|
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
|
|
+ }
|
|
+
|
|
/* We dirtied the bitmap block */
|
|
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
|
|
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
|
|
@@ -1598,23 +1602,38 @@ out:
|
|
|
|
/**
|
|
* ext4_has_free_blocks()
|
|
- * @sbi: in-core super block structure.
|
|
+ * @sbi: in-core super block structure.
|
|
+ * @nblocks: number of neeed blocks
|
|
*
|
|
- * Check if filesystem has at least 1 free block available for allocation.
|
|
+ * Check if filesystem has free blocks available for allocation.
|
|
+ * Return the number of blocks avaible for allocation for this request
|
|
+ * On success, return nblocks
|
|
*/
|
|
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
|
|
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
|
|
+ ext4_fsblk_t nblocks)
|
|
{
|
|
- ext4_fsblk_t free_blocks, root_blocks;
|
|
+ ext4_fsblk_t free_blocks;
|
|
+ ext4_fsblk_t root_blocks = 0;
|
|
|
|
free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
|
|
- root_blocks = ext4_r_blocks_count(sbi->s_es);
|
|
- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
|
|
+
|
|
+ if (!capable(CAP_SYS_RESOURCE) &&
|
|
sbi->s_resuid != current->fsuid &&
|
|
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
|
|
+ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
|
|
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
|
|
+#ifdef CONFIG_SMP
|
|
+ if (free_blocks - root_blocks < FBC_BATCH)
|
|
+ free_blocks =
|
|
+ percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
|
|
+#endif
|
|
+ if (free_blocks <= root_blocks)
|
|
+ /* we don't have free space */
|
|
return 0;
|
|
- }
|
|
- return 1;
|
|
-}
|
|
+ if (free_blocks - root_blocks < nblocks)
|
|
+ return free_blocks - root_blocks;
|
|
+ return nblocks;
|
|
+ }
|
|
+
|
|
|
|
/**
|
|
* ext4_should_retry_alloc()
|
|
@@ -1630,7 +1649,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
|
|
*/
|
|
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
|
|
{
|
|
- if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
|
|
+ if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
|
|
return 0;
|
|
|
|
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
|
|
@@ -1639,20 +1658,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
|
|
}
|
|
|
|
/**
|
|
- * ext4_new_blocks_old() -- core block(s) allocation function
|
|
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
|
|
+ *
|
|
* @handle: handle to this transaction
|
|
* @inode: file inode
|
|
* @goal: given target block(filesystem wide)
|
|
* @count: target number of blocks to allocate
|
|
* @errp: error code
|
|
*
|
|
- * ext4_new_blocks uses a goal block to assist allocation. It tries to
|
|
- * allocate block(s) from the block group contains the goal block first. If that
|
|
- * fails, it will try to allocate block(s) from other block groups without
|
|
- * any specific goal block.
|
|
+ * ext4_old_new_blocks uses a goal block to assist allocation and look up
|
|
+ * the block bitmap directly to do block allocation. It tries to
|
|
+ * allocate block(s) from the block group contains the goal block first. If
|
|
+ * that fails, it will try to allocate block(s) from other block groups
|
|
+ * without any specific goal block.
|
|
+ *
|
|
+ * This function is called when -o nomballoc mount option is enabled
|
|
*
|
|
*/
|
|
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
|
|
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
|
|
ext4_fsblk_t goal, unsigned long *count, int *errp)
|
|
{
|
|
struct buffer_head *bitmap_bh = NULL;
|
|
@@ -1676,13 +1699,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
|
|
ext4_group_t ngroups;
|
|
unsigned long num = *count;
|
|
|
|
- *errp = -ENOSPC;
|
|
sb = inode->i_sb;
|
|
if (!sb) {
|
|
+ *errp = -ENODEV;
|
|
printk("ext4_new_block: nonexistent device");
|
|
return 0;
|
|
}
|
|
|
|
+ sbi = EXT4_SB(sb);
|
|
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
|
|
+ /*
|
|
+ * With delalloc we already reserved the blocks
|
|
+ */
|
|
+ *count = ext4_has_free_blocks(sbi, *count);
|
|
+ }
|
|
+ if (*count == 0) {
|
|
+ *errp = -ENOSPC;
|
|
+ return 0; /*return with ENOSPC error */
|
|
+ }
|
|
+ num = *count;
|
|
+
|
|
/*
|
|
* Check quota for allocation of this block.
|
|
*/
|
|
@@ -1706,11 +1742,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
|
|
if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
|
|
my_rsv = &block_i->rsv_window_node;
|
|
|
|
- if (!ext4_has_free_blocks(sbi)) {
|
|
- *errp = -ENOSPC;
|
|
- goto out;
|
|
- }
|
|
-
|
|
/*
|
|
* First, test whether the goal block is free.
|
|
*/
|
|
@@ -1734,7 +1765,7 @@ retry_alloc:
|
|
my_rsv = NULL;
|
|
|
|
if (free_blocks > 0) {
|
|
- bitmap_bh = read_block_bitmap(sb, group_no);
|
|
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
|
|
if (!bitmap_bh)
|
|
goto io_error;
|
|
grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
|
|
@@ -1770,7 +1801,7 @@ retry_alloc:
|
|
continue;
|
|
|
|
brelse(bitmap_bh);
|
|
- bitmap_bh = read_block_bitmap(sb, group_no);
|
|
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
|
|
if (!bitmap_bh)
|
|
goto io_error;
|
|
/*
|
|
@@ -1882,7 +1913,15 @@ allocated:
|
|
le16_add_cpu(&gdp->bg_free_blocks_count, -num);
|
|
gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
|
|
spin_unlock(sb_bgl_lock(sbi, group_no));
|
|
- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
|
|
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag)
|
|
+ percpu_counter_sub(&sbi->s_freeblocks_counter, num);
|
|
+
|
|
+ if (sbi->s_log_groups_per_flex) {
|
|
+ ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
|
|
+ spin_lock(sb_bgl_lock(sbi, flex_group));
|
|
+ sbi->s_flex_groups[flex_group].free_blocks -= num;
|
|
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
|
|
+ }
|
|
|
|
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
|
|
err = ext4_journal_dirty_metadata(handle, gdp_bh);
|
|
@@ -1915,46 +1954,104 @@ out:
|
|
return 0;
|
|
}
|
|
|
|
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
|
|
- ext4_fsblk_t goal, int *errp)
|
|
+#define EXT4_META_BLOCK 0x1
|
|
+
|
|
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
|
|
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
|
|
+ unsigned long *count, int *errp, int flags)
|
|
{
|
|
struct ext4_allocation_request ar;
|
|
ext4_fsblk_t ret;
|
|
|
|
if (!test_opt(inode->i_sb, MBALLOC)) {
|
|
- unsigned long count = 1;
|
|
- ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
|
|
- return ret;
|
|
+ return ext4_old_new_blocks(handle, inode, goal, count, errp);
|
|
}
|
|
|
|
memset(&ar, 0, sizeof(ar));
|
|
+ /* Fill with neighbour allocated blocks */
|
|
+
|
|
ar.inode = inode;
|
|
ar.goal = goal;
|
|
- ar.len = 1;
|
|
+ ar.len = *count;
|
|
+ ar.logical = iblock;
|
|
+
|
|
+ if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
|
|
+ /* enable in-core preallocation for data block allocation */
|
|
+ ar.flags = EXT4_MB_HINT_DATA;
|
|
+ else
|
|
+ /* disable in-core preallocation for non-regular files */
|
|
+ ar.flags = 0;
|
|
+
|
|
ret = ext4_mb_new_blocks(handle, &ar, errp);
|
|
+ *count = ar.len;
|
|
return ret;
|
|
}
|
|
|
|
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
|
|
+/*
|
|
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
|
|
+ *
|
|
+ * @handle: handle to this transaction
|
|
+ * @inode: file inode
|
|
+ * @goal: given target block(filesystem wide)
|
|
+ * @count: total number of blocks need
|
|
+ * @errp: error code
|
|
+ *
|
|
+ * Return 1st allocated block numberon success, *count stores total account
|
|
+ * error stores in errp pointer
|
|
+ */
|
|
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
|
|
ext4_fsblk_t goal, unsigned long *count, int *errp)
|
|
{
|
|
- struct ext4_allocation_request ar;
|
|
ext4_fsblk_t ret;
|
|
-
|
|
- if (!test_opt(inode->i_sb, MBALLOC)) {
|
|
- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
|
|
- return ret;
|
|
+ ret = do_blk_alloc(handle, inode, 0, goal,
|
|
+ count, errp, EXT4_META_BLOCK);
|
|
+ /*
|
|
+ * Account for the allocated meta blocks
|
|
+ */
|
|
+ if (!(*errp)) {
|
|
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
|
|
+ EXT4_I(inode)->i_allocated_meta_blocks += *count;
|
|
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
|
|
}
|
|
-
|
|
- memset(&ar, 0, sizeof(ar));
|
|
- ar.inode = inode;
|
|
- ar.goal = goal;
|
|
- ar.len = *count;
|
|
- ret = ext4_mb_new_blocks(handle, &ar, errp);
|
|
- *count = ar.len;
|
|
return ret;
|
|
}
|
|
|
|
+/*
|
|
+ * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
|
|
+ *
|
|
+ * @handle: handle to this transaction
|
|
+ * @inode: file inode
|
|
+ * @goal: given target block(filesystem wide)
|
|
+ * @errp: error code
|
|
+ *
|
|
+ * Return allocated block number on success
|
|
+ */
|
|
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
|
|
+ ext4_fsblk_t goal, int *errp)
|
|
+{
|
|
+ unsigned long count = 1;
|
|
+ return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * ext4_new_blocks() -- allocate data blocks
|
|
+ *
|
|
+ * @handle: handle to this transaction
|
|
+ * @inode: file inode
|
|
+ * @goal: given target block(filesystem wide)
|
|
+ * @count: total number of blocks need
|
|
+ * @errp: error code
|
|
+ *
|
|
+ * Return 1st allocated block numberon success, *count stores total account
|
|
+ * error stores in errp pointer
|
|
+ */
|
|
+
|
|
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
|
|
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
|
|
+ unsigned long *count, int *errp)
|
|
+{
|
|
+ return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
|
|
+}
|
|
|
|
/**
|
|
* ext4_count_free_blocks() -- count filesystem free blocks
|
|
@@ -1986,7 +2083,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
|
|
continue;
|
|
desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
|
|
brelse(bitmap_bh);
|
|
- bitmap_bh = read_block_bitmap(sb, i);
|
|
+ bitmap_bh = ext4_read_block_bitmap(sb, i);
|
|
if (bitmap_bh == NULL)
|
|
continue;
|
|
|
|
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
|
|
index 2bf0331..ec8e33b 100644
|
|
--- a/fs/ext4/dir.c
|
|
+++ b/fs/ext4/dir.c
|
|
@@ -130,7 +130,8 @@ static int ext4_readdir(struct file * filp,
|
|
struct buffer_head *bh = NULL;
|
|
|
|
map_bh.b_state = 0;
|
|
- err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
|
|
+ err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
|
|
+ 0, 0, 0);
|
|
if (err > 0) {
|
|
pgoff_t index = map_bh.b_blocknr >>
|
|
(PAGE_CACHE_SHIFT - inode->i_blkbits);
|
|
@@ -277,7 +278,7 @@ static void free_rb_tree_fname(struct rb_root *root)
|
|
|
|
while (n) {
|
|
/* Do the node's children first */
|
|
- if ((n)->rb_left) {
|
|
+ if (n->rb_left) {
|
|
n = n->rb_left;
|
|
continue;
|
|
}
|
|
@@ -306,24 +307,18 @@ static void free_rb_tree_fname(struct rb_root *root)
|
|
parent->rb_right = NULL;
|
|
n = parent;
|
|
}
|
|
- root->rb_node = NULL;
|
|
}
|
|
|
|
|
|
-static struct dir_private_info *create_dir_info(loff_t pos)
|
|
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
|
|
{
|
|
struct dir_private_info *p;
|
|
|
|
- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
|
|
+ p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
|
|
if (!p)
|
|
return NULL;
|
|
- p->root.rb_node = NULL;
|
|
- p->curr_node = NULL;
|
|
- p->extra_fname = NULL;
|
|
- p->last_pos = 0;
|
|
p->curr_hash = pos2maj_hash(pos);
|
|
p->curr_minor_hash = pos2min_hash(pos);
|
|
- p->next_hash = 0;
|
|
return p;
|
|
}
|
|
|
|
@@ -421,7 +416,7 @@ static int call_filldir(struct file * filp, void * dirent,
|
|
get_dtype(sb, fname->file_type));
|
|
if (error) {
|
|
filp->f_pos = curr_pos;
|
|
- info->extra_fname = fname->next;
|
|
+ info->extra_fname = fname;
|
|
return error;
|
|
}
|
|
fname = fname->next;
|
|
@@ -438,7 +433,7 @@ static int ext4_dx_readdir(struct file * filp,
|
|
int ret;
|
|
|
|
if (!info) {
|
|
- info = create_dir_info(filp->f_pos);
|
|
+ info = ext4_htree_create_dir_info(filp->f_pos);
|
|
if (!info)
|
|
return -ENOMEM;
|
|
filp->private_data = info;
|
|
@@ -460,11 +455,21 @@ static int ext4_dx_readdir(struct file * filp,
|
|
* If there are any leftover names on the hash collision
|
|
* chain, return them first.
|
|
*/
|
|
- if (info->extra_fname &&
|
|
- call_filldir(filp, dirent, filldir, info->extra_fname))
|
|
- goto finished;
|
|
+ if (info->extra_fname) {
|
|
+ if (call_filldir(filp, dirent, filldir, info->extra_fname))
|
|
+ goto finished;
|
|
|
|
- if (!info->curr_node)
|
|
+ info->extra_fname = NULL;
|
|
+ info->curr_node = rb_next(info->curr_node);
|
|
+ if (!info->curr_node) {
|
|
+ if (info->next_hash == ~0) {
|
|
+ filp->f_pos = EXT4_HTREE_EOF;
|
|
+ goto finished;
|
|
+ }
|
|
+ info->curr_hash = info->next_hash;
|
|
+ info->curr_minor_hash = 0;
|
|
+ }
|
|
+ } else if (!info->curr_node)
|
|
info->curr_node = rb_first(&info->root);
|
|
|
|
while (1) {
|
|
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
|
|
index 8158083..2950032 100644
|
|
--- a/fs/ext4/ext4.h
|
|
+++ b/fs/ext4/ext4.h
|
|
@@ -22,7 +22,7 @@
|
|
#include "ext4_i.h"
|
|
|
|
/*
|
|
- * The second extended filesystem constants/structures
|
|
+ * The fourth extended filesystem constants/structures
|
|
*/
|
|
|
|
/*
|
|
@@ -45,7 +45,7 @@
|
|
#define ext4_debug(f, a...) \
|
|
do { \
|
|
printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
|
|
- __FILE__, __LINE__, __FUNCTION__); \
|
|
+ __FILE__, __LINE__, __func__); \
|
|
printk (KERN_DEBUG f, ## a); \
|
|
} while (0)
|
|
#else
|
|
@@ -74,6 +74,9 @@
|
|
#define EXT4_MB_HINT_GOAL_ONLY 256
|
|
/* goal is meaningful */
|
|
#define EXT4_MB_HINT_TRY_GOAL 512
|
|
+/* blocks already pre-reserved by delayed allocation */
|
|
+#define EXT4_MB_DELALLOC_RESERVED 1024
|
|
+
|
|
|
|
struct ext4_allocation_request {
|
|
/* target inode for block we're allocating */
|
|
@@ -170,6 +173,15 @@ struct ext4_group_desc
|
|
__u32 bg_reserved2[3];
|
|
};
|
|
|
|
+/*
|
|
+ * Structure of a flex block group info
|
|
+ */
|
|
+
|
|
+struct flex_groups {
|
|
+ __u32 free_inodes;
|
|
+ __u32 free_blocks;
|
|
+};
|
|
+
|
|
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
|
|
#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
|
|
#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
|
|
@@ -527,6 +539,7 @@ do { \
|
|
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
|
|
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
|
|
#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
|
|
+#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
|
|
/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
|
|
#ifndef _LINUX_EXT2_FS_H
|
|
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
|
|
@@ -647,7 +660,10 @@ struct ext4_super_block {
|
|
__le16 s_mmp_interval; /* # seconds to wait in MMP checking */
|
|
__le64 s_mmp_block; /* Block for multi-mount protection */
|
|
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
|
|
- __u32 s_reserved[163]; /* Padding to the end of the block */
|
|
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
|
|
+ __u8 s_reserved_char_pad2;
|
|
+ __le16 s_reserved_pad;
|
|
+ __u32 s_reserved[162]; /* Padding to the end of the block */
|
|
};
|
|
|
|
#ifdef __KERNEL__
|
|
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
|
|
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
|
|
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
|
|
ext4_group_t group);
|
|
-extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
|
|
+extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
|
|
ext4_fsblk_t goal, int *errp);
|
|
-extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
|
|
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
|
|
ext4_fsblk_t goal, unsigned long *count, int *errp);
|
|
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
|
|
+extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
|
|
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
|
|
+ unsigned long *count, int *errp);
|
|
+extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
|
|
ext4_fsblk_t goal, unsigned long *count, int *errp);
|
|
+extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
|
|
+ ext4_fsblk_t nblocks);
|
|
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
|
|
ext4_fsblk_t block, unsigned long count, int metadata);
|
|
extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
|
|
@@ -1016,6 +1037,10 @@ extern int __init init_ext4_mballoc(void);
|
|
extern void exit_ext4_mballoc(void);
|
|
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
|
|
unsigned long, unsigned long, int, unsigned long *);
|
|
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
|
|
+ ext4_group_t i, struct ext4_group_desc *desc);
|
|
+extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
|
|
+ ext4_grpblk_t add);
|
|
|
|
|
|
/* inode.c */
|
|
@@ -1033,19 +1058,25 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
|
|
extern struct inode *ext4_iget(struct super_block *, unsigned long);
|
|
extern int ext4_write_inode (struct inode *, int);
|
|
extern int ext4_setattr (struct dentry *, struct iattr *);
|
|
+extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
|
+ struct kstat *stat);
|
|
extern void ext4_delete_inode (struct inode *);
|
|
extern int ext4_sync_inode (handle_t *, struct inode *);
|
|
extern void ext4_discard_reservation (struct inode *);
|
|
extern void ext4_dirty_inode(struct inode *);
|
|
extern int ext4_change_inode_journal_flag(struct inode *, int);
|
|
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
|
|
+extern int ext4_can_truncate(struct inode *inode);
|
|
extern void ext4_truncate (struct inode *);
|
|
extern void ext4_set_inode_flags(struct inode *);
|
|
extern void ext4_get_inode_flags(struct ext4_inode_info *);
|
|
extern void ext4_set_aops(struct inode *inode);
|
|
extern int ext4_writepage_trans_blocks(struct inode *);
|
|
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
|
|
+extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
|
|
+extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
|
|
+extern int ext4_block_truncate_page(handle_t *handle,
|
|
struct address_space *mapping, loff_t from);
|
|
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
|
|
|
|
/* ioctl.c */
|
|
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
|
|
@@ -1159,10 +1190,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
|
|
}
|
|
|
|
|
|
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
|
|
+ ext4_group_t block_group)
|
|
+{
|
|
+ return block_group >> sbi->s_log_groups_per_flex;
|
|
+}
|
|
+
|
|
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
|
|
+{
|
|
+ return 1 << sbi->s_log_groups_per_flex;
|
|
+}
|
|
+
|
|
#define ext4_std_error(sb, errno) \
|
|
do { \
|
|
if ((errno)) \
|
|
- __ext4_std_error((sb), __FUNCTION__, (errno)); \
|
|
+ __ext4_std_error((sb), __func__, (errno)); \
|
|
} while (0)
|
|
|
|
/*
|
|
@@ -1187,11 +1229,13 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
|
|
/* extents.c */
|
|
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
|
|
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
|
|
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
|
|
+ int chunk);
|
|
extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
|
|
ext4_lblk_t iblock,
|
|
unsigned long max_blocks, struct buffer_head *bh_result,
|
|
int create, int extend_disksize);
|
|
-extern void ext4_ext_truncate(struct inode *, struct page *);
|
|
+extern void ext4_ext_truncate(struct inode *);
|
|
extern void ext4_ext_init(struct super_block *);
|
|
extern void ext4_ext_release(struct super_block *);
|
|
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
|
|
@@ -1199,7 +1243,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
|
|
extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
|
|
sector_t block, unsigned long max_blocks,
|
|
struct buffer_head *bh, int create,
|
|
- int extend_disksize);
|
|
+ int extend_disksize, int flag);
|
|
#endif /* __KERNEL__ */
|
|
|
|
#endif /* _EXT4_H */
|
|
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
|
|
index 75333b5..d33dc56 100644
|
|
--- a/fs/ext4/ext4_extents.h
|
|
+++ b/fs/ext4/ext4_extents.h
|
|
@@ -212,10 +212,13 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
|
|
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
|
|
}
|
|
|
|
+extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
|
|
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
|
|
extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
|
|
extern int ext4_extent_tree_init(handle_t *, struct inode *);
|
|
-extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
|
|
+extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
|
|
+ int num,
|
|
+ struct ext4_ext_path *path);
|
|
extern int ext4_ext_try_to_merge(struct inode *inode,
|
|
struct ext4_ext_path *path,
|
|
struct ext4_extent *);
|
|
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
|
|
index 26a4ae2..ef7409f 100644
|
|
--- a/fs/ext4/ext4_i.h
|
|
+++ b/fs/ext4/ext4_i.h
|
|
@@ -79,7 +79,7 @@ struct ext4_ext_cache {
|
|
};
|
|
|
|
/*
|
|
- * third extended file system inode data in memory
|
|
+ * fourth extended file system inode data in memory
|
|
*/
|
|
struct ext4_inode_info {
|
|
__le32 i_data[15]; /* unconverted */
|
|
@@ -150,6 +150,7 @@ struct ext4_inode_info {
|
|
*/
|
|
struct rw_semaphore i_data_sem;
|
|
struct inode vfs_inode;
|
|
+ struct jbd2_inode jinode;
|
|
|
|
unsigned long i_ext_generation;
|
|
struct ext4_ext_cache i_cached_extent;
|
|
@@ -162,6 +163,13 @@ struct ext4_inode_info {
|
|
/* mballoc */
|
|
struct list_head i_prealloc_list;
|
|
spinlock_t i_prealloc_lock;
|
|
+
|
|
+ /* allocation reservation info for delalloc */
|
|
+ unsigned long i_reserved_data_blocks;
|
|
+ unsigned long i_reserved_meta_blocks;
|
|
+ unsigned long i_allocated_meta_blocks;
|
|
+ unsigned short i_delalloc_reserved_flag;
|
|
+ spinlock_t i_block_reservation_lock;
|
|
};
|
|
|
|
#endif /* _EXT4_I */
|
|
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
|
|
index 9255a7d..b455c68 100644
|
|
--- a/fs/ext4/ext4_jbd2.h
|
|
+++ b/fs/ext4/ext4_jbd2.h
|
|
@@ -51,6 +51,14 @@
|
|
EXT4_XATTR_TRANS_BLOCKS - 2 + \
|
|
2*EXT4_QUOTA_TRANS_BLOCKS(sb))
|
|
|
|
+/*
|
|
+ * Define the number of metadata blocks we need to account to modify data.
|
|
+ *
|
|
+ * This include super block, inode block, quota blocks and xattr blocks
|
|
+ */
|
|
+#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
|
|
+ 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
|
|
+
|
|
/* Delete operations potentially hit one directory's namespace plus an
|
|
* entire inode, plus arbitrary amounts of bitmap/indirection data. Be
|
|
* generous. We can grow the delete transaction later if necessary. */
|
|
@@ -142,19 +150,17 @@ int __ext4_journal_dirty_metadata(const char *where,
|
|
handle_t *handle, struct buffer_head *bh);
|
|
|
|
#define ext4_journal_get_undo_access(handle, bh) \
|
|
- __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
|
|
+ __ext4_journal_get_undo_access(__func__, (handle), (bh))
|
|
#define ext4_journal_get_write_access(handle, bh) \
|
|
- __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
|
|
+ __ext4_journal_get_write_access(__func__, (handle), (bh))
|
|
#define ext4_journal_revoke(handle, blocknr, bh) \
|
|
- __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
|
|
+ __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
|
|
#define ext4_journal_get_create_access(handle, bh) \
|
|
- __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
|
|
+ __ext4_journal_get_create_access(__func__, (handle), (bh))
|
|
#define ext4_journal_dirty_metadata(handle, bh) \
|
|
- __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
|
|
+ __ext4_journal_dirty_metadata(__func__, (handle), (bh))
|
|
#define ext4_journal_forget(handle, bh) \
|
|
- __ext4_journal_forget(__FUNCTION__, (handle), (bh))
|
|
-
|
|
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
|
|
+ __ext4_journal_forget(__func__, (handle), (bh))
|
|
|
|
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
|
|
int __ext4_journal_stop(const char *where, handle_t *handle);
|
|
@@ -165,7 +171,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
|
|
}
|
|
|
|
#define ext4_journal_stop(handle) \
|
|
- __ext4_journal_stop(__FUNCTION__, (handle))
|
|
+ __ext4_journal_stop(__func__, (handle))
|
|
|
|
static inline handle_t *ext4_journal_current_handle(void)
|
|
{
|
|
@@ -192,6 +198,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
|
|
return jbd2_journal_force_commit(journal);
|
|
}
|
|
|
|
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
|
|
+{
|
|
+ return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
|
|
+}
|
|
+
|
|
/* super.c */
|
|
int ext4_force_commit(struct super_block *sb);
|
|
|
|
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
|
|
index 5802e69..6300226 100644
|
|
--- a/fs/ext4/ext4_sb.h
|
|
+++ b/fs/ext4/ext4_sb.h
|
|
@@ -25,7 +25,7 @@
|
|
#include <linux/rbtree.h>
|
|
|
|
/*
|
|
- * third extended-fs super-block data in memory
|
|
+ * fourth extended-fs super-block data in memory
|
|
*/
|
|
struct ext4_sb_info {
|
|
unsigned long s_desc_size; /* Size of a group descriptor in bytes */
|
|
@@ -143,6 +143,9 @@ struct ext4_sb_info {
|
|
|
|
/* locality groups */
|
|
struct ext4_locality_group *s_locality_groups;
|
|
+
|
|
+ unsigned int s_log_groups_per_flex;
|
|
+ struct flex_groups *s_flex_groups;
|
|
};
|
|
|
|
#endif /* _EXT4_SB */
|
|
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
|
|
index 47929c4..b24d3c5 100644
|
|
--- a/fs/ext4/extents.c
|
|
+++ b/fs/ext4/extents.c
|
|
@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
|
|
ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
|
|
}
|
|
|
|
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
|
|
+static int ext4_ext_journal_restart(handle_t *handle, int needed)
|
|
{
|
|
int err;
|
|
|
|
if (handle->h_buffer_credits > needed)
|
|
- return handle;
|
|
- if (!ext4_journal_extend(handle, needed))
|
|
- return handle;
|
|
- err = ext4_journal_restart(handle, needed);
|
|
-
|
|
- return handle;
|
|
+ return 0;
|
|
+ err = ext4_journal_extend(handle, needed);
|
|
+ if (err <= 0)
|
|
+ return err;
|
|
+ return ext4_journal_restart(handle, needed);
|
|
}
|
|
|
|
/*
|
|
@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
|
|
return bg_start + colour + block;
|
|
}
|
|
|
|
+/*
|
|
+ * Allocation for a meta data block
|
|
+ */
|
|
static ext4_fsblk_t
|
|
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
|
|
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
|
|
struct ext4_ext_path *path,
|
|
struct ext4_extent *ex, int *err)
|
|
{
|
|
ext4_fsblk_t goal, newblock;
|
|
|
|
goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
|
|
- newblock = ext4_new_block(handle, inode, goal, err);
|
|
+ newblock = ext4_new_meta_block(handle, inode, goal, err);
|
|
return newblock;
|
|
}
|
|
|
|
@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
|
|
return size;
|
|
}
|
|
|
|
+/*
|
|
+ * Calculate the number of metadata blocks needed
|
|
+ * to allocate @blocks
|
|
+ * Worse case is one block per extent
|
|
+ */
|
|
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
|
|
+{
|
|
+ int lcap, icap, rcap, leafs, idxs, num;
|
|
+ int newextents = blocks;
|
|
+
|
|
+ rcap = ext4_ext_space_root_idx(inode);
|
|
+ lcap = ext4_ext_space_block(inode);
|
|
+ icap = ext4_ext_space_block_idx(inode);
|
|
+
|
|
+ /* number of new leaf blocks needed */
|
|
+ num = leafs = (newextents + lcap - 1) / lcap;
|
|
+
|
|
+ /*
|
|
+ * Worse case, we need separate index block(s)
|
|
+ * to link all new leaf blocks
|
|
+ */
|
|
+ idxs = (leafs + icap - 1) / icap;
|
|
+ do {
|
|
+ num += idxs;
|
|
+ idxs = (idxs + icap - 1) / icap;
|
|
+ } while (idxs > rcap);
|
|
+
|
|
+ return num;
|
|
+}
|
|
+
|
|
static int
|
|
ext4_ext_max_entries(struct inode *inode, int depth)
|
|
{
|
|
@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
|
|
alloc = 1;
|
|
}
|
|
path[0].p_hdr = eh;
|
|
+ path[0].p_bh = NULL;
|
|
|
|
i = depth;
|
|
/* walk through the tree */
|
|
@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
|
|
}
|
|
|
|
path[ppos].p_depth = i;
|
|
- path[ppos].p_hdr = eh;
|
|
path[ppos].p_ext = NULL;
|
|
path[ppos].p_idx = NULL;
|
|
|
|
/* find extent */
|
|
ext4_ext_binsearch(inode, path + ppos, block);
|
|
+ /* if not an empty leaf */
|
|
+ if (path[ppos].p_ext)
|
|
+ path[ppos].p_block = ext_pblock(path[ppos].p_ext);
|
|
|
|
ext4_ext_show_path(inode, path);
|
|
|
|
@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
|
|
/* allocate all needed blocks */
|
|
ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
|
|
for (a = 0; a < depth - at; a++) {
|
|
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
|
|
+ newblock = ext4_ext_new_meta_block(handle, inode, path,
|
|
+ newext, &err);
|
|
if (newblock == 0)
|
|
goto cleanup;
|
|
ablocks[a] = newblock;
|
|
@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
|
|
ext4_fsblk_t newblock;
|
|
int err = 0;
|
|
|
|
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
|
|
+ newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
|
|
if (newblock == 0)
|
|
return err;
|
|
|
|
@@ -981,6 +1017,8 @@ repeat:
|
|
/* if we found index with free entry, then use that
|
|
* entry: create all needed subtree and add new leaf */
|
|
err = ext4_ext_split(handle, inode, path, newext, i);
|
|
+ if (err)
|
|
+ goto out;
|
|
|
|
/* refill path */
|
|
ext4_ext_drop_refs(path);
|
|
@@ -1403,7 +1441,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
|
|
|
|
/*
|
|
* get the next allocated block if the extent in the path
|
|
- * is before the requested block(s)
|
|
+ * is before the requested block(s)
|
|
*/
|
|
if (b2 < b1) {
|
|
b2 = ext4_ext_next_allocated_block(path);
|
|
@@ -1709,54 +1747,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
|
|
}
|
|
|
|
/*
|
|
- * ext4_ext_calc_credits_for_insert:
|
|
- * This routine returns max. credits that the extent tree can consume.
|
|
- * It should be OK for low-performance paths like ->writepage()
|
|
- * To allow many writing processes to fit into a single transaction,
|
|
- * the caller should calculate credits under i_data_sem and
|
|
- * pass the actual path.
|
|
+ * ext4_ext_calc_credits_for_single_extent:
|
|
+ * This routine returns max. credits that needed to insert an extent
|
|
+ * to the extent tree.
|
|
+ * When pass the actual path, the caller should calculate credits
|
|
+ * under i_data_sem.
|
|
*/
|
|
-int ext4_ext_calc_credits_for_insert(struct inode *inode,
|
|
+int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
|
|
struct ext4_ext_path *path)
|
|
{
|
|
- int depth, needed;
|
|
-
|
|
if (path) {
|
|
+ int depth = ext_depth(inode);
|
|
+ int ret = 0;
|
|
+
|
|
/* probably there is space in leaf? */
|
|
- depth = ext_depth(inode);
|
|
if (le16_to_cpu(path[depth].p_hdr->eh_entries)
|
|
- < le16_to_cpu(path[depth].p_hdr->eh_max))
|
|
- return 1;
|
|
- }
|
|
+ < le16_to_cpu(path[depth].p_hdr->eh_max)) {
|
|
|
|
- /*
|
|
- * given 32-bit logical block (4294967296 blocks), max. tree
|
|
- * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
|
|
- * Let's also add one more level for imbalance.
|
|
- */
|
|
- depth = 5;
|
|
-
|
|
- /* allocation of new data block(s) */
|
|
- needed = 2;
|
|
+ /*
|
|
+ * There are some space in the leaf tree, no
|
|
+ * need to account for leaf block credit
|
|
+ *
|
|
+ * bitmaps and block group descriptor blocks
|
|
+ * and other metadat blocks still need to be
|
|
+ * accounted.
|
|
+ */
|
|
+ /* 1 bitmap, 1 block group descriptor */
|
|
+ ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
|
|
+ }
|
|
+ }
|
|
|
|
- /*
|
|
- * tree can be full, so it would need to grow in depth:
|
|
- * we need one credit to modify old root, credits for
|
|
- * new root will be added in split accounting
|
|
- */
|
|
- needed += 1;
|
|
+ return ext4_chunk_trans_blocks(inode, nrblocks);
|
|
+}
|
|
|
|
- /*
|
|
- * Index split can happen, we would need:
|
|
- * allocate intermediate indexes (bitmap + group)
|
|
- * + change two blocks at each level, but root (already included)
|
|
- */
|
|
- needed += (depth * 2) + (depth * 2);
|
|
+/*
|
|
+ * How many index/leaf blocks need to change/allocate to modify nrblocks?
|
|
+ *
|
|
+ * if nrblocks fit in a single extent (chunk flag is 1), then
|
|
+ * in the worst case, each tree level index/leaf needs to be changed
|
|
+ * if the tree splits due to inserting a new extent, then the old tree
|
|
+ * index/leaf blocks need to be updated too
|
|
+ *
|
|
+ * If the nrblocks are discontiguous, they could cause
|
|
+ * the whole tree split more than once, but this is really rare.
|
|
+ */
|
|
+int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
|
|
+{
|
|
+ int index;
|
|
+ int depth = ext_depth(inode);
|
|
|
|
- /* any allocation modifies superblock */
|
|
- needed += 1;
|
|
+ if (chunk)
|
|
+ index = depth * 2;
|
|
+ else
|
|
+ index = depth * 3;
|
|
|
|
- return needed;
|
|
+ return index;
|
|
}
|
|
|
|
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
|
|
@@ -1872,22 +1917,22 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
|
|
BUG_ON(b != ex_ee_block + ex_ee_len - 1);
|
|
}
|
|
|
|
- /* at present, extent can't cross block group: */
|
|
- /* leaf + bitmap + group desc + sb + inode */
|
|
- credits = 5;
|
|
+ /*
|
|
+ * 3 for leaf, sb, and inode plus 2 (bmap and group
|
|
+ * descriptor) for each block group; assume two block
|
|
+ * groups plus ex_ee_len/blocks_per_block_group for
|
|
+ * the worst case
|
|
+ */
|
|
+ credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
|
|
if (ex == EXT_FIRST_EXTENT(eh)) {
|
|
correct_index = 1;
|
|
credits += (ext_depth(inode)) + 1;
|
|
}
|
|
-#ifdef CONFIG_QUOTA
|
|
credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
|
|
-#endif
|
|
|
|
- handle = ext4_ext_journal_restart(handle, credits);
|
|
- if (IS_ERR(handle)) {
|
|
- err = PTR_ERR(handle);
|
|
+ err = ext4_ext_journal_restart(handle, credits);
|
|
+ if (err)
|
|
goto out;
|
|
- }
|
|
|
|
err = ext4_ext_get_access(handle, inode, path + depth);
|
|
if (err)
|
|
@@ -2287,7 +2332,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
|
|
unsigned int newdepth;
|
|
/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
|
|
if (allocated <= EXT4_EXT_ZERO_LEN) {
|
|
- /* Mark first half uninitialized.
|
|
+ /*
|
|
+ * iblock == ee_block is handled by the zeroout
|
|
+ * at the beginning.
|
|
+ * Mark first half uninitialized.
|
|
* Mark second half initialized and zero out the
|
|
* initialized extent
|
|
*/
|
|
@@ -2310,7 +2358,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
|
|
ex->ee_len = orig_ex.ee_len;
|
|
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
|
|
ext4_ext_dirty(handle, inode, path + depth);
|
|
- /* zeroed the full extent */
|
|
+ /* blocks available from iblock */
|
|
return allocated;
|
|
|
|
} else if (err)
|
|
@@ -2338,6 +2386,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
|
|
err = PTR_ERR(path);
|
|
return err;
|
|
}
|
|
+ /* get the second half extent details */
|
|
ex = path[depth].p_ext;
|
|
err = ext4_ext_get_access(handle, inode,
|
|
path + depth);
|
|
@@ -2367,6 +2416,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
|
|
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
|
|
ext4_ext_dirty(handle, inode, path + depth);
|
|
/* zeroed the full extent */
|
|
+ /* blocks available from iblock */
|
|
return allocated;
|
|
|
|
} else if (err)
|
|
@@ -2382,23 +2432,22 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
|
|
*/
|
|
orig_ex.ee_len = cpu_to_le16(ee_len -
|
|
ext4_ext_get_actual_len(ex3));
|
|
- if (newdepth != depth) {
|
|
- depth = newdepth;
|
|
- ext4_ext_drop_refs(path);
|
|
- path = ext4_ext_find_extent(inode, iblock, path);
|
|
- if (IS_ERR(path)) {
|
|
- err = PTR_ERR(path);
|
|
- goto out;
|
|
- }
|
|
- eh = path[depth].p_hdr;
|
|
- ex = path[depth].p_ext;
|
|
- if (ex2 != &newex)
|
|
- ex2 = ex;
|
|
-
|
|
- err = ext4_ext_get_access(handle, inode, path + depth);
|
|
- if (err)
|
|
- goto out;
|
|
+ depth = newdepth;
|
|
+ ext4_ext_drop_refs(path);
|
|
+ path = ext4_ext_find_extent(inode, iblock, path);
|
|
+ if (IS_ERR(path)) {
|
|
+ err = PTR_ERR(path);
|
|
+ goto out;
|
|
}
|
|
+ eh = path[depth].p_hdr;
|
|
+ ex = path[depth].p_ext;
|
|
+ if (ex2 != &newex)
|
|
+ ex2 = ex;
|
|
+
|
|
+ err = ext4_ext_get_access(handle, inode, path + depth);
|
|
+ if (err)
|
|
+ goto out;
|
|
+
|
|
allocated = max_blocks;
|
|
|
|
/* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
|
|
@@ -2416,6 +2465,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
|
|
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
|
|
ext4_ext_dirty(handle, inode, path + depth);
|
|
/* zero out the first half */
|
|
+ /* blocks available from iblock */
|
|
return allocated;
|
|
}
|
|
}
|
|
@@ -2529,6 +2579,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
|
|
int err = 0, depth, ret;
|
|
unsigned long allocated = 0;
|
|
struct ext4_allocation_request ar;
|
|
+ loff_t disksize;
|
|
|
|
__clear_bit(BH_New, &bh_result->b_state);
|
|
ext_debug("blocks %u/%lu requested for inode %u\n",
|
|
@@ -2616,8 +2667,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
|
|
*/
|
|
if (allocated > max_blocks)
|
|
allocated = max_blocks;
|
|
- /* mark the buffer unwritten */
|
|
- __set_bit(BH_Unwritten, &bh_result->b_state);
|
|
+ set_buffer_unwritten(bh_result);
|
|
goto out2;
|
|
}
|
|
|
|
@@ -2716,14 +2766,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
|
|
goto out2;
|
|
}
|
|
|
|
- if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
|
|
- EXT4_I(inode)->i_disksize = inode->i_size;
|
|
-
|
|
/* previous routine could use block we allocated */
|
|
newblock = ext_pblock(&newex);
|
|
allocated = ext4_ext_get_actual_len(&newex);
|
|
outnew:
|
|
- __set_bit(BH_New, &bh_result->b_state);
|
|
+ if (extend_disksize) {
|
|
+ disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
|
|
+ if (disksize > i_size_read(inode))
|
|
+ disksize = i_size_read(inode);
|
|
+ if (disksize > EXT4_I(inode)->i_disksize)
|
|
+ EXT4_I(inode)->i_disksize = disksize;
|
|
+ }
|
|
+
|
|
+ set_buffer_new(bh_result);
|
|
|
|
/* Cache only when it is _not_ an uninitialized extent */
|
|
if (create != EXT4_CREATE_UNINITIALIZED_EXT)
|
|
@@ -2733,7 +2788,7 @@ out:
|
|
if (allocated > max_blocks)
|
|
allocated = max_blocks;
|
|
ext4_ext_show_leaf(inode, path);
|
|
- __set_bit(BH_Mapped, &bh_result->b_state);
|
|
+ set_buffer_mapped(bh_result);
|
|
bh_result->b_bdev = inode->i_sb->s_bdev;
|
|
bh_result->b_blocknr = newblock;
|
|
out2:
|
|
@@ -2744,7 +2799,7 @@ out2:
|
|
return err ? err : allocated;
|
|
}
|
|
|
|
-void ext4_ext_truncate(struct inode * inode, struct page *page)
|
|
+void ext4_ext_truncate(struct inode *inode)
|
|
{
|
|
struct address_space *mapping = inode->i_mapping;
|
|
struct super_block *sb = inode->i_sb;
|
|
@@ -2755,33 +2810,27 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
|
|
/*
|
|
* probably first extent we're gonna free will be last in block
|
|
*/
|
|
- err = ext4_writepage_trans_blocks(inode) + 3;
|
|
+ err = ext4_writepage_trans_blocks(inode);
|
|
handle = ext4_journal_start(inode, err);
|
|
- if (IS_ERR(handle)) {
|
|
- if (page) {
|
|
- clear_highpage(page);
|
|
- flush_dcache_page(page);
|
|
- unlock_page(page);
|
|
- page_cache_release(page);
|
|
- }
|
|
+ if (IS_ERR(handle))
|
|
return;
|
|
- }
|
|
|
|
- if (page)
|
|
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
|
|
+ if (inode->i_size & (sb->s_blocksize - 1))
|
|
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
|
|
+
|
|
+ if (ext4_orphan_add(handle, inode))
|
|
+ goto out_stop;
|
|
|
|
down_write(&EXT4_I(inode)->i_data_sem);
|
|
ext4_ext_invalidate_cache(inode);
|
|
|
|
- ext4_mb_discard_inode_preallocations(inode);
|
|
+ ext4_discard_reservation(inode);
|
|
|
|
/*
|
|
* TODO: optimization is possible here.
|
|
* Probably we need not scan at all,
|
|
* because page truncation is enough.
|
|
*/
|
|
- if (ext4_orphan_add(handle, inode))
|
|
- goto out_stop;
|
|
|
|
/* we have to know where to truncate from in crash case */
|
|
EXT4_I(inode)->i_disksize = inode->i_size;
|
|
@@ -2798,6 +2847,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
|
|
handle->h_sync = 1;
|
|
|
|
out_stop:
|
|
+ up_write(&EXT4_I(inode)->i_data_sem);
|
|
/*
|
|
* If this was a simple ftruncate() and the file will remain alive,
|
|
* then we need to clear up the orphan record which we created above.
|
|
@@ -2808,33 +2858,11 @@ out_stop:
|
|
if (inode->i_nlink)
|
|
ext4_orphan_del(handle, inode);
|
|
|
|
- up_write(&EXT4_I(inode)->i_data_sem);
|
|
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
|
|
ext4_mark_inode_dirty(handle, inode);
|
|
ext4_journal_stop(handle);
|
|
}
|
|
|
|
-/*
|
|
- * ext4_ext_writepage_trans_blocks:
|
|
- * calculate max number of blocks we could modify
|
|
- * in order to allocate new block for an inode
|
|
- */
|
|
-int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
|
|
-{
|
|
- int needed;
|
|
-
|
|
- needed = ext4_ext_calc_credits_for_insert(inode, NULL);
|
|
-
|
|
- /* caller wants to allocate num blocks, but note it includes sb */
|
|
- needed = needed * num - (num - 1);
|
|
-
|
|
-#ifdef CONFIG_QUOTA
|
|
- needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
|
|
-#endif
|
|
-
|
|
- return needed;
|
|
-}
|
|
-
|
|
static void ext4_falloc_update_inode(struct inode *inode,
|
|
int mode, loff_t new_size, int update_ctime)
|
|
{
|
|
@@ -2895,10 +2923,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
|
|
max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
|
|
- block;
|
|
/*
|
|
- * credits to insert 1 extent into extent tree + buffers to be able to
|
|
- * modify 1 super block, 1 block bitmap and 1 group descriptor.
|
|
+ * credits to insert 1 extent into extent tree
|
|
*/
|
|
- credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
|
|
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
|
|
mutex_lock(&inode->i_mutex);
|
|
retry:
|
|
while (ret >= 0 && ret < max_blocks) {
|
|
@@ -2911,7 +2938,7 @@ retry:
|
|
}
|
|
ret = ext4_get_blocks_wrap(handle, inode, block,
|
|
max_blocks, &map_bh,
|
|
- EXT4_CREATE_UNINITIALIZED_EXT, 0);
|
|
+ EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
|
|
if (ret <= 0) {
|
|
#ifdef EXT4FS_DEBUG
|
|
WARN_ON(ret <= 0);
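A note on the credit arithmetic this file uses after the change: transaction sizes now come from ext4_chunk_trans_blocks() and ext4_ext_index_trans_blocks() (which charges depth * 2 index/leaf credits for one contiguous chunk and depth * 3 otherwise), and ext4_ext_rm_leaf() charges 7 + 2 * (ex_ee_len / EXT4_BLOCKS_PER_GROUP(sb)) credits per extent it removes, plus 2 * EXT4_QUOTA_TRANS_BLOCKS(sb) when quota is enabled. As a rough worked example, assuming the common 4 KiB block size (so EXT4_BLOCKS_PER_GROUP() == 32768): removing a 100000-block extent costs 7 + 2 * (100000 / 32768) = 7 + 2 * 3 = 13 credits before the quota blocks.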
|
|
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
|
|
index 4159be6..430eb79 100644
|
|
--- a/fs/ext4/file.c
|
|
+++ b/fs/ext4/file.c
|
|
@@ -123,6 +123,23 @@ force_commit:
|
|
return ret;
|
|
}
|
|
|
|
+static struct vm_operations_struct ext4_file_vm_ops = {
|
|
+ .fault = filemap_fault,
|
|
+ .page_mkwrite = ext4_page_mkwrite,
|
|
+};
|
|
+
|
|
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
|
|
+{
|
|
+ struct address_space *mapping = file->f_mapping;
|
|
+
|
|
+ if (!mapping->a_ops->readpage)
|
|
+ return -ENOEXEC;
|
|
+ file_accessed(file);
|
|
+ vma->vm_ops = &ext4_file_vm_ops;
|
|
+ vma->vm_flags |= VM_CAN_NONLINEAR;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
const struct file_operations ext4_file_operations = {
|
|
.llseek = generic_file_llseek,
|
|
.read = do_sync_read,
|
|
@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
|
|
#ifdef CONFIG_COMPAT
|
|
.compat_ioctl = ext4_compat_ioctl,
|
|
#endif
|
|
- .mmap = generic_file_mmap,
|
|
+ .mmap = ext4_file_mmap,
|
|
.open = generic_file_open,
|
|
.release = ext4_release_file,
|
|
.fsync = ext4_sync_file,
|
|
@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
|
|
const struct inode_operations ext4_file_inode_operations = {
|
|
.truncate = ext4_truncate,
|
|
.setattr = ext4_setattr,
|
|
+ .getattr = ext4_getattr,
|
|
#ifdef CONFIG_EXT4DEV_FS_XATTR
|
|
.setxattr = generic_setxattr,
|
|
.getxattr = generic_getxattr,
|
|
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
|
|
index 1c8ba48..a45c373 100644
|
|
--- a/fs/ext4/fsync.c
|
|
+++ b/fs/ext4/fsync.c
|
|
@@ -27,6 +27,7 @@
|
|
#include <linux/sched.h>
|
|
#include <linux/writeback.h>
|
|
#include <linux/jbd2.h>
|
|
+#include <linux/blkdev.h>
|
|
#include "ext4.h"
|
|
#include "ext4_jbd2.h"
|
|
|
|
@@ -45,6 +46,7 @@
|
|
int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
|
|
{
|
|
struct inode *inode = dentry->d_inode;
|
|
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
|
|
int ret = 0;
|
|
|
|
J_ASSERT(ext4_journal_current_handle() == NULL);
|
|
@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
|
|
.nr_to_write = 0, /* sys_fsync did this */
|
|
};
|
|
ret = sync_inode(inode, &wbc);
|
|
+ if (journal && (journal->j_flags & JBD2_BARRIER))
|
|
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
|
|
}
|
|
out:
|
|
return ret;
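The blkdev_issue_flush() added above matters for fsync() on data=ordered filesystems: when fsync() ends up with nothing to commit in the journal, freshly written data may have reached only the drive's volatile write cache, so an explicit cache flush is still needed for durability (the journal commit path already issues one when barriers are in use). A minimal sketch of the branch this hunk touches, error handling omitted:

	ret = sync_inode(inode, &wbc);
	if (journal && (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);	/* force out the disk write cache */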
|
|
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
|
|
index 7eb0604..c2c0a8d 100644
|
|
--- a/fs/ext4/group.h
|
|
+++ b/fs/ext4/group.h
|
|
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
|
|
struct ext4_group_desc *gdp);
|
|
extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
|
|
struct ext4_group_desc *gdp);
|
|
-struct buffer_head *read_block_bitmap(struct super_block *sb,
|
|
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
|
|
ext4_group_t block_group);
|
|
extern unsigned ext4_init_block_bitmap(struct super_block *sb,
|
|
struct buffer_head *bh,
|
|
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
|
|
index c6efbab..f344834 100644
|
|
--- a/fs/ext4/ialloc.c
|
|
+++ b/fs/ext4/ialloc.c
|
|
@@ -97,34 +97,44 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
|
|
* Return buffer_head of bitmap on success or NULL.
|
|
*/
|
|
static struct buffer_head *
|
|
-read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
|
|
+ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
|
|
{
|
|
struct ext4_group_desc *desc;
|
|
struct buffer_head *bh = NULL;
|
|
+ ext4_fsblk_t bitmap_blk;
|
|
|
|
desc = ext4_get_group_desc(sb, block_group, NULL);
|
|
if (!desc)
|
|
- goto error_out;
|
|
+ return NULL;
|
|
+ bitmap_blk = ext4_inode_bitmap(sb, desc);
|
|
+ bh = sb_getblk(sb, bitmap_blk);
|
|
+ if (unlikely(!bh)) {
|
|
+ ext4_error(sb, __func__,
|
|
+ "Cannot read inode bitmap - "
|
|
+ "block_group = %lu, inode_bitmap = %llu",
|
|
+ block_group, bitmap_blk);
|
|
+ return NULL;
|
|
+ }
|
|
+ if (bh_uptodate_or_lock(bh))
|
|
+ return bh;
|
|
+
|
|
+ spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
|
|
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
|
|
- bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc));
|
|
- if (!buffer_uptodate(bh)) {
|
|
- lock_buffer(bh);
|
|
- if (!buffer_uptodate(bh)) {
|
|
- ext4_init_inode_bitmap(sb, bh, block_group,
|
|
- desc);
|
|
- set_buffer_uptodate(bh);
|
|
- }
|
|
- unlock_buffer(bh);
|
|
- }
|
|
- } else {
|
|
- bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
|
|
+ ext4_init_inode_bitmap(sb, bh, block_group, desc);
|
|
+ set_buffer_uptodate(bh);
|
|
+ unlock_buffer(bh);
|
|
+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
|
|
+ return bh;
|
|
}
|
|
- if (!bh)
|
|
- ext4_error(sb, "read_inode_bitmap",
|
|
+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
|
|
+ if (bh_submit_read(bh) < 0) {
|
|
+ put_bh(bh);
|
|
+ ext4_error(sb, __func__,
|
|
"Cannot read inode bitmap - "
|
|
"block_group = %lu, inode_bitmap = %llu",
|
|
- block_group, ext4_inode_bitmap(sb, desc));
|
|
-error_out:
|
|
+ block_group, bitmap_blk);
|
|
+ return NULL;
|
|
+ }
|
|
return bh;
|
|
}
|
|
|
|
@@ -157,6 +167,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
|
|
struct ext4_super_block * es;
|
|
struct ext4_sb_info *sbi;
|
|
int fatal = 0, err;
|
|
+ ext4_group_t flex_group;
|
|
|
|
if (atomic_read(&inode->i_count) > 1) {
|
|
printk ("ext4_free_inode: inode has count=%d\n",
|
|
@@ -199,7 +210,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
|
|
}
|
|
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
|
|
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
|
|
- bitmap_bh = read_inode_bitmap(sb, block_group);
|
|
+ bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
|
|
if (!bitmap_bh)
|
|
goto error_return;
|
|
|
|
@@ -232,6 +243,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
|
|
if (is_directory)
|
|
percpu_counter_dec(&sbi->s_dirs_counter);
|
|
|
|
+ if (sbi->s_log_groups_per_flex) {
|
|
+ flex_group = ext4_flex_group(sbi, block_group);
|
|
+ spin_lock(sb_bgl_lock(sbi, flex_group));
|
|
+ sbi->s_flex_groups[flex_group].free_inodes++;
|
|
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
|
|
+ }
|
|
}
|
|
BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
|
|
err = ext4_journal_dirty_metadata(handle, bh2);
|
|
@@ -286,6 +303,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
|
|
return ret;
|
|
}
|
|
|
|
+#define free_block_ratio 10
|
|
+
|
|
+static int find_group_flex(struct super_block *sb, struct inode *parent,
|
|
+ ext4_group_t *best_group)
|
|
+{
|
|
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
|
|
+ struct ext4_group_desc *desc;
|
|
+ struct buffer_head *bh;
|
|
+ struct flex_groups *flex_group = sbi->s_flex_groups;
|
|
+ ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
|
|
+ ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
|
|
+ ext4_group_t ngroups = sbi->s_groups_count;
|
|
+ int flex_size = ext4_flex_bg_size(sbi);
|
|
+ ext4_group_t best_flex = parent_fbg_group;
|
|
+ int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
|
|
+ int flexbg_free_blocks;
|
|
+ int flex_freeb_ratio;
|
|
+ ext4_group_t n_fbg_groups;
|
|
+ ext4_group_t i;
|
|
+
|
|
+ n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
|
|
+ sbi->s_log_groups_per_flex;
|
|
+
|
|
+find_close_to_parent:
|
|
+ flexbg_free_blocks = flex_group[best_flex].free_blocks;
|
|
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
|
|
+ if (flex_group[best_flex].free_inodes &&
|
|
+ flex_freeb_ratio > free_block_ratio)
|
|
+ goto found_flexbg;
|
|
+
|
|
+ if (best_flex && best_flex == parent_fbg_group) {
|
|
+ best_flex--;
|
|
+ goto find_close_to_parent;
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < n_fbg_groups; i++) {
|
|
+ if (i == parent_fbg_group || i == parent_fbg_group - 1)
|
|
+ continue;
|
|
+
|
|
+ flexbg_free_blocks = flex_group[i].free_blocks;
|
|
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
|
|
+
|
|
+ if (flex_freeb_ratio > free_block_ratio &&
|
|
+ flex_group[i].free_inodes) {
|
|
+ best_flex = i;
|
|
+ goto found_flexbg;
|
|
+ }
|
|
+
|
|
+ if (flex_group[best_flex].free_inodes == 0 ||
|
|
+ (flex_group[i].free_blocks >
|
|
+ flex_group[best_flex].free_blocks &&
|
|
+ flex_group[i].free_inodes))
|
|
+ best_flex = i;
|
|
+ }
|
|
+
|
|
+ if (!flex_group[best_flex].free_inodes ||
|
|
+ !flex_group[best_flex].free_blocks)
|
|
+ return -1;
|
|
+
|
|
+found_flexbg:
|
|
+ for (i = best_flex * flex_size; i < ngroups &&
|
|
+ i < (best_flex + 1) * flex_size; i++) {
|
|
+ desc = ext4_get_group_desc(sb, i, &bh);
|
|
+ if (le16_to_cpu(desc->bg_free_inodes_count)) {
|
|
+ *best_group = i;
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return -1;
|
|
+out:
|
|
+ return 0;
|
|
+}
|
|
+
|
|
/*
|
|
* Orlov's allocator for directories.
|
|
*
|
|
@@ -501,6 +592,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
|
|
struct inode *ret;
|
|
ext4_group_t i;
|
|
int free = 0;
|
|
+ ext4_group_t flex_group;
|
|
|
|
/* Cannot create files in a deleted directory */
|
|
if (!dir || !dir->i_nlink)
|
|
@@ -514,6 +606,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
|
|
|
|
sbi = EXT4_SB(sb);
|
|
es = sbi->s_es;
|
|
+
|
|
+ if (sbi->s_log_groups_per_flex) {
|
|
+ ret2 = find_group_flex(sb, dir, &group);
|
|
+ goto got_group;
|
|
+ }
|
|
+
|
|
if (S_ISDIR(mode)) {
|
|
if (test_opt (sb, OLDALLOC))
|
|
ret2 = find_group_dir(sb, dir, &group);
|
|
@@ -522,6 +620,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
|
|
} else
|
|
ret2 = find_group_other(sb, dir, &group);
|
|
|
|
+got_group:
|
|
err = -ENOSPC;
|
|
if (ret2 == -1)
|
|
goto out;
|
|
@@ -534,7 +633,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
|
|
goto fail;
|
|
|
|
brelse(bitmap_bh);
|
|
- bitmap_bh = read_inode_bitmap(sb, group);
|
|
+ bitmap_bh = ext4_read_inode_bitmap(sb, group);
|
|
if (!bitmap_bh)
|
|
goto fail;
|
|
|
|
@@ -600,7 +699,7 @@ got:
|
|
/* We may have to initialize the block bitmap if it isn't already */
|
|
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
|
|
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
|
|
- struct buffer_head *block_bh = read_block_bitmap(sb, group);
|
|
+ struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
|
|
|
|
BUFFER_TRACE(block_bh, "get block bitmap access");
|
|
err = ext4_journal_get_write_access(handle, block_bh);
|
|
@@ -639,7 +738,7 @@ got:
|
|
|
|
/* When marking the block group with
|
|
* ~EXT4_BG_INODE_UNINIT we don't want to depend
|
|
- * on the value of bg_itable_unsed even though
|
|
+ * on the value of bg_itable_unused even though
|
|
* mke2fs could have initialized the same for us.
|
|
* Instead we calculated the value below
|
|
*/
|
|
@@ -676,6 +775,13 @@ got:
|
|
percpu_counter_inc(&sbi->s_dirs_counter);
|
|
sb->s_dirt = 1;
|
|
|
|
+ if (sbi->s_log_groups_per_flex) {
|
|
+ flex_group = ext4_flex_group(sbi, group);
|
|
+ spin_lock(sb_bgl_lock(sbi, flex_group));
|
|
+ sbi->s_flex_groups[flex_group].free_inodes--;
|
|
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
|
|
+ }
|
|
+
|
|
inode->i_uid = current->fsuid;
|
|
if (test_opt (sb, GRPID))
|
|
inode->i_gid = dir->i_gid;
|
|
@@ -740,14 +846,10 @@ got:
|
|
goto fail_free_drop;
|
|
|
|
if (test_opt(sb, EXTENTS)) {
|
|
- /* set extent flag only for diretory, file and normal symlink*/
|
|
+ /* set extent flag only for directory, file and normal symlink*/
|
|
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
|
|
EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
|
|
ext4_ext_tree_init(handle, inode);
|
|
- err = ext4_update_incompat_feature(handle, sb,
|
|
- EXT4_FEATURE_INCOMPAT_EXTENTS);
|
|
- if (err)
|
|
- goto fail_free_drop;
|
|
}
|
|
}
|
|
|
|
@@ -799,7 +901,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
|
|
|
|
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
|
|
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
|
|
- bitmap_bh = read_inode_bitmap(sb, block_group);
|
|
+ bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
|
|
if (!bitmap_bh) {
|
|
ext4_warning(sb, __func__,
|
|
"inode bitmap error for orphan %lu", ino);
|
|
@@ -817,6 +919,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
|
|
if (IS_ERR(inode))
|
|
goto iget_failed;
|
|
|
|
+ /*
|
|
+ * If the orphan has i_nlink > 0 then it should be able to be
|
|
+ * truncated, otherwise it won't be removed from the orphan list
|
|
+ * during processing and an infinite loop will result.
|
|
+ */
|
|
+ if (inode->i_nlink && !ext4_can_truncate(inode))
|
|
+ goto bad_orphan;
|
|
+
|
|
if (NEXT_ORPHAN(inode) > max_ino)
|
|
goto bad_orphan;
|
|
brelse(bitmap_bh);
|
|
@@ -838,6 +948,7 @@ bad_orphan:
|
|
printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
|
|
NEXT_ORPHAN(inode));
|
|
printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
|
|
+ printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
|
|
/* Avoid freeing blocks if we got a bad deleted inode */
|
|
if (inode->i_nlink == 0)
|
|
inode->i_blocks = 0;
|
|
@@ -868,7 +979,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
|
|
continue;
|
|
desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
|
|
brelse(bitmap_bh);
|
|
- bitmap_bh = read_inode_bitmap(sb, i);
|
|
+ bitmap_bh = ext4_read_inode_bitmap(sb, i);
|
|
if (!bitmap_bh)
|
|
continue;
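A note on the flex_bg-aware allocator added above: find_group_flex() keeps allocating new inodes out of the parent directory's flex group as long as that flex group still has free inodes and its free-block ratio stays above free_block_ratio (10 percent); otherwise it scans the remaining flex groups for one above the threshold, falling back to the one with the most free blocks. A rough illustration, assuming the common mke2fs defaults of 16 block groups per flex group and 4 KiB blocks (32768 blocks per group): blocks_per_flex = 16 * 32768 = 524288, and because flex_freeb_ratio is computed with integer division the parent flex group keeps being preferred only while it has at least 57672 free blocks (an effective threshold of 11 percent).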
|
|
|
|
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
|
|
index 8d97077..3c0195a 100644
|
|
--- a/fs/ext4/inode.c
|
|
+++ b/fs/ext4/inode.c
|
|
@@ -32,12 +32,25 @@
|
|
#include <linux/string.h>
|
|
#include <linux/buffer_head.h>
|
|
#include <linux/writeback.h>
|
|
+#include <linux/pagevec.h>
|
|
#include <linux/mpage.h>
|
|
#include <linux/uio.h>
|
|
#include <linux/bio.h>
|
|
#include "ext4_jbd2.h"
|
|
#include "xattr.h"
|
|
#include "acl.h"
|
|
+#include "ext4_extents.h"
|
|
+
|
|
+#define MPAGE_DA_EXTENT_TAIL 0x01
|
|
+
|
|
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
|
|
+ loff_t new_size)
|
|
+{
|
|
+ return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
|
|
+ new_size);
|
|
+}
|
|
+
|
|
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
|
|
|
|
/*
|
|
* Test whether an inode is a fast symlink.
|
|
@@ -180,14 +193,18 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
|
|
void ext4_delete_inode (struct inode * inode)
|
|
{
|
|
handle_t *handle;
|
|
+ int err;
|
|
|
|
+ if (ext4_should_order_data(inode))
|
|
+ ext4_begin_ordered_truncate(inode, 0);
|
|
truncate_inode_pages(&inode->i_data, 0);
|
|
|
|
if (is_bad_inode(inode))
|
|
goto no_delete;
|
|
|
|
- handle = start_transaction(inode);
|
|
+ handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
|
|
if (IS_ERR(handle)) {
|
|
+ ext4_std_error(inode->i_sb, PTR_ERR(handle));
|
|
/*
|
|
* If we're going to skip the normal cleanup, we still need to
|
|
* make sure that the in-core orphan linked list is properly
|
|
@@ -200,8 +217,34 @@ void ext4_delete_inode (struct inode * inode)
|
|
if (IS_SYNC(inode))
|
|
handle->h_sync = 1;
|
|
inode->i_size = 0;
|
|
+ err = ext4_mark_inode_dirty(handle, inode);
|
|
+ if (err) {
|
|
+ ext4_warning(inode->i_sb, __func__,
|
|
+ "couldn't mark inode dirty (err %d)", err);
|
|
+ goto stop_handle;
|
|
+ }
|
|
if (inode->i_blocks)
|
|
ext4_truncate(inode);
|
|
+
|
|
+ /*
|
|
+ * ext4_ext_truncate() doesn't reserve any slop when it
|
|
+ * restarts journal transactions; therefore there may not be
|
|
+ * enough credits left in the handle to remove the inode from
|
|
+ * the orphan list and set the dtime field.
|
|
+ */
|
|
+ if (handle->h_buffer_credits < 3) {
|
|
+ err = ext4_journal_extend(handle, 3);
|
|
+ if (err > 0)
|
|
+ err = ext4_journal_restart(handle, 3);
|
|
+ if (err != 0) {
|
|
+ ext4_warning(inode->i_sb, __func__,
|
|
+ "couldn't extend journal (err %d)", err);
|
|
+ stop_handle:
|
|
+ ext4_journal_stop(handle);
|
|
+ goto no_delete;
|
|
+ }
|
|
+ }
|
|
+
|
|
/*
|
|
* Kill off the orphan record which ext4_truncate created.
|
|
* AKPM: I think this can be inside the above `if'.
|
|
@@ -508,11 +551,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
|
|
* direct blocks
|
|
*/
|
|
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
|
|
- ext4_fsblk_t goal, int indirect_blks, int blks,
|
|
- ext4_fsblk_t new_blocks[4], int *err)
|
|
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
|
|
+ int indirect_blks, int blks,
|
|
+ ext4_fsblk_t new_blocks[4], int *err)
|
|
{
|
|
int target, i;
|
|
- unsigned long count = 0;
|
|
+ unsigned long count = 0, blk_allocated = 0;
|
|
int index = 0;
|
|
ext4_fsblk_t current_block = 0;
|
|
int ret = 0;
|
|
@@ -525,12 +569,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
|
|
* the first direct block of this branch. That's the
|
|
* minimum number of blocks need to allocate(required)
|
|
*/
|
|
- target = blks + indirect_blks;
|
|
-
|
|
- while (1) {
|
|
+ /* first we try to allocate the indirect blocks */
|
|
+ target = indirect_blks;
|
|
+ while (target > 0) {
|
|
count = target;
|
|
/* allocating blocks for indirect blocks and direct blocks */
|
|
- current_block = ext4_new_blocks(handle,inode,goal,&count,err);
|
|
+ current_block = ext4_new_meta_blocks(handle, inode,
|
|
+ goal, &count, err);
|
|
if (*err)
|
|
goto failed_out;
|
|
|
|
@@ -540,16 +585,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
|
|
new_blocks[index++] = current_block++;
|
|
count--;
|
|
}
|
|
-
|
|
- if (count > 0)
|
|
+ if (count > 0) {
|
|
+ /*
|
|
+ * save the new block number
|
|
+ * for the first direct block
|
|
+ */
|
|
+ new_blocks[index] = current_block;
|
|
+ printk(KERN_INFO "%s returned more blocks than "
|
|
+ "requested\n", __func__);
|
|
+ WARN_ON(1);
|
|
break;
|
|
+ }
|
|
}
|
|
|
|
- /* save the new block number for the first direct block */
|
|
- new_blocks[index] = current_block;
|
|
-
|
|
+ target = blks - count ;
|
|
+ blk_allocated = count;
|
|
+ if (!target)
|
|
+ goto allocated;
|
|
+ /* Now allocate data blocks */
|
|
+ count = target;
|
|
+ /* allocating blocks for data blocks */
|
|
+ current_block = ext4_new_blocks(handle, inode, iblock,
|
|
+ goal, &count, err);
|
|
+ if (*err && (target == blks)) {
|
|
+ /*
|
|
+ * if the allocation failed and we didn't allocate
|
|
+ * any blocks before
|
|
+ */
|
|
+ goto failed_out;
|
|
+ }
|
|
+ if (!*err) {
|
|
+ if (target == blks) {
|
|
+ /*
|
|
+ * save the new block number
|
|
+ * for the first direct block
|
|
+ */
|
|
+ new_blocks[index] = current_block;
|
|
+ }
|
|
+ blk_allocated += count;
|
|
+ }
|
|
+allocated:
|
|
/* total number of blocks allocated for direct blocks */
|
|
- ret = count;
|
|
+ ret = blk_allocated;
|
|
*err = 0;
|
|
return ret;
|
|
failed_out:
|
|
@@ -584,8 +661,9 @@ failed_out:
|
|
* as described above and return 0.
|
|
*/
|
|
static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
|
|
- int indirect_blks, int *blks, ext4_fsblk_t goal,
|
|
- ext4_lblk_t *offsets, Indirect *branch)
|
|
+ ext4_lblk_t iblock, int indirect_blks,
|
|
+ int *blks, ext4_fsblk_t goal,
|
|
+ ext4_lblk_t *offsets, Indirect *branch)
|
|
{
|
|
int blocksize = inode->i_sb->s_blocksize;
|
|
int i, n = 0;
|
|
@@ -595,7 +673,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
|
|
ext4_fsblk_t new_blocks[4];
|
|
ext4_fsblk_t current_block;
|
|
|
|
- num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
|
|
+ num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
|
|
*blks, new_blocks, &err);
|
|
if (err)
|
|
return err;
|
|
@@ -799,6 +877,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
|
|
struct ext4_inode_info *ei = EXT4_I(inode);
|
|
int count = 0;
|
|
ext4_fsblk_t first_block = 0;
|
|
+ loff_t disksize;
|
|
|
|
|
|
J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
|
|
@@ -855,8 +934,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
|
|
/*
|
|
* Block out ext4_truncate while we alter the tree
|
|
*/
|
|
- err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
|
|
- offsets + (partial - chain), partial);
|
|
+ err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
|
|
+ &count, goal,
|
|
+ offsets + (partial - chain), partial);
|
|
|
|
/*
|
|
* The ext4_splice_branch call will free and forget any buffers
|
|
@@ -873,8 +953,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
|
|
* protect it if you're about to implement concurrent
|
|
* ext4_get_block() -bzzz
|
|
*/
|
|
- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
|
|
- ei->i_disksize = inode->i_size;
|
|
+ if (!err && extend_disksize) {
|
|
+ disksize = ((loff_t) iblock + count) << inode->i_blkbits;
|
|
+ if (disksize > i_size_read(inode))
|
|
+ disksize = i_size_read(inode);
|
|
+ if (disksize > ei->i_disksize)
|
|
+ ei->i_disksize = disksize;
|
|
+ }
|
|
if (err)
|
|
goto cleanup;
|
|
|
|
@@ -897,23 +982,74 @@ out:
|
|
return err;
|
|
}
|
|
|
|
-/* Maximum number of blocks we map for direct IO at once. */
|
|
-#define DIO_MAX_BLOCKS 4096
|
|
/*
|
|
- * Number of credits we need for writing DIO_MAX_BLOCKS:
|
|
- * We need sb + group descriptor + bitmap + inode -> 4
|
|
- * For B blocks with A block pointers per block we need:
|
|
- * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
|
|
- * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
|
|
+ * Calculate the number of metadata blocks needed to reserve
|
|
+ * to allocate @blocks for a non-extent based file
|
|
*/
|
|
-#define DIO_CREDITS 25
|
|
+static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
|
|
+{
|
|
+ int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
|
|
+ int ind_blks, dind_blks, tind_blks;
|
|
+
|
|
+ /* number of new indirect blocks needed */
|
|
+ ind_blks = (blocks + icap - 1) / icap;
|
|
+
|
|
+ dind_blks = (ind_blks + icap - 1) / icap;
|
|
|
|
+ tind_blks = 1;
|
|
+
|
|
+ return ind_blks + dind_blks + tind_blks;
|
|
+}
|
|
|
|
/*
|
|
+ * Calculate the number of metadata blocks needed to reserve
|
|
+ * to allocate the given number of blocks
|
|
+ */
|
|
+static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
|
|
+{
|
|
+ if (!blocks)
|
|
+ return 0;
|
|
+
|
|
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
|
|
+ return ext4_ext_calc_metadata_amount(inode, blocks);
|
|
+
|
|
+ return ext4_indirect_calc_metadata_amount(inode, blocks);
|
|
+}
|
|
+
|
|
+static void ext4_da_update_reserve_space(struct inode *inode, int used)
|
|
+{
|
|
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
|
+ int total, mdb, mdb_free;
|
|
+
|
|
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
|
|
+ /* recalculate the number of metablocks that still need to be reserved */
|
|
+ total = EXT4_I(inode)->i_reserved_data_blocks - used;
|
|
+ mdb = ext4_calc_metadata_amount(inode, total);
|
|
+
|
|
+ /* figure out how many metablocks to release */
|
|
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
|
|
+ mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
|
|
+
|
|
+ /* Account for allocated meta_blocks */
|
|
+ mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
|
|
+
|
|
+ /* update fs free blocks counter for truncate case */
|
|
+ percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
|
|
+
|
|
+ /* update per-inode reservations */
|
|
+ BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
|
|
+ EXT4_I(inode)->i_reserved_data_blocks -= used;
|
|
+
|
|
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
|
|
+ EXT4_I(inode)->i_reserved_meta_blocks = mdb;
|
|
+ EXT4_I(inode)->i_allocated_meta_blocks = 0;
|
|
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * The ext4_get_blocks_wrap() function tries to look up the requested blocks,
|
|
+ * and returns if the blocks are already mapped.
|
|
*
|
|
- *
|
|
- * ext4_ext4 get_block() wrapper function
|
|
- * It will do a look up first, and returns if the blocks already mapped.
|
|
* Otherwise it takes the write lock of the i_data_sem and allocate blocks
|
|
* and store the allocated blocks in the result buffer head and mark it
|
|
* mapped.
|
|
@@ -934,7 +1070,7 @@ out:
|
|
*/
|
|
int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
|
|
unsigned long max_blocks, struct buffer_head *bh,
|
|
- int create, int extend_disksize)
|
|
+ int create, int extend_disksize, int flag)
|
|
{
|
|
int retval;
|
|
|
|
@@ -975,6 +1111,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
|
|
* with create == 1 flag.
|
|
*/
|
|
down_write((&EXT4_I(inode)->i_data_sem));
|
|
+
|
|
+ /*
|
|
+ * if the caller is from delayed allocation writeout path
|
|
+ * we have already reserved fs blocks for allocation
|
|
+ * let the underlying get_block() function know to
|
|
+ * avoid double accounting
|
|
+ */
|
|
+ if (flag)
|
|
+ EXT4_I(inode)->i_delalloc_reserved_flag = 1;
|
|
/*
|
|
* We need to check for EXT4 here because migrate
|
|
* could have changed the inode type in between
|
|
@@ -996,23 +1141,39 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
|
|
~EXT4_EXT_MIGRATE;
|
|
}
|
|
}
|
|
+
|
|
+ if (flag) {
|
|
+ EXT4_I(inode)->i_delalloc_reserved_flag = 0;
|
|
+ /*
|
|
+ * Update reserved blocks/metadata blocks
|
|
+ * after successful block allocation
|
|
+ * which were deferred till now
|
|
+ */
|
|
+ if ((retval > 0) && buffer_delay(bh))
|
|
+ ext4_da_update_reserve_space(inode, retval);
|
|
+ }
|
|
+
|
|
up_write((&EXT4_I(inode)->i_data_sem));
|
|
return retval;
|
|
}
|
|
|
|
+/* Maximum number of blocks we map for direct IO at once. */
|
|
+#define DIO_MAX_BLOCKS 4096
|
|
+
|
|
static int ext4_get_block(struct inode *inode, sector_t iblock,
|
|
struct buffer_head *bh_result, int create)
|
|
{
|
|
handle_t *handle = ext4_journal_current_handle();
|
|
int ret = 0, started = 0;
|
|
unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
|
|
+ int dio_credits;
|
|
|
|
if (create && !handle) {
|
|
/* Direct IO write... */
|
|
if (max_blocks > DIO_MAX_BLOCKS)
|
|
max_blocks = DIO_MAX_BLOCKS;
|
|
- handle = ext4_journal_start(inode, DIO_CREDITS +
|
|
- 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
|
|
+ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
|
|
+ handle = ext4_journal_start(inode, dio_credits);
|
|
if (IS_ERR(handle)) {
|
|
ret = PTR_ERR(handle);
|
|
goto out;
|
|
@@ -1021,7 +1182,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
|
|
}
|
|
|
|
ret = ext4_get_blocks_wrap(handle, inode, iblock,
|
|
- max_blocks, bh_result, create, 0);
|
|
+ max_blocks, bh_result, create, 0, 0);
|
|
if (ret > 0) {
|
|
bh_result->b_size = (ret << inode->i_blkbits);
|
|
ret = 0;
|
|
@@ -1047,7 +1208,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
|
|
dummy.b_blocknr = -1000;
|
|
buffer_trace_init(&dummy.b_history);
|
|
err = ext4_get_blocks_wrap(handle, inode, block, 1,
|
|
- &dummy, create, 1);
|
|
+ &dummy, create, 1, 0);
|
|
/*
|
|
* ext4_get_blocks_handle() returns number of blocks
|
|
* mapped. 0 in case of a HOLE.
|
|
@@ -1203,19 +1364,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
|
|
to = from + len;
|
|
|
|
retry:
|
|
- page = __grab_cache_page(mapping, index);
|
|
- if (!page)
|
|
- return -ENOMEM;
|
|
- *pagep = page;
|
|
-
|
|
handle = ext4_journal_start(inode, needed_blocks);
|
|
if (IS_ERR(handle)) {
|
|
- unlock_page(page);
|
|
- page_cache_release(page);
|
|
ret = PTR_ERR(handle);
|
|
goto out;
|
|
}
|
|
|
|
+ page = __grab_cache_page(mapping, index);
|
|
+ if (!page) {
|
|
+ ext4_journal_stop(handle);
|
|
+ ret = -ENOMEM;
|
|
+ goto out;
|
|
+ }
|
|
+ *pagep = page;
|
|
+
|
|
ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
|
|
ext4_get_block);
|
|
|
|
@@ -1225,8 +1387,8 @@ retry:
|
|
}
|
|
|
|
if (ret) {
|
|
- ext4_journal_stop(handle);
|
|
unlock_page(page);
|
|
+ ext4_journal_stop(handle);
|
|
page_cache_release(page);
|
|
}
|
|
|
|
@@ -1236,15 +1398,6 @@ out:
|
|
return ret;
|
|
}
|
|
|
|
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
|
|
-{
|
|
- int err = jbd2_journal_dirty_data(handle, bh);
|
|
- if (err)
|
|
- ext4_journal_abort_handle(__func__, __func__,
|
|
- bh, handle, err);
|
|
- return err;
|
|
-}
|
|
-
|
|
/* For write_end() in data=journal mode */
|
|
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
|
|
{
|
|
@@ -1255,29 +1408,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
|
|
}
|
|
|
|
/*
|
|
- * Generic write_end handler for ordered and writeback ext4 journal modes.
|
|
- * We can't use generic_write_end, because that unlocks the page and we need to
|
|
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
|
|
- * after block_write_end.
|
|
- */
|
|
-static int ext4_generic_write_end(struct file *file,
|
|
- struct address_space *mapping,
|
|
- loff_t pos, unsigned len, unsigned copied,
|
|
- struct page *page, void *fsdata)
|
|
-{
|
|
- struct inode *inode = file->f_mapping->host;
|
|
-
|
|
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
|
|
-
|
|
- if (pos+copied > inode->i_size) {
|
|
- i_size_write(inode, pos+copied);
|
|
- mark_inode_dirty(inode);
|
|
- }
|
|
-
|
|
- return copied;
|
|
-}
|
|
-
|
|
-/*
|
|
* We need to pick up the new inode size which generic_commit_write gave us
|
|
* `file' can be NULL - eg, when called from page_symlink().
|
|
*
|
|
@@ -1290,15 +1420,10 @@ static int ext4_ordered_write_end(struct file *file,
|
|
struct page *page, void *fsdata)
|
|
{
|
|
handle_t *handle = ext4_journal_current_handle();
|
|
- struct inode *inode = file->f_mapping->host;
|
|
- unsigned from, to;
|
|
+ struct inode *inode = mapping->host;
|
|
int ret = 0, ret2;
|
|
|
|
- from = pos & (PAGE_CACHE_SIZE - 1);
|
|
- to = from + len;
|
|
-
|
|
- ret = walk_page_buffers(handle, page_buffers(page),
|
|
- from, to, NULL, ext4_journal_dirty_data);
|
|
+ ret = ext4_jbd2_file_inode(handle, inode);
|
|
|
|
if (ret == 0) {
|
|
/*
|
|
@@ -1311,7 +1436,7 @@ static int ext4_ordered_write_end(struct file *file,
|
|
new_i_size = pos + copied;
|
|
if (new_i_size > EXT4_I(inode)->i_disksize)
|
|
EXT4_I(inode)->i_disksize = new_i_size;
|
|
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
|
|
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
|
|
page, fsdata);
|
|
copied = ret2;
|
|
if (ret2 < 0)
|
|
@@ -1320,8 +1445,6 @@ static int ext4_ordered_write_end(struct file *file,
|
|
ret2 = ext4_journal_stop(handle);
|
|
if (!ret)
|
|
ret = ret2;
|
|
- unlock_page(page);
|
|
- page_cache_release(page);
|
|
|
|
return ret ? ret : copied;
|
|
}
|
|
@@ -1332,7 +1455,7 @@ static int ext4_writeback_write_end(struct file *file,
|
|
struct page *page, void *fsdata)
|
|
{
|
|
handle_t *handle = ext4_journal_current_handle();
|
|
- struct inode *inode = file->f_mapping->host;
|
|
+ struct inode *inode = mapping->host;
|
|
int ret = 0, ret2;
|
|
loff_t new_i_size;
|
|
|
|
@@ -1340,7 +1463,7 @@ static int ext4_writeback_write_end(struct file *file,
|
|
if (new_i_size > EXT4_I(inode)->i_disksize)
|
|
EXT4_I(inode)->i_disksize = new_i_size;
|
|
|
|
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
|
|
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
|
|
page, fsdata);
|
|
copied = ret2;
|
|
if (ret2 < 0)
|
|
@@ -1349,8 +1472,6 @@ static int ext4_writeback_write_end(struct file *file,
|
|
ret2 = ext4_journal_stop(handle);
|
|
if (!ret)
|
|
ret = ret2;
|
|
- unlock_page(page);
|
|
- page_cache_release(page);
|
|
|
|
return ret ? ret : copied;
|
|
}
|
|
@@ -1389,15 +1510,1028 @@ static int ext4_journalled_write_end(struct file *file,
|
|
ret = ret2;
|
|
}
|
|
|
|
+ unlock_page(page);
|
|
ret2 = ext4_journal_stop(handle);
|
|
if (!ret)
|
|
ret = ret2;
|
|
- unlock_page(page);
|
|
page_cache_release(page);
|
|
|
|
return ret ? ret : copied;
|
|
}
|
|
|
|
+static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
|
|
+{
|
|
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
|
+ unsigned long md_needed, mdblocks, total = 0;
|
|
+
|
|
+ /*
|
|
+ * recalculate the amount of metadata blocks to reserve
|
|
+ * in order to allocate nrblocks
|
|
+ * worst case is one extent per block
|
|
+ */
|
|
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
|
|
+ total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
|
|
+ mdblocks = ext4_calc_metadata_amount(inode, total);
|
|
+ BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
|
|
+
|
|
+ md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
|
|
+ total = md_needed + nrblocks;
|
|
+
|
|
+ if (ext4_has_free_blocks(sbi, total) < total) {
|
|
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
|
|
+ return -ENOSPC;
|
|
+ }
|
|
+ /* reduce fs free blocks counter */
|
|
+ percpu_counter_sub(&sbi->s_freeblocks_counter, total);
|
|
+
|
|
+ EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
|
|
+ EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
|
|
+
|
|
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
|
|
+ return 0; /* success */
|
|
+}
|
|
+
|
|
+static void ext4_da_release_space(struct inode *inode, int to_free)
|
|
+{
|
|
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
|
+ int total, mdb, mdb_free, release;
|
|
+
|
|
+ if (!to_free)
|
|
+ return; /* Nothing to release, exit */
|
|
+
|
|
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
|
|
+
|
|
+ if (!EXT4_I(inode)->i_reserved_data_blocks) {
|
|
+ /*
|
|
+ * if there is no reserved blocks, but we try to free some
|
|
+ * then the counter is messed up somewhere.
|
|
+ * but since this function is called from invalidate
|
|
+ * page, it's harmless to return without any action
|
|
+ */
|
|
+ printk(KERN_INFO "ext4 delalloc try to release %d reserved "
|
|
+ "blocks for inode %lu, but there is no reserved "
|
|
+ "data blocks\n", to_free, inode->i_ino);
|
|
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ /* recalculate the number of metablocks that still need to be reserved */
|
|
+ total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
|
|
+ mdb = ext4_calc_metadata_amount(inode, total);
|
|
+
|
|
+ /* figure out how many metablocks to release */
|
|
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
|
|
+ mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
|
|
+
|
|
+ release = to_free + mdb_free;
|
|
+
|
|
+ /* update fs free blocks counter for truncate case */
|
|
+ percpu_counter_add(&sbi->s_freeblocks_counter, release);
|
|
+
|
|
+ /* update per-inode reservations */
|
|
+ BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
|
|
+ EXT4_I(inode)->i_reserved_data_blocks -= to_free;
|
|
+
|
|
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
|
|
+ EXT4_I(inode)->i_reserved_meta_blocks = mdb;
|
|
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
|
|
+}
|
|
+
|
|
+static void ext4_da_page_release_reservation(struct page *page,
|
|
+ unsigned long offset)
|
|
+{
|
|
+ int to_release = 0;
|
|
+ struct buffer_head *head, *bh;
|
|
+ unsigned int curr_off = 0;
|
|
+
|
|
+ head = page_buffers(page);
|
|
+ bh = head;
|
|
+ do {
|
|
+ unsigned int next_off = curr_off + bh->b_size;
|
|
+
|
|
+ if ((offset <= curr_off) && (buffer_delay(bh))) {
|
|
+ to_release++;
|
|
+ clear_buffer_delay(bh);
|
|
+ }
|
|
+ curr_off = next_off;
|
|
+ } while ((bh = bh->b_this_page) != head);
|
|
+ ext4_da_release_space(page->mapping->host, to_release);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Delayed allocation stuff
|
|
+ */
|
|
+
|
|
+struct mpage_da_data {
|
|
+ struct inode *inode;
|
|
+ struct buffer_head lbh; /* extent of blocks */
|
|
+ unsigned long first_page, next_page; /* extent of pages */
|
|
+ get_block_t *get_block;
|
|
+ struct writeback_control *wbc;
|
|
+ int io_done;
|
|
+ long pages_written;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * mpage_da_submit_io - walks through extent of pages and try to write
|
|
+ * them with writepage() call back
|
|
+ *
|
|
+ * @mpd->inode: inode
|
|
+ * @mpd->first_page: first page of the extent
|
|
+ * @mpd->next_page: page after the last page of the extent
|
|
+ * @mpd->get_block: the filesystem's block mapper function
|
|
+ *
|
|
+ * By the time mpage_da_submit_io() is called we expect all blocks
|
|
+ * to be allocated. this may be wrong if allocation failed.
|
|
+ *
|
|
+ * As pages are already locked by write_cache_pages(), we can't use it
|
|
+ */
|
|
+static int mpage_da_submit_io(struct mpage_da_data *mpd)
|
|
+{
|
|
+ struct address_space *mapping = mpd->inode->i_mapping;
|
|
+ int ret = 0, err, nr_pages, i;
|
|
+ unsigned long index, end;
|
|
+ struct pagevec pvec;
|
|
+
|
|
+ BUG_ON(mpd->next_page <= mpd->first_page);
|
|
+ pagevec_init(&pvec, 0);
|
|
+ index = mpd->first_page;
|
|
+ end = mpd->next_page - 1;
|
|
+
|
|
+ while (index <= end) {
|
|
+ /* XXX: optimize tail */
|
|
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
|
|
+ if (nr_pages == 0)
|
|
+ break;
|
|
+ for (i = 0; i < nr_pages; i++) {
|
|
+ struct page *page = pvec.pages[i];
|
|
+
|
|
+ index = page->index;
|
|
+ if (index > end)
|
|
+ break;
|
|
+ index++;
|
|
+
|
|
+ err = mapping->a_ops->writepage(page, mpd->wbc);
|
|
+ if (!err)
|
|
+ mpd->pages_written++;
|
|
+ /*
|
|
+ * In error case, we have to continue because
|
|
+ * remaining pages are still locked
|
|
+ * XXX: unlock and re-dirty them?
|
|
+ */
|
|
+ if (ret == 0)
|
|
+ ret = err;
|
|
+ }
|
|
+ pagevec_release(&pvec);
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
|
|
+ *
|
|
+ * @mpd->inode - inode to walk through
|
|
+ * @exbh->b_blocknr - first block on a disk
|
|
+ * @exbh->b_size - amount of space in bytes
|
|
+ * @logical - first logical block to start assignment with
|
|
+ *
|
|
+ * the function goes through all passed space and put actual disk
|
|
+ * block numbers into buffer heads, dropping BH_Delay
|
|
+ */
|
|
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
|
|
+ struct buffer_head *exbh)
|
|
+{
|
|
+ struct inode *inode = mpd->inode;
|
|
+ struct address_space *mapping = inode->i_mapping;
|
|
+ int blocks = exbh->b_size >> inode->i_blkbits;
|
|
+ sector_t pblock = exbh->b_blocknr, cur_logical;
|
|
+ struct buffer_head *head, *bh;
|
|
+ pgoff_t index, end;
|
|
+ struct pagevec pvec;
|
|
+ int nr_pages, i;
|
|
+
|
|
+ index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
|
|
+ end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
|
|
+ cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
|
|
+
|
|
+ pagevec_init(&pvec, 0);
|
|
+
|
|
+ while (index <= end) {
|
|
+ /* XXX: optimize tail */
|
|
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
|
|
+ if (nr_pages == 0)
|
|
+ break;
|
|
+ for (i = 0; i < nr_pages; i++) {
|
|
+ struct page *page = pvec.pages[i];
|
|
+
|
|
+ index = page->index;
|
|
+ if (index > end)
|
|
+ break;
|
|
+ index++;
|
|
+
|
|
+ BUG_ON(!PageLocked(page));
|
|
+ BUG_ON(PageWriteback(page));
|
|
+ BUG_ON(!page_has_buffers(page));
|
|
+
|
|
+ bh = page_buffers(page);
|
|
+ head = bh;
|
|
+
|
|
+ /* skip blocks out of the range */
|
|
+ do {
|
|
+ if (cur_logical >= logical)
|
|
+ break;
|
|
+ cur_logical++;
|
|
+ } while ((bh = bh->b_this_page) != head);
|
|
+
|
|
+ do {
|
|
+ if (cur_logical >= logical + blocks)
|
|
+ break;
|
|
+ if (buffer_delay(bh)) {
|
|
+ bh->b_blocknr = pblock;
|
|
+ clear_buffer_delay(bh);
|
|
+ bh->b_bdev = inode->i_sb->s_bdev;
|
|
+ } else if (buffer_unwritten(bh)) {
|
|
+ bh->b_blocknr = pblock;
|
|
+ clear_buffer_unwritten(bh);
|
|
+ set_buffer_mapped(bh);
|
|
+ set_buffer_new(bh);
|
|
+ bh->b_bdev = inode->i_sb->s_bdev;
|
|
+ } else if (buffer_mapped(bh))
|
|
+ BUG_ON(bh->b_blocknr != pblock);
|
|
+
|
|
+ cur_logical++;
|
|
+ pblock++;
|
|
+ } while ((bh = bh->b_this_page) != head);
|
|
+ }
|
|
+ pagevec_release(&pvec);
|
|
+ }
|
|
+}
|
|
+
|
|
+
|
|
+/*
|
|
+ * __unmap_underlying_blocks - just a helper function to unmap
|
|
+ * set of blocks described by @bh
|
|
+ */
|
|
+static inline void __unmap_underlying_blocks(struct inode *inode,
|
|
+ struct buffer_head *bh)
|
|
+{
|
|
+ struct block_device *bdev = inode->i_sb->s_bdev;
|
|
+ int blocks, i;
|
|
+
|
|
+ blocks = bh->b_size >> inode->i_blkbits;
|
|
+ for (i = 0; i < blocks; i++)
|
|
+ unmap_underlying_metadata(bdev, bh->b_blocknr + i);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * mpage_da_map_blocks - go through given space
|
|
+ *
|
|
+ * @mpd->lbh - bh describing space
|
|
+ * @mpd->get_block - the filesystem's block mapper function
|
|
+ *
|
|
+ * The function skips space we know is already mapped to disk blocks.
|
|
+ *
|
|
+ */
|
|
+static void mpage_da_map_blocks(struct mpage_da_data *mpd)
|
|
+{
|
|
+ int err = 0;
|
|
+ struct buffer_head *lbh = &mpd->lbh;
|
|
+ sector_t next = lbh->b_blocknr;
|
|
+ struct buffer_head new;
|
|
+
|
|
+ /*
|
|
+ * We consider only non-mapped and non-allocated blocks
|
|
+ */
|
|
+ if (buffer_mapped(lbh) && !buffer_delay(lbh))
|
|
+ return;
|
|
+
|
|
+ new.b_state = lbh->b_state;
|
|
+ new.b_blocknr = 0;
|
|
+ new.b_size = lbh->b_size;
|
|
+
|
|
+ /*
|
|
+ * If we didn't accumulate anything
|
|
+ * to write simply return
|
|
+ */
|
|
+ if (!new.b_size)
|
|
+ return;
|
|
+ err = mpd->get_block(mpd->inode, next, &new, 1);
|
|
+ if (err)
|
|
+ return;
|
|
+ BUG_ON(new.b_size == 0);
|
|
+
|
|
+ if (buffer_new(&new))
|
|
+ __unmap_underlying_blocks(mpd->inode, &new);
|
|
+
|
|
+ /*
|
|
+ * If blocks are delayed marked, we need to
|
|
+ * put actual blocknr and drop delayed bit
|
|
+ */
|
|
+ if (buffer_delay(lbh) || buffer_unwritten(lbh))
|
|
+ mpage_put_bnr_to_bhs(mpd, next, &new);
|
|
+
|
|
+ return;
|
|
+}
|
|
+
|
|
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
|
|
+ (1 << BH_Delay) | (1 << BH_Unwritten))
|
|
+
|
|
+/*
|
|
+ * mpage_add_bh_to_extent - try to add one more block to extent of blocks
|
|
+ *
|
|
+ * @mpd->lbh - extent of blocks
|
|
+ * @logical - logical number of the block in the file
|
|
+ * @bh - bh of the block (used to access block's state)
|
|
+ *
|
|
+ * the function is used to collect contig. blocks in same state
|
|
+ */
|
|
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
|
|
+ sector_t logical, struct buffer_head *bh)
|
|
+{
|
|
+ sector_t next;
|
|
+ size_t b_size = bh->b_size;
|
|
+ struct buffer_head *lbh = &mpd->lbh;
|
|
+ int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
|
|
+
|
|
+ /* check if the reserved journal credits might overflow */
|
|
+ if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
|
|
+ if (nrblocks >= EXT4_MAX_TRANS_DATA) {
|
|
+ /*
|
|
+ * With non-extent format we are limited by the journal
|
|
+ * credit available. Total credit needed to insert
|
|
+ * nrblocks contiguous blocks is dependent on the
|
|
+ * nrblocks. So limit nrblocks.
|
|
+ */
|
|
+ goto flush_it;
|
|
+ } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
|
|
+ EXT4_MAX_TRANS_DATA) {
|
|
+ /*
|
|
+ * Adding the new buffer_head would make it cross the
|
|
+ * allowed limit for which we have journal credit
|
|
+ * reserved. So limit the new bh->b_size
|
|
+ */
|
|
+ b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
|
|
+ mpd->inode->i_blkbits;
|
|
+ /* we will do mpage_da_submit_io in the next loop */
|
|
+ }
|
|
+ }
|
|
+ /*
|
|
+ * First block in the extent
|
|
+ */
|
|
+ if (lbh->b_size == 0) {
|
|
+ lbh->b_blocknr = logical;
|
|
+ lbh->b_size = b_size;
|
|
+ lbh->b_state = bh->b_state & BH_FLAGS;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ next = lbh->b_blocknr + nrblocks;
|
|
+ /*
|
|
+ * Can we merge the block to our big extent?
|
|
+ */
|
|
+ if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
|
|
+ lbh->b_size += b_size;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+flush_it:
|
|
+ /*
|
|
+ * We couldn't merge the block to our extent, so we
|
|
+ * need to flush current extent and start new one
|
|
+ */
|
|
+ mpage_da_map_blocks(mpd);
|
|
+ mpage_da_submit_io(mpd);
|
|
+ mpd->io_done = 1;
|
|
+ return;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * __mpage_da_writepage - finds extent of pages and blocks
|
|
+ *
|
|
+ * @page: page to consider
|
|
+ * @wbc: not used, we just follow rules
|
|
+ * @data: context
|
|
+ *
|
|
+ * The function finds extents of pages and scan them for all blocks.
|
|
+ */
|
|
+static int __mpage_da_writepage(struct page *page,
|
|
+ struct writeback_control *wbc, void *data)
|
|
+{
|
|
+ struct mpage_da_data *mpd = data;
|
|
+ struct inode *inode = mpd->inode;
|
|
+ struct buffer_head *bh, *head, fake;
|
|
+ sector_t logical;
|
|
+
|
|
+ if (mpd->io_done) {
|
|
+ /*
|
|
+ * Rest of the page in the page_vec
|
|
+ * redirty them and skip them. We will
|
|
+ * try to write them again after
|
|
+ * starting a new transaction
|
|
+ */
|
|
+ redirty_page_for_writepage(wbc, page);
|
|
+ unlock_page(page);
|
|
+ return MPAGE_DA_EXTENT_TAIL;
|
|
+ }
|
|
+ /*
|
|
+ * Can we merge this page to current extent?
|
|
+ */
|
|
+ if (mpd->next_page != page->index) {
|
|
+ /*
|
|
+ * Nope, we can't. So, we map non-allocated blocks
|
|
+ * and start IO on them using writepage()
|
|
+ */
|
|
+ if (mpd->next_page != mpd->first_page) {
|
|
+ mpage_da_map_blocks(mpd);
|
|
+ mpage_da_submit_io(mpd);
|
|
+ /*
|
|
+ * skip rest of the page in the page_vec
|
|
+ */
|
|
+ mpd->io_done = 1;
|
|
+ redirty_page_for_writepage(wbc, page);
|
|
+ unlock_page(page);
|
|
+ return MPAGE_DA_EXTENT_TAIL;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Start next extent of pages ...
|
|
+ */
|
|
+ mpd->first_page = page->index;
|
|
+
|
|
+ /*
|
|
+ * ... and blocks
|
|
+ */
|
|
+ mpd->lbh.b_size = 0;
|
|
+ mpd->lbh.b_state = 0;
|
|
+ mpd->lbh.b_blocknr = 0;
|
|
+ }
|
|
+
|
|
+ mpd->next_page = page->index + 1;
|
|
+ logical = (sector_t) page->index <<
|
|
+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
|
|
+
|
|
+ if (!page_has_buffers(page)) {
|
|
+ /*
|
|
+ * There are no attached buffer heads yet (mmap?)
|
|
+ * we treat the page as full of dirty blocks
|
|
+ */
|
|
+ bh = &fake;
|
|
+ bh->b_size = PAGE_CACHE_SIZE;
|
|
+ bh->b_state = 0;
|
|
+ set_buffer_dirty(bh);
|
|
+ set_buffer_uptodate(bh);
|
|
+ mpage_add_bh_to_extent(mpd, logical, bh);
|
|
+ if (mpd->io_done)
|
|
+ return MPAGE_DA_EXTENT_TAIL;
|
|
+ } else {
|
|
+ /*
|
|
+ * Page with regular buffer heads, just add all dirty ones
|
|
+ */
|
|
+ head = page_buffers(page);
|
|
+ bh = head;
|
|
+ do {
|
|
+ BUG_ON(buffer_locked(bh));
|
|
+ if (buffer_dirty(bh) &&
|
|
+ (!buffer_mapped(bh) || buffer_delay(bh))) {
|
|
+ mpage_add_bh_to_extent(mpd, logical, bh);
|
|
+ if (mpd->io_done)
|
|
+ return MPAGE_DA_EXTENT_TAIL;
|
|
+ }
|
|
+ logical++;
|
|
+ } while ((bh = bh->b_this_page) != head);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * mpage_da_writepages - walk the list of dirty pages of the given
|
|
+ * address space, allocates non-allocated blocks, maps newly-allocated
|
|
+ * blocks to existing bhs and issues IO on them
|
|
+ *
|
|
+ * @mapping: address space structure to write
|
|
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
|
|
+ * @get_block: the filesystem's block mapper function.
|
|
+ *
|
|
+ * This is a library function, which implements the writepages()
|
|
+ * address_space_operation.
|
|
+ */
|
|
+static int mpage_da_writepages(struct address_space *mapping,
|
|
+ struct writeback_control *wbc,
|
|
+ get_block_t get_block)
|
|
+{
|
|
+ struct mpage_da_data mpd;
|
|
+ long to_write;
|
|
+ int ret;
|
|
+
|
|
+ if (!get_block)
|
|
+ return generic_writepages(mapping, wbc);
|
|
+
|
|
+ mpd.wbc = wbc;
|
|
+ mpd.inode = mapping->host;
|
|
+ mpd.lbh.b_size = 0;
|
|
+ mpd.lbh.b_state = 0;
|
|
+ mpd.lbh.b_blocknr = 0;
|
|
+ mpd.first_page = 0;
|
|
+ mpd.next_page = 0;
|
|
+ mpd.get_block = get_block;
|
|
+ mpd.io_done = 0;
|
|
+ mpd.pages_written = 0;
|
|
+
|
|
+ to_write = wbc->nr_to_write;
|
|
+
|
|
+ ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
|
|
+
|
|
+ /*
|
|
+ * Handle last extent of pages
|
|
+ */
|
|
+ if (!mpd.io_done && mpd.next_page != mpd.first_page) {
|
|
+ mpage_da_map_blocks(&mpd);
|
|
+ mpage_da_submit_io(&mpd);
|
|
+ }
|
|
+
|
|
+ wbc->nr_to_write = to_write - mpd.pages_written;
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * This is a special callback for ->write_begin() only;
+ * its intention is to return a mapped block or reserve space
|
|
+ */
|
|
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
|
|
+ struct buffer_head *bh_result, int create)
|
|
+{
|
|
+ int ret = 0;
|
|
+
|
|
+ BUG_ON(create == 0);
|
|
+ BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
|
|
+
|
|
+ /*
|
|
+ * First, we need to know whether the block is already allocated;
+ * preallocated blocks are unmapped but should be treated
+ * the same as allocated blocks.
|
|
+ */
|
|
+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
|
|
+ if ((ret == 0) && !buffer_delay(bh_result)) {
|
|
+ /* the block isn't (pre)allocated yet, let's reserve space */
|
|
+ /*
|
|
+ * XXX: __block_prepare_write() unmaps passed block,
|
|
+ * is it OK?
|
|
+ */
|
|
+ ret = ext4_da_reserve_space(inode, 1);
|
|
+ if (ret)
|
|
+ /* not enough space to reserve */
|
|
+ return ret;
|
|
+
|
|
+ map_bh(bh_result, inode->i_sb, 0);
|
|
+ set_buffer_new(bh_result);
|
|
+ set_buffer_delay(bh_result);
|
|
+ } else if (ret > 0) {
|
|
+ bh_result->b_size = (ret << inode->i_blkbits);
|
|
+ ret = 0;
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
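For illustration only (not part of the patch): the decision ext4_da_get_block_prep() makes at write_begin() time can be summarised as a small state check. The names and the free-block counter below are hypothetical; the real code works on buffer_heads and reserves space through ext4_da_reserve_space().

#include <stdio.h>

enum blk_state { BLK_UNALLOCATED, BLK_DELAYED, BLK_MAPPED };

static long free_blocks = 1024;	/* hypothetical stand-in for the fs free-block counter */

/*
 * Already mapped blocks are used as-is, already delayed blocks keep their
 * reservation, and unallocated blocks get one block reserved and become
 * delayed (like set_buffer_new() + set_buffer_delay() above).
 */
static int prep_block(enum blk_state s, enum blk_state *out)
{
	if (s == BLK_MAPPED || s == BLK_DELAYED) {
		*out = s;		/* nothing to do, space already accounted */
		return 0;
	}
	if (free_blocks < 1)
		return -28;		/* -ENOSPC: reservation failed */
	free_blocks--;			/* reserve space for the delayed block */
	*out = BLK_DELAYED;
	return 0;
}

int main(void)
{
	enum blk_state out;
	int err = prep_block(BLK_UNALLOCATED, &out);

	printf("err=%d state=%d free=%ld\n", err, out, free_blocks);
	return 0;
}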
|
|
+#define EXT4_DELALLOC_RSVED 1
|
|
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
|
|
+ struct buffer_head *bh_result, int create)
|
|
+{
|
|
+ int ret;
|
|
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
|
|
+ loff_t disksize = EXT4_I(inode)->i_disksize;
|
|
+ handle_t *handle = NULL;
|
|
+
|
|
+ handle = ext4_journal_current_handle();
|
|
+ if (!handle) {
|
|
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
|
|
+ bh_result, 0, 0, 0);
|
|
+ BUG_ON(!ret);
|
|
+ } else {
|
|
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
|
|
+ bh_result, create, 0, EXT4_DELALLOC_RSVED);
|
|
+ }
|
|
+
|
|
+ if (ret > 0) {
|
|
+ bh_result->b_size = (ret << inode->i_blkbits);
|
|
+
|
|
+ /*
|
|
+ * Update on-disk size along with block allocation
|
|
+ * we don't use 'extend_disksize' as size may change
|
|
+ * within already allocated block -bzzz
|
|
+ */
|
|
+ disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
|
|
+ if (disksize > i_size_read(inode))
|
|
+ disksize = i_size_read(inode);
|
|
+ if (disksize > EXT4_I(inode)->i_disksize) {
|
|
+ /*
|
|
+ * XXX: replace with spinlock if seen contended -bzzz
|
|
+ */
|
|
+ down_write(&EXT4_I(inode)->i_data_sem);
|
|
+ if (disksize > EXT4_I(inode)->i_disksize)
|
|
+ EXT4_I(inode)->i_disksize = disksize;
|
|
+ up_write(&EXT4_I(inode)->i_data_sem);
|
|
+
|
|
+ if (EXT4_I(inode)->i_disksize == disksize) {
|
|
+ ret = ext4_mark_inode_dirty(handle, inode);
|
|
+ return ret;
|
|
+ }
|
|
+ }
|
|
+ ret = 0;
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
|
|
+{
|
|
+ /*
|
|
+ * unmapped buffer is possible for holes.
|
|
+ * delay buffer is possible with delayed allocation
|
|
+ */
|
|
+ return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
|
|
+}
|
|
+
|
|
+static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
|
|
+ struct buffer_head *bh_result, int create)
|
|
+{
|
|
+ int ret = 0;
|
|
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
|
|
+
|
|
+ /*
|
|
+ * we don't want to do block allocation in writepage
|
|
+ * so call get_block_wrap with create = 0
|
|
+ */
|
|
+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
|
|
+ bh_result, 0, 0, 0);
|
|
+ if (ret > 0) {
|
|
+ bh_result->b_size = (ret << inode->i_blkbits);
|
|
+ ret = 0;
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * get called via ext4_da_writepages after taking page lock (have journal handle)
|
|
+ * get called via journal_submit_inode_data_buffers (no journal handle)
|
|
+ * get called via shrink_page_list via pdflush (no journal handle)
|
|
+ * or grab_page_cache when doing write_begin (have journal handle)
|
|
+ */
|
|
+static int ext4_da_writepage(struct page *page,
|
|
+ struct writeback_control *wbc)
|
|
+{
|
|
+ int ret = 0;
|
|
+ loff_t size;
|
|
+ unsigned long len;
|
|
+ struct buffer_head *page_bufs;
|
|
+ struct inode *inode = page->mapping->host;
|
|
+
|
|
+ size = i_size_read(inode);
|
|
+ if (page->index == size >> PAGE_CACHE_SHIFT)
|
|
+ len = size & ~PAGE_CACHE_MASK;
|
|
+ else
|
|
+ len = PAGE_CACHE_SIZE;
|
|
+
|
|
+ if (page_has_buffers(page)) {
|
|
+ page_bufs = page_buffers(page);
|
|
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
|
|
+ ext4_bh_unmapped_or_delay)) {
|
|
+ /*
|
|
+ * We don't want to do block allocation,
+ * so redirty the page and return.
+ * We may reach here when we do a journal commit
+ * via journal_submit_inode_data_buffers.
+ * If the blocks are not mapped we just ignore
+ * them. We can also reach here via shrink_page_list
|
|
+ */
|
|
+ redirty_page_for_writepage(wbc, page);
|
|
+ unlock_page(page);
|
|
+ return 0;
|
|
+ }
|
|
+ } else {
|
|
+ /*
|
|
+ * The test for page_has_buffers() is subtle:
|
|
+ * We know the page is dirty but it lost buffers. That means
|
|
+ * that at some moment in time after write_begin()/write_end()
|
|
+ * has been called all buffers have been clean and thus they
|
|
+ * must have been written at least once. So they are all
|
|
+ * mapped and we can happily proceed with mapping them
|
|
+ * and writing the page.
|
|
+ *
|
|
+ * Try to initialize the buffer_heads and check whether
|
|
+ * all are mapped and not delayed. We don't want to
|
|
+ * do block allocation here.
|
|
+ */
|
|
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
|
|
+ ext4_normal_get_block_write);
|
|
+ if (!ret) {
|
|
+ page_bufs = page_buffers(page);
|
|
+ /* check whether all are mapped and non delay */
|
|
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
|
|
+ ext4_bh_unmapped_or_delay)) {
|
|
+ redirty_page_for_writepage(wbc, page);
|
|
+ unlock_page(page);
|
|
+ return 0;
|
|
+ }
|
|
+ } else {
|
|
+ /*
|
|
+ * We can't do block allocation here,
+ * so just redirty the page, unlock it
+ * and return
|
|
+ */
|
|
+ redirty_page_for_writepage(wbc, page);
|
|
+ unlock_page(page);
|
|
+ return 0;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
|
|
+ ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
|
|
+ else
|
|
+ ret = block_write_full_page(page,
|
|
+ ext4_normal_get_block_write,
|
|
+ wbc);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * This is called via ext4_da_writepages() to
|
|
+ * calculate the total number of credits to reserve to fit
+ * a single extent allocation into a single transaction;
+ * ext4_da_writepages() will loop calling this before
|
|
+ * the block allocation.
|
|
+ */
|
|
+
|
|
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
|
|
+{
|
|
+ int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
|
|
+
|
|
+ /*
|
|
+ * With the non-extent format the journal credits needed to
+ * insert nrblocks contiguous blocks depend on the
+ * number of contiguous blocks, so we will limit the
+ * number of contiguous blocks to a sane value
|
|
+ */
|
|
+ if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
|
|
+ (max_blocks > EXT4_MAX_TRANS_DATA))
|
|
+ max_blocks = EXT4_MAX_TRANS_DATA;
|
|
+
|
|
+ return ext4_chunk_trans_blocks(inode, max_blocks);
|
|
+}
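A rough, illustrative calculation (not from the patch) of why the clamp above matters for non-extent files: using the worst-case formula from ext4_indirect_trans_blocks() later in this patch (nrblocks * 2 + 1 index blocks for discontiguous blocks), the index-block credits grow linearly with the number of blocks, so an unbounded i_reserved_data_blocks would demand an unbounded transaction. The clamp value used below is an assumption for the sake of the example.

#include <stdio.h>

/* Worst-case index blocks for an indirect-mapped file, as in this patch:
 * each data block may touch an indirect and a double indirect block,
 * plus one triple indirect block overall. */
static int worst_case_index_blocks(int nrblocks)
{
	return nrblocks * 2 + 1;
}

int main(void)
{
	int max_trans_data = 64;	/* assumed clamp value, for illustration only */
	int reserved[] = { 8, 64, 4096 };

	for (int i = 0; i < 3; i++) {
		int n = reserved[i];

		if (n > max_trans_data)
			n = max_trans_data;	/* the clamp applied above */
		printf("reserved=%d -> blocks per transaction=%d index credits=%d\n",
		       reserved[i], n, worst_case_index_blocks(n));
	}
	return 0;
}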
|
|
+
|
|
+static int ext4_da_writepages(struct address_space *mapping,
|
|
+ struct writeback_control *wbc)
|
|
+{
|
|
+ handle_t *handle = NULL;
|
|
+ loff_t range_start = 0;
|
|
+ struct inode *inode = mapping->host;
|
|
+ int needed_blocks, ret = 0, nr_to_writebump = 0;
|
|
+ long to_write, pages_skipped = 0;
|
|
+ struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
|
|
+
|
|
+ /*
|
|
+ * No pages to write? This is mainly a kludge to avoid starting
|
|
+ * a transaction for special inodes like journal inode on last iput()
|
|
+ * because that could violate lock ordering on umount
|
|
+ */
|
|
+ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
|
|
+ return 0;
|
|
+ /*
|
|
+ * Make sure nr_to_write is >= sbi->s_mb_stream_request
|
|
+ * This makes sure a small file's blocks are allocated in a
+ * single attempt, which ensures that small files
+ * end up less fragmented.
|
|
+ */
|
|
+ if (wbc->nr_to_write < sbi->s_mb_stream_request) {
|
|
+ nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
|
|
+ wbc->nr_to_write = sbi->s_mb_stream_request;
|
|
+ }
|
|
+
|
|
+ if (!wbc->range_cyclic)
|
|
+ /*
|
|
+ * If range_cyclic is not set force range_cont
|
|
+ * and save the old writeback_index
|
|
+ */
|
|
+ wbc->range_cont = 1;
|
|
+
|
|
+ range_start = wbc->range_start;
|
|
+ pages_skipped = wbc->pages_skipped;
|
|
+
|
|
+restart_loop:
|
|
+ to_write = wbc->nr_to_write;
|
|
+ while (!ret && to_write > 0) {
|
|
+
|
|
+ /*
|
|
+ * We insert one extent at a time, so we need
+ * the credits for a single extent allocation.
+ * Journalled mode is currently not supported
+ * by delalloc.
|
|
+ */
|
|
+ BUG_ON(ext4_should_journal_data(inode));
|
|
+ needed_blocks = ext4_da_writepages_trans_blocks(inode);
|
|
+
|
|
+ /* start a new transaction*/
|
|
+ handle = ext4_journal_start(inode, needed_blocks);
|
|
+ if (IS_ERR(handle)) {
|
|
+ ret = PTR_ERR(handle);
|
|
+ printk(KERN_EMERG "%s: jbd2_start: "
|
|
+ "%ld pages, ino %lu; err %d\n", __func__,
|
|
+ wbc->nr_to_write, inode->i_ino, ret);
|
|
+ dump_stack();
|
|
+ goto out_writepages;
|
|
+ }
|
|
+ if (ext4_should_order_data(inode)) {
|
|
+ /*
|
|
+ * With ordered mode we need to add
|
|
+ * the inode to the journal handle
|
|
+ * when we do block allocation.
|
|
+ */
|
|
+ ret = ext4_jbd2_file_inode(handle, inode);
|
|
+ if (ret) {
|
|
+ ext4_journal_stop(handle);
|
|
+ goto out_writepages;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ to_write -= wbc->nr_to_write;
|
|
+ ret = mpage_da_writepages(mapping, wbc,
|
|
+ ext4_da_get_block_write);
|
|
+ ext4_journal_stop(handle);
|
|
+ if (ret == MPAGE_DA_EXTENT_TAIL) {
|
|
+ /*
|
|
+ * got one extent now try with
|
|
+ * rest of the pages
|
|
+ */
|
|
+ to_write += wbc->nr_to_write;
|
|
+ ret = 0;
|
|
+ } else if (wbc->nr_to_write) {
|
|
+ /*
|
|
+ * There is no more writeout needed,
+ * or we requested a non-blocking writeout
+ * and found the device congested
|
|
+ */
|
|
+ to_write += wbc->nr_to_write;
|
|
+ break;
|
|
+ }
|
|
+ wbc->nr_to_write = to_write;
|
|
+ }
|
|
+
|
|
+ if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
|
|
+ /* We skipped pages in this loop */
|
|
+ wbc->range_start = range_start;
|
|
+ wbc->nr_to_write = to_write +
|
|
+ wbc->pages_skipped - pages_skipped;
|
|
+ wbc->pages_skipped = pages_skipped;
|
|
+ goto restart_loop;
|
|
+ }
|
|
+
|
|
+out_writepages:
|
|
+ wbc->nr_to_write = to_write - nr_to_writebump;
|
|
+ wbc->range_start = range_start;
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
|
|
+ loff_t pos, unsigned len, unsigned flags,
|
|
+ struct page **pagep, void **fsdata)
|
|
+{
|
|
+ int ret, retries = 0;
|
|
+ struct page *page;
|
|
+ pgoff_t index;
|
|
+ unsigned from, to;
|
|
+ struct inode *inode = mapping->host;
|
|
+ handle_t *handle;
|
|
+
|
|
+ index = pos >> PAGE_CACHE_SHIFT;
|
|
+ from = pos & (PAGE_CACHE_SIZE - 1);
|
|
+ to = from + len;
|
|
+
|
|
+retry:
|
|
+ /*
|
|
+ * With delayed allocation, we don't log the i_disksize update
+ * if there is delayed block allocation. But we still need
+ * to journal the i_disksize update for a write to the end of
+ * the file that hits an already mapped buffer.
|
|
+ */
|
|
+ handle = ext4_journal_start(inode, 1);
|
|
+ if (IS_ERR(handle)) {
|
|
+ ret = PTR_ERR(handle);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ page = __grab_cache_page(mapping, index);
|
|
+ if (!page) {
|
|
+ ext4_journal_stop(handle);
|
|
+ ret = -ENOMEM;
|
|
+ goto out;
|
|
+ }
|
|
+ *pagep = page;
|
|
+
|
|
+ ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
|
|
+ ext4_da_get_block_prep);
|
|
+ if (ret < 0) {
|
|
+ unlock_page(page);
|
|
+ ext4_journal_stop(handle);
|
|
+ page_cache_release(page);
|
|
+ }
|
|
+
|
|
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
|
|
+ goto retry;
|
|
+out:
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Check if we should update i_disksize
+ * when a write to the end of the file does not require block allocation
|
|
+ */
|
|
+static int ext4_da_should_update_i_disksize(struct page *page,
|
|
+ unsigned long offset)
|
|
+{
|
|
+ struct buffer_head *bh;
|
|
+ struct inode *inode = page->mapping->host;
|
|
+ unsigned int idx;
|
|
+ int i;
|
|
+
|
|
+ bh = page_buffers(page);
|
|
+ idx = offset >> inode->i_blkbits;
|
|
+
|
|
+ for (i=0; i < idx; i++)
|
|
+ bh = bh->b_this_page;
|
|
+
|
|
+ if (!buffer_mapped(bh) || (buffer_delay(bh)))
|
|
+ return 0;
|
|
+ return 1;
|
|
+}
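To make the buffer walk in ext4_da_should_update_i_disksize() concrete, the snippet below (illustrative only, not part of the patch) computes which block inside a page covers a given byte offset; with 1024-byte blocks, offset 3000 falls in block index 2, and only that buffer's mapped/delayed state decides whether i_disksize may be updated.

#include <stdio.h>

/* Index of the block covering 'offset' within its page, given a block size
 * of (1 << blkbits) bytes; mirrors 'idx = offset >> inode->i_blkbits'. */
static unsigned int block_index_in_page(unsigned long offset,
					unsigned int blkbits)
{
	return offset >> blkbits;
}

int main(void)
{
	unsigned int blkbits = 10;		/* 1024-byte blocks */
	unsigned long offsets[] = { 0, 1023, 1024, 3000, 4095 };

	for (int i = 0; i < 5; i++)
		printf("offset %4lu -> block index %u\n", offsets[i],
		       block_index_in_page(offsets[i], blkbits));
	return 0;
}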
|
|
+
|
|
+static int ext4_da_write_end(struct file *file,
|
|
+ struct address_space *mapping,
|
|
+ loff_t pos, unsigned len, unsigned copied,
|
|
+ struct page *page, void *fsdata)
|
|
+{
|
|
+ struct inode *inode = mapping->host;
|
|
+ int ret = 0, ret2;
|
|
+ handle_t *handle = ext4_journal_current_handle();
|
|
+ loff_t new_i_size;
|
|
+ unsigned long start, end;
|
|
+
|
|
+ start = pos & (PAGE_CACHE_SIZE - 1);
|
|
+ end = start + copied -1;
|
|
+
|
|
+ /*
|
|
+ * generic_write_end() will run mark_inode_dirty() if i_size
|
|
+ * changes. So let's piggyback the i_disksize mark_inode_dirty
|
|
+ * into that.
|
|
+ */
|
|
+
|
|
+ new_i_size = pos + copied;
|
|
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
|
|
+ if (ext4_da_should_update_i_disksize(page, end)) {
|
|
+ down_write(&EXT4_I(inode)->i_data_sem);
|
|
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
|
|
+ /*
|
|
+ * Updating i_disksize when extending file
|
|
+ * without needing block allocation
|
|
+ */
|
|
+ if (ext4_should_order_data(inode))
|
|
+ ret = ext4_jbd2_file_inode(handle,
|
|
+ inode);
|
|
+
|
|
+ EXT4_I(inode)->i_disksize = new_i_size;
|
|
+ }
|
|
+ up_write(&EXT4_I(inode)->i_data_sem);
|
|
+ }
|
|
+ }
|
|
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
|
|
+ page, fsdata);
|
|
+ copied = ret2;
|
|
+ if (ret2 < 0)
|
|
+ ret = ret2;
|
|
+ ret2 = ext4_journal_stop(handle);
|
|
+ if (!ret)
|
|
+ ret = ret2;
|
|
+
|
|
+ return ret ? ret : copied;
|
|
+}
|
|
+
|
|
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
|
|
+{
|
|
+ /*
|
|
+ * Drop reserved blocks
|
|
+ */
|
|
+ BUG_ON(!PageLocked(page));
|
|
+ if (!page_has_buffers(page))
|
|
+ goto out;
|
|
+
|
|
+ ext4_da_page_release_reservation(page, offset);
|
|
+
|
|
+out:
|
|
+ ext4_invalidatepage(page, offset);
|
|
+
|
|
+ return;
|
|
+}
|
|
+
|
|
+
|
|
/*
|
|
* bmap() is special. It gets used by applications such as lilo and by
|
|
* the swapper to find the on-disk block of a specific piece of data.
|
|
@@ -1418,6 +2552,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
|
|
journal_t *journal;
|
|
int err;
|
|
|
|
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
|
|
+ test_opt(inode->i_sb, DELALLOC)) {
|
|
+ /*
|
|
+ * With delalloc we want to sync the file
|
|
+ * so that we can make sure we allocate
|
|
+ * blocks for file
|
|
+ */
|
|
+ filemap_write_and_wait(mapping);
|
|
+ }
|
|
+
|
|
if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
|
|
/*
|
|
* This is a REALLY heavyweight approach, but the use of
|
|
@@ -1462,21 +2606,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
|
|
return 0;
|
|
}
|
|
|
|
-static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
|
|
-{
|
|
- if (buffer_mapped(bh))
|
|
- return ext4_journal_dirty_data(handle, bh);
|
|
- return 0;
|
|
-}
|
|
-
|
|
/*
|
|
- * Note that we always start a transaction even if we're not journalling
|
|
- * data. This is to preserve ordering: any hole instantiation within
|
|
- * __block_write_full_page -> ext4_get_block() should be journalled
|
|
- * along with the data so we don't crash and then get metadata which
|
|
- * refers to old data.
|
|
+ * Note that we don't need to start a transaction unless we're journaling data
|
|
+ * because we should have holes filled from ext4_page_mkwrite(). We don't even
+ * need to file the inode on the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), no one guarantees in which
|
|
+ * transaction the data will hit the disk. In case we are journaling data, we
|
|
+ * cannot start transaction directly because transaction start ranks above page
|
|
+ * lock so we have to do some magic.
|
|
*
|
|
- * In all journalling modes block_write_full_page() will start the I/O.
|
|
+ * In all journaling modes block_write_full_page() will start the I/O.
|
|
*
|
|
* Problem:
|
|
*
|
|
@@ -1518,105 +2658,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
|
|
* disastrous. Any write() or metadata operation will sync the fs for
|
|
* us.
|
|
*
|
|
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
|
|
- * we don't need to open a transaction here.
|
|
*/
|
|
-static int ext4_ordered_writepage(struct page *page,
|
|
+static int __ext4_normal_writepage(struct page *page,
|
|
struct writeback_control *wbc)
|
|
{
|
|
struct inode *inode = page->mapping->host;
|
|
- struct buffer_head *page_bufs;
|
|
- handle_t *handle = NULL;
|
|
- int ret = 0;
|
|
- int err;
|
|
|
|
- J_ASSERT(PageLocked(page));
|
|
-
|
|
- /*
|
|
- * We give up here if we're reentered, because it might be for a
|
|
- * different filesystem.
|
|
- */
|
|
- if (ext4_journal_current_handle())
|
|
- goto out_fail;
|
|
-
|
|
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
|
|
+ if (test_opt(inode->i_sb, NOBH))
|
|
+ return nobh_writepage(page,
|
|
+ ext4_normal_get_block_write, wbc);
|
|
+ else
|
|
+ return block_write_full_page(page,
|
|
+ ext4_normal_get_block_write,
|
|
+ wbc);
|
|
+}
|
|
|
|
- if (IS_ERR(handle)) {
|
|
- ret = PTR_ERR(handle);
|
|
- goto out_fail;
|
|
- }
|
|
+static int ext4_normal_writepage(struct page *page,
|
|
+ struct writeback_control *wbc)
|
|
+{
|
|
+ struct inode *inode = page->mapping->host;
|
|
+ loff_t size = i_size_read(inode);
|
|
+ loff_t len;
|
|
|
|
- if (!page_has_buffers(page)) {
|
|
- create_empty_buffers(page, inode->i_sb->s_blocksize,
|
|
- (1 << BH_Dirty)|(1 << BH_Uptodate));
|
|
+ J_ASSERT(PageLocked(page));
|
|
+ if (page->index == size >> PAGE_CACHE_SHIFT)
|
|
+ len = size & ~PAGE_CACHE_MASK;
|
|
+ else
|
|
+ len = PAGE_CACHE_SIZE;
|
|
+
|
|
+ if (page_has_buffers(page)) {
|
|
+ /* if page has buffers it should all be mapped
|
|
+ * and allocated. If there are no buffers attached
|
|
+ * to the page we know the page is dirty but it lost
|
|
+ * buffers. That means that at some moment in time
|
|
+ * after write_begin() / write_end() has been called
|
|
+ * all buffers have been clean and thus they must have been
|
|
+ * written at least once. So they are all mapped and we can
|
|
+ * happily proceed with mapping them and writing the page.
|
|
+ */
|
|
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
|
|
+ ext4_bh_unmapped_or_delay));
|
|
}
|
|
- page_bufs = page_buffers(page);
|
|
- walk_page_buffers(handle, page_bufs, 0,
|
|
- PAGE_CACHE_SIZE, NULL, bget_one);
|
|
-
|
|
- ret = block_write_full_page(page, ext4_get_block, wbc);
|
|
-
|
|
- /*
|
|
- * The page can become unlocked at any point now, and
|
|
- * truncate can then come in and change things. So we
|
|
- * can't touch *page from now on. But *page_bufs is
|
|
- * safe due to elevated refcount.
|
|
- */
|
|
|
|
- /*
|
|
- * And attach them to the current transaction. But only if
|
|
- * block_write_full_page() succeeded. Otherwise they are unmapped,
|
|
- * and generally junk.
|
|
- */
|
|
- if (ret == 0) {
|
|
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
|
|
- NULL, jbd2_journal_dirty_data_fn);
|
|
- if (!ret)
|
|
- ret = err;
|
|
- }
|
|
- walk_page_buffers(handle, page_bufs, 0,
|
|
- PAGE_CACHE_SIZE, NULL, bput_one);
|
|
- err = ext4_journal_stop(handle);
|
|
- if (!ret)
|
|
- ret = err;
|
|
- return ret;
|
|
+ if (!ext4_journal_current_handle())
|
|
+ return __ext4_normal_writepage(page, wbc);
|
|
|
|
-out_fail:
|
|
redirty_page_for_writepage(wbc, page);
|
|
unlock_page(page);
|
|
- return ret;
|
|
+ return 0;
|
|
}
|
|
|
|
-static int ext4_writeback_writepage(struct page *page,
|
|
+static int __ext4_journalled_writepage(struct page *page,
|
|
struct writeback_control *wbc)
|
|
{
|
|
- struct inode *inode = page->mapping->host;
|
|
+ struct address_space *mapping = page->mapping;
|
|
+ struct inode *inode = mapping->host;
|
|
+ struct buffer_head *page_bufs;
|
|
handle_t *handle = NULL;
|
|
int ret = 0;
|
|
int err;
|
|
|
|
- if (ext4_journal_current_handle())
|
|
- goto out_fail;
|
|
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
|
|
+ ext4_normal_get_block_write);
|
|
+ if (ret != 0)
|
|
+ goto out_unlock;
|
|
+
|
|
+ page_bufs = page_buffers(page);
|
|
+ walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
|
|
+ bget_one);
|
|
+ /* As soon as we unlock the page, it can go away, but we have
|
|
+ * references to buffers so we are safe */
|
|
+ unlock_page(page);
|
|
|
|
handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
|
|
if (IS_ERR(handle)) {
|
|
ret = PTR_ERR(handle);
|
|
- goto out_fail;
|
|
+ goto out;
|
|
}
|
|
|
|
- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
|
|
- ret = nobh_writepage(page, ext4_get_block, wbc);
|
|
- else
|
|
- ret = block_write_full_page(page, ext4_get_block, wbc);
|
|
+ ret = walk_page_buffers(handle, page_bufs, 0,
|
|
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
|
|
|
|
+ err = walk_page_buffers(handle, page_bufs, 0,
|
|
+ PAGE_CACHE_SIZE, NULL, write_end_fn);
|
|
+ if (ret == 0)
|
|
+ ret = err;
|
|
err = ext4_journal_stop(handle);
|
|
if (!ret)
|
|
ret = err;
|
|
- return ret;
|
|
|
|
-out_fail:
|
|
- redirty_page_for_writepage(wbc, page);
|
|
+ walk_page_buffers(handle, page_bufs, 0,
|
|
+ PAGE_CACHE_SIZE, NULL, bput_one);
|
|
+ EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
|
|
+ goto out;
|
|
+
|
|
+out_unlock:
|
|
unlock_page(page);
|
|
+out:
|
|
return ret;
|
|
}
|
|
|
|
@@ -1624,59 +2762,53 @@ static int ext4_journalled_writepage(struct page *page,
|
|
struct writeback_control *wbc)
|
|
{
|
|
struct inode *inode = page->mapping->host;
|
|
- handle_t *handle = NULL;
|
|
- int ret = 0;
|
|
- int err;
|
|
+ loff_t size = i_size_read(inode);
|
|
+ loff_t len;
|
|
|
|
- if (ext4_journal_current_handle())
|
|
- goto no_write;
|
|
+ J_ASSERT(PageLocked(page));
|
|
+ if (page->index == size >> PAGE_CACHE_SHIFT)
|
|
+ len = size & ~PAGE_CACHE_MASK;
|
|
+ else
|
|
+ len = PAGE_CACHE_SIZE;
|
|
+
|
|
+ if (page_has_buffers(page)) {
|
|
+ /* if page has buffers it should all be mapped
|
|
+ * and allocated. If there are no buffers attached
|
|
+ * to the page we know the page is dirty but it lost
|
|
+ * buffers. That means that at some moment in time
|
|
+ * after write_begin() / write_end() has been called
|
|
+ * all buffers have been clean and thus they must have been
|
|
+ * written at least once. So they are all mapped and we can
|
|
+ * happily proceed with mapping them and writing the page.
|
|
+ */
|
|
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
|
|
+ ext4_bh_unmapped_or_delay));
|
|
+ }
|
|
|
|
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
|
|
- if (IS_ERR(handle)) {
|
|
- ret = PTR_ERR(handle);
|
|
+ if (ext4_journal_current_handle())
|
|
goto no_write;
|
|
- }
|
|
|
|
- if (!page_has_buffers(page) || PageChecked(page)) {
|
|
+ if (PageChecked(page)) {
|
|
/*
|
|
* It's mmapped pagecache. Add buffers and journal it. There
|
|
* doesn't seem much point in redirtying the page here.
|
|
*/
|
|
ClearPageChecked(page);
|
|
- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
|
|
- ext4_get_block);
|
|
- if (ret != 0) {
|
|
- ext4_journal_stop(handle);
|
|
- goto out_unlock;
|
|
- }
|
|
- ret = walk_page_buffers(handle, page_buffers(page), 0,
|
|
- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
|
|
-
|
|
- err = walk_page_buffers(handle, page_buffers(page), 0,
|
|
- PAGE_CACHE_SIZE, NULL, write_end_fn);
|
|
- if (ret == 0)
|
|
- ret = err;
|
|
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
|
|
- unlock_page(page);
|
|
+ return __ext4_journalled_writepage(page, wbc);
|
|
} else {
|
|
/*
|
|
* It may be a page full of checkpoint-mode buffers. We don't
|
|
* really know unless we go poke around in the buffer_heads.
|
|
* But block_write_full_page will do the right thing.
|
|
*/
|
|
- ret = block_write_full_page(page, ext4_get_block, wbc);
|
|
+ return block_write_full_page(page,
|
|
+ ext4_normal_get_block_write,
|
|
+ wbc);
|
|
}
|
|
- err = ext4_journal_stop(handle);
|
|
- if (!ret)
|
|
- ret = err;
|
|
-out:
|
|
- return ret;
|
|
-
|
|
no_write:
|
|
redirty_page_for_writepage(wbc, page);
|
|
-out_unlock:
|
|
unlock_page(page);
|
|
- goto out;
|
|
+ return 0;
|
|
}
|
|
|
|
static int ext4_readpage(struct file *file, struct page *page)
|
|
@@ -1819,7 +2951,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
|
|
static const struct address_space_operations ext4_ordered_aops = {
|
|
.readpage = ext4_readpage,
|
|
.readpages = ext4_readpages,
|
|
- .writepage = ext4_ordered_writepage,
|
|
+ .writepage = ext4_normal_writepage,
|
|
.sync_page = block_sync_page,
|
|
.write_begin = ext4_write_begin,
|
|
.write_end = ext4_ordered_write_end,
|
|
@@ -1833,7 +2965,7 @@ static const struct address_space_operations ext4_ordered_aops = {
|
|
static const struct address_space_operations ext4_writeback_aops = {
|
|
.readpage = ext4_readpage,
|
|
.readpages = ext4_readpages,
|
|
- .writepage = ext4_writeback_writepage,
|
|
+ .writepage = ext4_normal_writepage,
|
|
.sync_page = block_sync_page,
|
|
.write_begin = ext4_write_begin,
|
|
.write_end = ext4_writeback_write_end,
|
|
@@ -1857,10 +2989,31 @@ static const struct address_space_operations ext4_journalled_aops = {
|
|
.releasepage = ext4_releasepage,
|
|
};
|
|
|
|
+static const struct address_space_operations ext4_da_aops = {
|
|
+ .readpage = ext4_readpage,
|
|
+ .readpages = ext4_readpages,
|
|
+ .writepage = ext4_da_writepage,
|
|
+ .writepages = ext4_da_writepages,
|
|
+ .sync_page = block_sync_page,
|
|
+ .write_begin = ext4_da_write_begin,
|
|
+ .write_end = ext4_da_write_end,
|
|
+ .bmap = ext4_bmap,
|
|
+ .invalidatepage = ext4_da_invalidatepage,
|
|
+ .releasepage = ext4_releasepage,
|
|
+ .direct_IO = ext4_direct_IO,
|
|
+ .migratepage = buffer_migrate_page,
|
|
+};
|
|
+
|
|
void ext4_set_aops(struct inode *inode)
|
|
{
|
|
- if (ext4_should_order_data(inode))
|
|
+ if (ext4_should_order_data(inode) &&
|
|
+ test_opt(inode->i_sb, DELALLOC))
|
|
+ inode->i_mapping->a_ops = &ext4_da_aops;
|
|
+ else if (ext4_should_order_data(inode))
|
|
inode->i_mapping->a_ops = &ext4_ordered_aops;
|
|
+ else if (ext4_should_writeback_data(inode) &&
|
|
+ test_opt(inode->i_sb, DELALLOC))
|
|
+ inode->i_mapping->a_ops = &ext4_da_aops;
|
|
else if (ext4_should_writeback_data(inode))
|
|
inode->i_mapping->a_ops = &ext4_writeback_aops;
|
|
else
|
|
@@ -1873,7 +3026,7 @@ void ext4_set_aops(struct inode *inode)
|
|
* This required during truncate. We need to physically zero the tail end
|
|
* of that block so it doesn't yield old data if the file is later grown.
|
|
*/
|
|
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
|
|
+int ext4_block_truncate_page(handle_t *handle,
|
|
struct address_space *mapping, loff_t from)
|
|
{
|
|
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
|
|
@@ -1882,8 +3035,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
|
|
ext4_lblk_t iblock;
|
|
struct inode *inode = mapping->host;
|
|
struct buffer_head *bh;
|
|
+ struct page *page;
|
|
int err = 0;
|
|
|
|
+ page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
|
|
+ if (!page)
|
|
+ return -EINVAL;
|
|
+
|
|
blocksize = inode->i_sb->s_blocksize;
|
|
length = blocksize - (offset & (blocksize - 1));
|
|
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
|
|
@@ -1956,7 +3114,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
|
|
err = ext4_journal_dirty_metadata(handle, bh);
|
|
} else {
|
|
if (ext4_should_order_data(inode))
|
|
- err = ext4_journal_dirty_data(handle, bh);
|
|
+ err = ext4_jbd2_file_inode(handle, inode);
|
|
mark_buffer_dirty(bh);
|
|
}
|
|
|
|
@@ -2179,7 +3337,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
|
|
|
|
if (this_bh) {
|
|
BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
|
|
- ext4_journal_dirty_metadata(handle, this_bh);
|
|
+
|
|
+ /*
|
|
+ * The buffer head should have an attached journal head at this
|
|
+ * point. However, if the data is corrupted and an indirect
|
|
+ * block pointed to itself, it would have been detached when
|
|
+ * the block was cleared. Check for this instead of OOPSing.
|
|
+ */
|
|
+ if (bh2jh(this_bh))
|
|
+ ext4_journal_dirty_metadata(handle, this_bh);
|
|
+ else
|
|
+ ext4_error(inode->i_sb, __func__,
|
|
+ "circular indirect block detected, "
|
|
+ "inode=%lu, block=%llu",
|
|
+ inode->i_ino,
|
|
+ (unsigned long long) this_bh->b_blocknr);
|
|
}
|
|
}
|
|
|
|
@@ -2305,6 +3477,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
|
|
}
|
|
}
|
|
|
|
+int ext4_can_truncate(struct inode *inode)
|
|
+{
|
|
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
|
|
+ return 0;
|
|
+ if (S_ISREG(inode->i_mode))
|
|
+ return 1;
|
|
+ if (S_ISDIR(inode->i_mode))
|
|
+ return 1;
|
|
+ if (S_ISLNK(inode->i_mode))
|
|
+ return !ext4_inode_is_fast_symlink(inode);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
/*
|
|
* ext4_truncate()
|
|
*
|
|
@@ -2347,51 +3532,25 @@ void ext4_truncate(struct inode *inode)
|
|
int n;
|
|
ext4_lblk_t last_block;
|
|
unsigned blocksize = inode->i_sb->s_blocksize;
|
|
- struct page *page;
|
|
|
|
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
|
|
- S_ISLNK(inode->i_mode)))
|
|
- return;
|
|
- if (ext4_inode_is_fast_symlink(inode))
|
|
- return;
|
|
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
|
|
+ if (!ext4_can_truncate(inode))
|
|
return;
|
|
|
|
- /*
|
|
- * We have to lock the EOF page here, because lock_page() nests
|
|
- * outside jbd2_journal_start().
|
|
- */
|
|
- if ((inode->i_size & (blocksize - 1)) == 0) {
|
|
- /* Block boundary? Nothing to do */
|
|
- page = NULL;
|
|
- } else {
|
|
- page = grab_cache_page(mapping,
|
|
- inode->i_size >> PAGE_CACHE_SHIFT);
|
|
- if (!page)
|
|
- return;
|
|
- }
|
|
-
|
|
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
|
|
- ext4_ext_truncate(inode, page);
|
|
+ ext4_ext_truncate(inode);
|
|
return;
|
|
}
|
|
|
|
handle = start_transaction(inode);
|
|
- if (IS_ERR(handle)) {
|
|
- if (page) {
|
|
- clear_highpage(page);
|
|
- flush_dcache_page(page);
|
|
- unlock_page(page);
|
|
- page_cache_release(page);
|
|
- }
|
|
+ if (IS_ERR(handle))
|
|
return; /* AKPM: return what? */
|
|
- }
|
|
|
|
last_block = (inode->i_size + blocksize-1)
|
|
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
|
|
|
|
- if (page)
|
|
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
|
|
+ if (inode->i_size & (blocksize - 1))
|
|
+ if (ext4_block_truncate_page(handle, mapping, inode->i_size))
|
|
+ goto out_stop;
|
|
|
|
n = ext4_block_to_path(inode, last_block, offsets, NULL);
|
|
if (n == 0)
|
|
@@ -2410,6 +3569,14 @@ void ext4_truncate(struct inode *inode)
|
|
goto out_stop;
|
|
|
|
/*
|
|
+ * From here we block out all ext4_get_block() callers who want to
|
|
+ * modify the block allocation tree.
|
|
+ */
|
|
+ down_write(&ei->i_data_sem);
|
|
+
|
|
+ ext4_discard_reservation(inode);
|
|
+
|
|
+ /*
|
|
* The orphan list entry will now protect us from any crash which
|
|
* occurs before the truncate completes, so it is now safe to propagate
|
|
* the new, shorter inode size (held for now in i_size) into the
|
|
@@ -2418,12 +3585,6 @@ void ext4_truncate(struct inode *inode)
|
|
*/
|
|
ei->i_disksize = inode->i_size;
|
|
|
|
- /*
|
|
- * From here we block out all ext4_get_block() callers who want to
|
|
- * modify the block allocation tree.
|
|
- */
|
|
- down_write(&ei->i_data_sem);
|
|
-
|
|
if (n == 1) { /* direct blocks */
|
|
ext4_free_data(handle, inode, NULL, i_data+offsets[0],
|
|
i_data + EXT4_NDIR_BLOCKS);
|
|
@@ -2484,8 +3645,6 @@ do_indirects:
|
|
;
|
|
}
|
|
|
|
- ext4_discard_reservation(inode);
|
|
-
|
|
up_write(&ei->i_data_sem);
|
|
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
|
|
ext4_mark_inode_dirty(handle, inode);
|
|
@@ -2571,6 +3730,16 @@ static int __ext4_get_inode_loc(struct inode *inode,
|
|
}
|
|
if (!buffer_uptodate(bh)) {
|
|
lock_buffer(bh);
|
|
+
|
|
+ /*
|
|
+ * If the buffer has the write error flag, we have failed
|
|
+ * to write out another inode in the same block. In this
|
|
+ * case, we don't have to read the block because we may
|
|
+ * read the old inode data successfully.
|
|
+ */
|
|
+ if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
|
|
+ set_buffer_uptodate(bh);
|
|
+
|
|
if (buffer_uptodate(bh)) {
|
|
/* someone brought it uptodate while we waited */
|
|
unlock_buffer(bh);
|
|
@@ -3107,7 +4276,14 @@ int ext4_write_inode(struct inode *inode, int wait)
|
|
* be freed, so we have a strong guarantee that no future commit will
|
|
* leave these blocks visible to the user.)
|
|
*
|
|
- * Called with inode->sem down.
|
|
+ * Another thing we have to assure is that if we are in ordered mode
|
|
+ * and the inode is still attached to the committing transaction, we must
+ * start writeout of all the dirty pages which are being truncated.
|
|
+ * This way we are sure that all the data written in the previous
|
|
+ * transaction are already on disk (truncate waits for pages under
|
|
+ * writeback).
|
|
+ *
|
|
+ * Called with inode->i_mutex down.
|
|
*/
|
|
int ext4_setattr(struct dentry *dentry, struct iattr *attr)
|
|
{
|
|
@@ -3173,6 +4349,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
|
|
if (!error)
|
|
error = rc;
|
|
ext4_journal_stop(handle);
|
|
+
|
|
+ if (ext4_should_order_data(inode)) {
|
|
+ error = ext4_begin_ordered_truncate(inode,
|
|
+ attr->ia_size);
|
|
+ if (error) {
|
|
+ /* Do as much error cleanup as possible */
|
|
+ handle = ext4_journal_start(inode, 3);
|
|
+ if (IS_ERR(handle)) {
|
|
+ ext4_orphan_del(NULL, inode);
|
|
+ goto err_out;
|
|
+ }
|
|
+ ext4_orphan_del(handle, inode);
|
|
+ ext4_journal_stop(handle);
|
|
+ goto err_out;
|
|
+ }
|
|
+ }
|
|
}
|
|
|
|
rc = inode_setattr(inode, attr);
|
|
@@ -3193,58 +4385,156 @@ err_out:
|
|
return error;
|
|
}
|
|
|
|
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
|
+ struct kstat *stat)
|
|
+{
|
|
+ struct inode *inode;
|
|
+ unsigned long delalloc_blocks;
|
|
+
|
|
+ inode = dentry->d_inode;
|
|
+ generic_fillattr(inode, stat);
|
|
|
|
+ /*
|
|
+ * We can't update i_blocks if the block allocation is delayed
|
|
+ * otherwise in the case of system crash before the real block
|
|
+ * allocation is done, we will have i_blocks inconsistent with
|
|
+ * on-disk file blocks.
|
|
+ * We always keep i_blocks updated together with real
|
|
+ * allocation. But so as not to confuse userspace, stat
|
|
+ * will return the blocks that include the delayed allocation
|
|
+ * blocks for this file.
|
|
+ */
|
|
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
|
|
+ delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
|
|
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
|
|
+
|
|
+ stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
|
|
+ int chunk)
|
|
+{
|
|
+ int indirects;
|
|
+
|
|
+ /* if nrblocks are contiguous */
|
|
+ if (chunk) {
|
|
+ /*
|
|
+ * With N contiguous data blocks, we need at most
|
|
+ * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
|
|
+ * 2 dindirect blocks
|
|
+ * 1 tindirect block
|
|
+ */
|
|
+ indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
|
|
+ return indirects + 3;
|
|
+ }
|
|
+ /*
|
|
+ * if nrblocks are not contiguous, then in the worst case each block touches
+ * an indirect block, and each indirect block touches a double indirect
+ * block, plus a triple indirect block
|
|
+ */
|
|
+ indirects = nrblocks * 2 + 1;
|
|
+ return indirects;
|
|
+}
|
|
+
|
|
+static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
|
|
+{
|
|
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
|
|
+ return ext4_indirect_trans_blocks(inode, nrblocks, 0);
|
|
+ return ext4_ext_index_trans_blocks(inode, nrblocks, 0);
|
|
+}
|
|
/*
|
|
- * How many blocks doth make a writepage()?
|
|
- *
|
|
- * With N blocks per page, it may be:
|
|
- * N data blocks
|
|
- * 2 indirect block
|
|
- * 2 dindirect
|
|
- * 1 tindirect
|
|
- * N+5 bitmap blocks (from the above)
|
|
- * N+5 group descriptor summary blocks
|
|
- * 1 inode block
|
|
- * 1 superblock.
|
|
- * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files
|
|
- *
|
|
- * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
|
|
- *
|
|
- * With ordered or writeback data it's the same, less the N data blocks.
|
|
- *
|
|
- * If the inode's direct blocks can hold an integral number of pages then a
|
|
- * page cannot straddle two indirect blocks, and we can only touch one indirect
|
|
- * and dindirect block, and the "5" above becomes "3".
|
|
- *
|
|
- * This still overestimates under most circumstances. If we were to pass the
|
|
- * start and end offsets in here as well we could do block_to_path() on each
|
|
- * block and work out the exact number of indirects which are touched. Pah.
|
|
+ * Account for index blocks, block group bitmaps and block group
+ * descriptor blocks if both data blocks and index blocks are modified;
+ * in the worst case the index blocks spread over different block groups.
+ *
+ * If data blocks are discontiguous, they can spread over
+ * different block groups too. If they are contiguous, with flexbg
+ * they could still cross a block group boundary.
|
|
+ *
|
|
+ * Also account for superblock, inode, quota and xattr blocks
|
|
*/
|
|
+int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
|
|
+{
|
|
+ int groups, gdpblocks;
|
|
+ int idxblocks;
|
|
+ int ret = 0;
|
|
+
|
|
+ /*
|
|
+ * How many index blocks do we need to touch to modify nrblocks?
+ * The "Chunk" flag indicates whether the nrblocks are
+ * physically contiguous on disk.
+ *
+ * Direct IO and fallocate call get_block to allocate
+ * one single extent at a time, so they can set the "Chunk" flag
|
|
+ */
|
|
+ idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
|
|
|
|
+ ret = idxblocks;
|
|
+
|
|
+ /*
|
|
+ * Now let's see how many group bitmaps and group descriptors need
|
|
+ * to account
|
|
+ */
|
|
+ groups = idxblocks;
|
|
+ if (chunk)
|
|
+ groups += 1;
|
|
+ else
|
|
+ groups += nrblocks;
|
|
+
|
|
+ gdpblocks = groups;
|
|
+ if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
|
|
+ groups = EXT4_SB(inode->i_sb)->s_groups_count;
|
|
+ if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
|
|
+ gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
|
|
+
|
|
+ /* bitmaps and block group descriptor blocks */
|
|
+ ret += groups + gdpblocks;
|
|
+
|
|
+ /* Blocks for super block, inode, quota and xattr blocks */
|
|
+ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Calculate the total number of credits to reserve to fit
+ * the modification of a single page into a single transaction,
+ * which may include multiple chunks of block allocations.
+ *
+ * This could be called via ext4_write_begin().
+ *
+ * We need to consider the worst case, when
+ * each extent consists of a single new block.
|
|
+ */
|
|
int ext4_writepage_trans_blocks(struct inode *inode)
|
|
{
|
|
int bpp = ext4_journal_blocks_per_page(inode);
|
|
- int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
|
|
int ret;
|
|
|
|
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
|
|
- return ext4_ext_writepage_trans_blocks(inode, bpp);
|
|
+ ret = ext4_meta_trans_blocks(inode, bpp, 0);
|
|
|
|
+ /* Account for data blocks for journalled mode */
|
|
if (ext4_should_journal_data(inode))
|
|
- ret = 3 * (bpp + indirects) + 2;
|
|
- else
|
|
- ret = 2 * (bpp + indirects) + 2;
|
|
-
|
|
-#ifdef CONFIG_QUOTA
|
|
- /* We know that structure was already allocated during DQUOT_INIT so
|
|
- * we will be updating only the data blocks + inodes */
|
|
- ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
|
|
-#endif
|
|
-
|
|
+ ret += bpp;
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
+ * Calculate the journal credits for a chunk of data modification.
|
|
+ *
|
|
+ * This is called from DIO, fallocate or whoever calls
+ * ext4_get_blocks_wrap() to map/allocate a chunk of contiguous disk blocks.
+ *
+ * Journal buffers for data blocks are not included here, as DIO
+ * and fallocate do not need to journal data buffers.
|
|
+ */
|
|
+int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
|
|
+{
|
|
+ return ext4_meta_trans_blocks(inode, nrblocks, 1);
|
|
+}
|
|
+
|
|
+/*
|
|
* The caller must have previously called ext4_reserve_inode_write().
|
|
* Give this, we know that the caller already has write access to iloc->bh.
|
|
*/
|
|
@@ -3506,3 +4796,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
|
|
|
|
return err;
|
|
}
|
|
+
|
|
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
|
|
+{
|
|
+ return !buffer_mapped(bh);
|
|
+}
|
|
+
|
|
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
|
|
+{
|
|
+ loff_t size;
|
|
+ unsigned long len;
|
|
+ int ret = -EINVAL;
|
|
+ struct file *file = vma->vm_file;
|
|
+ struct inode *inode = file->f_path.dentry->d_inode;
|
|
+ struct address_space *mapping = inode->i_mapping;
|
|
+
|
|
+ /*
|
|
+ * Get i_alloc_sem to stop truncates messing with the inode. We cannot
|
|
+ * get i_mutex because we are already holding mmap_sem.
|
|
+ */
|
|
+ down_read(&inode->i_alloc_sem);
|
|
+ size = i_size_read(inode);
|
|
+ if (page->mapping != mapping || size <= page_offset(page)
|
|
+ || !PageUptodate(page)) {
|
|
+ /* page got truncated from under us? */
|
|
+ goto out_unlock;
|
|
+ }
|
|
+ ret = 0;
|
|
+ if (PageMappedToDisk(page))
|
|
+ goto out_unlock;
|
|
+
|
|
+ if (page->index == size >> PAGE_CACHE_SHIFT)
|
|
+ len = size & ~PAGE_CACHE_MASK;
|
|
+ else
|
|
+ len = PAGE_CACHE_SIZE;
|
|
+
|
|
+ if (page_has_buffers(page)) {
|
|
+ /* return if we have all the buffers mapped */
|
|
+ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
|
|
+ ext4_bh_unmapped))
|
|
+ goto out_unlock;
|
|
+ }
|
|
+ /*
|
|
+ * OK, we need to fill the hole... Do write_begin/write_end
+ * to do block allocation/reservation. We are not holding
+ * inode->i_mutex here. That allows parallel write_begin,
+ * write_end calls. lock_page prevents this from happening
+ * on the same page, though
|
|
+ */
|
|
+ ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
|
|
+ len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
|
|
+ if (ret < 0)
|
|
+ goto out_unlock;
|
|
+ ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
|
|
+ len, len, page, NULL);
|
|
+ if (ret < 0)
|
|
+ goto out_unlock;
|
|
+ ret = 0;
|
|
+out_unlock:
|
|
+ up_read(&inode->i_alloc_sem);
|
|
+ return ret;
|
|
+}
|
|
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
|
|
index c9900aa..e0e3a5e 100644
|
|
--- a/fs/ext4/mballoc.c
|
|
+++ b/fs/ext4/mballoc.c
|
|
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
|
|
|
|
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
|
|
{
|
|
- int fix = 0;
|
|
+ int fix = 0, ret, tmpmax;
|
|
addr = mb_correct_addr_and_bit(&fix, addr);
|
|
- max += fix;
|
|
+ tmpmax = max + fix;
|
|
start += fix;
|
|
|
|
- return ext4_find_next_zero_bit(addr, max, start) - fix;
|
|
+ ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
|
|
+ if (ret > max)
|
|
+ return max;
|
|
+ return ret;
|
|
}
|
|
|
|
static inline int mb_find_next_bit(void *addr, int max, int start)
|
|
{
|
|
- int fix = 0;
|
|
+ int fix = 0, ret, tmpmax;
|
|
addr = mb_correct_addr_and_bit(&fix, addr);
|
|
- max += fix;
|
|
+ tmpmax = max + fix;
|
|
start += fix;
|
|
|
|
- return ext4_find_next_bit(addr, max, start) - fix;
|
|
+ ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
|
|
+ if (ret > max)
|
|
+ return max;
|
|
+ return ret;
|
|
}
|
|
|
|
static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
|
|
@@ -781,13 +787,16 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
|
|
if (bh_uptodate_or_lock(bh[i]))
|
|
continue;
|
|
|
|
+ spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
|
|
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
|
|
ext4_init_block_bitmap(sb, bh[i],
|
|
first_group + i, desc);
|
|
set_buffer_uptodate(bh[i]);
|
|
unlock_buffer(bh[i]);
|
|
+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
|
|
continue;
|
|
}
|
|
+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
|
|
get_bh(bh[i]);
|
|
bh[i]->b_end_io = end_buffer_read_sync;
|
|
submit_bh(READ, bh[i]);
|
|
@@ -803,6 +812,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
|
|
if (!buffer_uptodate(bh[i]))
|
|
goto out;
|
|
|
|
+ err = 0;
|
|
first_block = page->index * blocks_per_page;
|
|
for (i = 0; i < blocks_per_page; i++) {
|
|
int group;
|
|
@@ -883,6 +893,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
|
|
int pnum;
|
|
int poff;
|
|
struct page *page;
|
|
+ int ret;
|
|
|
|
mb_debug("load group %lu\n", group);
|
|
|
|
@@ -914,15 +925,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
|
|
if (page) {
|
|
BUG_ON(page->mapping != inode->i_mapping);
|
|
if (!PageUptodate(page)) {
|
|
- ext4_mb_init_cache(page, NULL);
|
|
+ ret = ext4_mb_init_cache(page, NULL);
|
|
+ if (ret) {
|
|
+ unlock_page(page);
|
|
+ goto err;
|
|
+ }
|
|
mb_cmp_bitmaps(e4b, page_address(page) +
|
|
(poff * sb->s_blocksize));
|
|
}
|
|
unlock_page(page);
|
|
}
|
|
}
|
|
- if (page == NULL || !PageUptodate(page))
|
|
+ if (page == NULL || !PageUptodate(page)) {
|
|
+ ret = -EIO;
|
|
goto err;
|
|
+ }
|
|
e4b->bd_bitmap_page = page;
|
|
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
|
|
mark_page_accessed(page);
|
|
@@ -938,14 +955,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
|
|
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
|
|
if (page) {
|
|
BUG_ON(page->mapping != inode->i_mapping);
|
|
- if (!PageUptodate(page))
|
|
- ext4_mb_init_cache(page, e4b->bd_bitmap);
|
|
-
|
|
+ if (!PageUptodate(page)) {
|
|
+ ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
|
|
+ if (ret) {
|
|
+ unlock_page(page);
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
unlock_page(page);
|
|
}
|
|
}
|
|
- if (page == NULL || !PageUptodate(page))
|
|
+ if (page == NULL || !PageUptodate(page)) {
|
|
+ ret = -EIO;
|
|
goto err;
|
|
+ }
|
|
e4b->bd_buddy_page = page;
|
|
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
|
|
mark_page_accessed(page);
|
|
@@ -962,7 +985,7 @@ err:
|
|
page_cache_release(e4b->bd_buddy_page);
|
|
e4b->bd_buddy = NULL;
|
|
e4b->bd_bitmap = NULL;
|
|
- return -EIO;
|
|
+ return ret;
|
|
}
|
|
|
|
static void ext4_mb_release_desc(struct ext4_buddy *e4b)
|
|
@@ -1031,7 +1054,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
|
|
}
|
|
}
|
|
|
|
-static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
|
|
+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
|
|
int first, int count)
|
|
{
|
|
int block = 0;
|
|
@@ -1071,11 +1094,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
|
|
blocknr += block;
|
|
blocknr +=
|
|
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
|
|
-
|
|
+ ext4_unlock_group(sb, e4b->bd_group);
|
|
ext4_error(sb, __func__, "double-free of inode"
|
|
" %lu's block %llu(bit %u in group %lu)\n",
|
|
inode ? inode->i_ino : 0, blocknr, block,
|
|
e4b->bd_group);
|
|
+ ext4_lock_group(sb, e4b->bd_group);
|
|
}
|
|
mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
|
|
e4b->bd_info->bb_counters[order]++;
|
|
@@ -1113,8 +1137,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
|
|
} while (1);
|
|
}
|
|
mb_check_buddy(e4b);
|
|
-
|
|
- return 0;
|
|
}
|
|
|
|
static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
|
|
@@ -1730,10 +1752,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
|
|
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
|
|
spin_unlock(&sbi->s_md_lock);
|
|
}
|
|
-
|
|
- /* searching for the right group start from the goal value specified */
|
|
- group = ac->ac_g_ex.fe_group;
|
|
-
|
|
/* Let's just scan groups to find more-less suitable blocks */
|
|
cr = ac->ac_2order ? 0 : 1;
|
|
/*
|
|
@@ -1743,6 +1761,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
|
|
repeat:
|
|
for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
|
|
ac->ac_criteria = cr;
|
|
+ /*
|
|
+ * searching for the right group start
|
|
+ * from the goal value specified
|
|
+ */
|
|
+ group = ac->ac_g_ex.fe_group;
|
|
+
|
|
for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
|
|
struct ext4_group_info *grp;
|
|
struct ext4_group_desc *desc;
|
|
@@ -1963,6 +1987,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
|
|
int rc;
|
|
int size;
|
|
|
|
+ if (unlikely(sbi->s_mb_history == NULL))
|
|
+ return -ENOMEM;
|
|
s = kmalloc(sizeof(*s), GFP_KERNEL);
|
|
if (s == NULL)
|
|
return -ENOMEM;
|
|
@@ -2165,9 +2191,7 @@ static void ext4_mb_history_init(struct super_block *sb)
|
|
sbi->s_mb_history_cur = 0;
|
|
spin_lock_init(&sbi->s_mb_history_lock);
|
|
i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
|
|
- sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
|
|
- if (likely(sbi->s_mb_history != NULL))
|
|
- memset(sbi->s_mb_history, 0, i);
|
|
+ sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
|
|
/* if we can't allocate history, then we simple won't use it */
|
|
}
|
|
|
|
@@ -2215,21 +2239,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
|
|
#define ext4_mb_history_init(sb)
|
|
#endif
|
|
|
|
+
|
|
+/* Create and initialize ext4_group_info data for the given group. */
|
|
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
|
|
+ struct ext4_group_desc *desc)
|
|
+{
|
|
+ int i, len;
|
|
+ int metalen = 0;
|
|
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
|
|
+ struct ext4_group_info **meta_group_info;
|
|
+
|
|
+ /*
|
|
+ * First check if this group is the first of a reserved block.
|
|
+ * If it's true, we have to allocate a new table of pointers
|
|
+ * to ext4_group_info structures
|
|
+ */
|
|
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
|
|
+ metalen = sizeof(*meta_group_info) <<
|
|
+ EXT4_DESC_PER_BLOCK_BITS(sb);
|
|
+ meta_group_info = kmalloc(metalen, GFP_KERNEL);
|
|
+ if (meta_group_info == NULL) {
|
|
+ printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
|
|
+ "buddy group\n");
|
|
+ goto exit_meta_group_info;
|
|
+ }
|
|
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
|
|
+ meta_group_info;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * calculate needed size. if change bb_counters size,
|
|
+ * don't forget about ext4_mb_generate_buddy()
|
|
+ */
|
|
+ len = offsetof(typeof(**meta_group_info),
|
|
+ bb_counters[sb->s_blocksize_bits + 2]);
|
|
+
|
|
+ meta_group_info =
|
|
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
|
|
+ i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
|
|
+
|
|
+ meta_group_info[i] = kzalloc(len, GFP_KERNEL);
|
|
+ if (meta_group_info[i] == NULL) {
|
|
+ printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
|
|
+ goto exit_group_info;
|
|
+ }
|
|
+ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
|
|
+ &(meta_group_info[i]->bb_state));
|
|
+
|
|
+ /*
|
|
+ * initialize bb_free to be able to skip
|
|
+ * empty groups without initialization
|
|
+ */
|
|
+ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
|
|
+ meta_group_info[i]->bb_free =
|
|
+ ext4_free_blocks_after_init(sb, group, desc);
|
|
+ } else {
|
|
+ meta_group_info[i]->bb_free =
|
|
+ le16_to_cpu(desc->bg_free_blocks_count);
|
|
+ }
|
|
+
|
|
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
|
|
+
|
|
+#ifdef DOUBLE_CHECK
|
|
+ {
|
|
+ struct buffer_head *bh;
|
|
+ meta_group_info[i]->bb_bitmap =
|
|
+ kmalloc(sb->s_blocksize, GFP_KERNEL);
|
|
+ BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
|
|
+ bh = ext4_read_block_bitmap(sb, group);
|
|
+ BUG_ON(bh == NULL);
|
|
+ memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
|
|
+ sb->s_blocksize);
|
|
+ put_bh(bh);
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ return 0;
|
|
+
|
|
+exit_group_info:
|
|
+ /* If a meta_group_info table has been allocated, release it now */
|
|
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
|
|
+ kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
|
|
+exit_meta_group_info:
|
|
+ return -ENOMEM;
|
|
+} /* ext4_mb_add_groupinfo */
|
|
+
|
|
+/*
|
|
+ * Add a group to the existing groups.
|
|
+ * This function is used for online resize
|
|
+ */
|
|
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
|
|
+ struct ext4_group_desc *desc)
|
|
+{
|
|
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
|
|
+ struct inode *inode = sbi->s_buddy_cache;
|
|
+ int blocks_per_page;
|
|
+ int block;
|
|
+ int pnum;
|
|
+ struct page *page;
|
|
+ int err;
|
|
+
|
|
+ /* Add group based on group descriptor*/
|
|
+ err = ext4_mb_add_groupinfo(sb, group, desc);
|
|
+ if (err)
|
|
+ return err;
|
|
+
|
|
+ /*
|
|
+ * Cache pages containing dynamic mb_alloc data (buddy and bitmap
+ * data) are marked not up to date so that they will be re-initialized
+ * during the next call to ext4_mb_load_buddy
|
|
+ */
|
|
+
|
|
+ /* Set buddy page as not up to date */
|
|
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
|
|
+ block = group * 2;
|
|
+ pnum = block / blocks_per_page;
|
|
+ page = find_get_page(inode->i_mapping, pnum);
|
|
+ if (page != NULL) {
|
|
+ ClearPageUptodate(page);
|
|
+ page_cache_release(page);
|
|
+ }
|
|
+
|
|
+ /* Set bitmap page as not up to date */
|
|
+ block++;
|
|
+ pnum = block / blocks_per_page;
|
|
+ page = find_get_page(inode->i_mapping, pnum);
|
|
+ if (page != NULL) {
|
|
+ ClearPageUptodate(page);
|
|
+ page_cache_release(page);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Update an existing group.
|
|
+ * This function is used for online resize
|
|
+ */
|
|
+void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
|
|
+{
|
|
+ grp->bb_free += add;
|
|
+}
|
|
+
|
|
 static int ext4_mb_init_backend(struct super_block *sb)
 {
 	ext4_group_t i;
-	int j, len, metalen;
+	int metalen;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	int num_meta_group_infos =
-		(sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
-			EXT4_DESC_PER_BLOCK_BITS(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int num_meta_group_infos;
+	int num_meta_group_infos_max;
+	int array_size;
 	struct ext4_group_info **meta_group_info;
+	struct ext4_group_desc *desc;
 
+	/* This is the number of blocks used by GDT */
+	num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+				1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
+
+	/*
+	 * This is the total number of blocks used by GDT including
+	 * the number of reserved blocks for GDT.
+	 * The s_group_info array is allocated with this value
+	 * to allow a clean online resize without a complex
+	 * manipulation of pointers.
+	 * The drawback is the unused memory when no resize
+	 * occurs but it's very low in terms of pages
+	 * (see comments below)
+	 * Need to handle this properly when META_BG resizing is allowed
+	 */
+	num_meta_group_infos_max = num_meta_group_infos +
+		le16_to_cpu(es->s_reserved_gdt_blocks);
+
+	/*
+	 * array_size is the size of s_group_info array. We round it
+	 * to the next power of two because this approximation is done
+	 * internally by kmalloc so we can have some more memory
+	 * for free here (e.g. may be used for META_BG resize).
+	 */
+	array_size = 1;
+	while (array_size < sizeof(*sbi->s_group_info) *
+			num_meta_group_infos_max)
+		array_size = array_size << 1;
 	/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
 	 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
 	 * So a two level scheme suffices for now. */
-	sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
-				    num_meta_group_infos, GFP_KERNEL);
+	sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
 	if (sbi->s_group_info == NULL) {
 		printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
 		return -ENOMEM;
@@ -2256,63 +2451,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
|
|
sbi->s_group_info[i] = meta_group_info;
|
|
}
|
|
|
|
- /*
|
|
- * calculate needed size. if change bb_counters size,
|
|
- * don't forget about ext4_mb_generate_buddy()
|
|
- */
|
|
- len = sizeof(struct ext4_group_info);
|
|
- len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
|
|
for (i = 0; i < sbi->s_groups_count; i++) {
|
|
- struct ext4_group_desc *desc;
|
|
-
|
|
- meta_group_info =
|
|
- sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
|
|
- j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
|
|
-
|
|
- meta_group_info[j] = kzalloc(len, GFP_KERNEL);
|
|
- if (meta_group_info[j] == NULL) {
|
|
- printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
|
|
- goto err_freebuddy;
|
|
- }
|
|
desc = ext4_get_group_desc(sb, i, NULL);
|
|
if (desc == NULL) {
|
|
printk(KERN_ERR
|
|
"EXT4-fs: can't read descriptor %lu\n", i);
|
|
- i++;
|
|
goto err_freebuddy;
|
|
}
|
|
- memset(meta_group_info[j], 0, len);
|
|
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
|
|
- &(meta_group_info[j]->bb_state));
|
|
-
|
|
- /*
|
|
- * initialize bb_free to be able to skip
|
|
- * empty groups without initialization
|
|
- */
|
|
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
|
|
- meta_group_info[j]->bb_free =
|
|
- ext4_free_blocks_after_init(sb, i, desc);
|
|
- } else {
|
|
- meta_group_info[j]->bb_free =
|
|
- le16_to_cpu(desc->bg_free_blocks_count);
|
|
- }
|
|
-
|
|
- INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
|
|
-
|
|
-#ifdef DOUBLE_CHECK
|
|
- {
|
|
- struct buffer_head *bh;
|
|
- meta_group_info[j]->bb_bitmap =
|
|
- kmalloc(sb->s_blocksize, GFP_KERNEL);
|
|
- BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
|
|
- bh = read_block_bitmap(sb, i);
|
|
- BUG_ON(bh == NULL);
|
|
- memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
|
|
- sb->s_blocksize);
|
|
- put_bh(bh);
|
|
- }
|
|
-#endif
|
|
-
|
|
+ if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
|
|
+ goto err_freebuddy;
|
|
}
|
|
|
|
return 0;
|
|
@@ -2333,9 +2480,10 @@ err_freesgi:
|
|
int ext4_mb_init(struct super_block *sb, int needs_recovery)
|
|
{
|
|
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
|
- unsigned i;
|
|
+ unsigned i, j;
|
|
unsigned offset;
|
|
unsigned max;
|
|
+ int ret;
|
|
|
|
if (!test_opt(sb, MBALLOC))
|
|
return 0;
|
|
@@ -2370,12 +2518,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
|
|
} while (i <= sb->s_blocksize_bits + 1);
|
|
|
|
/* init file for buddy data */
|
|
- i = ext4_mb_init_backend(sb);
|
|
- if (i) {
|
|
+ ret = ext4_mb_init_backend(sb);
|
|
+ if (ret != 0) {
|
|
clear_opt(sbi->s_mount_opt, MBALLOC);
|
|
kfree(sbi->s_mb_offsets);
|
|
kfree(sbi->s_mb_maxs);
|
|
- return i;
|
|
+ return ret;
|
|
}
|
|
|
|
spin_lock_init(&sbi->s_md_lock);
|
|
@@ -2392,7 +2540,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
|
|
sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
|
|
sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
|
|
|
|
- i = sizeof(struct ext4_locality_group) * NR_CPUS;
|
|
+ i = sizeof(struct ext4_locality_group) * nr_cpu_ids;
|
|
sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
|
|
if (sbi->s_locality_groups == NULL) {
|
|
clear_opt(sbi->s_mount_opt, MBALLOC);
|
|
@@ -2400,11 +2548,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
|
|
kfree(sbi->s_mb_maxs);
|
|
return -ENOMEM;
|
|
}
|
|
- for (i = 0; i < NR_CPUS; i++) {
|
|
+ for (i = 0; i < nr_cpu_ids; i++) {
|
|
struct ext4_locality_group *lg;
|
|
lg = &sbi->s_locality_groups[i];
|
|
mutex_init(&lg->lg_mutex);
|
|
- INIT_LIST_HEAD(&lg->lg_prealloc_list);
|
|
+ for (j = 0; j < PREALLOC_TB_SIZE; j++)
|
|
+ INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
|
|
spin_lock_init(&lg->lg_prealloc_lock);
|
|
}
|
|
|
|
@@ -2548,8 +2697,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
|
|
ext4_lock_group(sb, md->group);
|
|
for (i = 0; i < md->num; i++) {
|
|
mb_debug(" %u", md->blocks[i]);
|
|
- err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
|
|
- BUG_ON(err != 0);
|
|
+ mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
|
|
}
|
|
mb_debug("\n");
|
|
ext4_unlock_group(sb, md->group);
|
|
@@ -2575,25 +2723,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
|
|
|
|
|
|
|
|
-#define MB_PROC_VALUE_READ(name) \
|
|
-static int ext4_mb_read_##name(char *page, char **start, \
|
|
- off_t off, int count, int *eof, void *data) \
|
|
+#define MB_PROC_FOPS(name) \
|
|
+static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
|
|
{ \
|
|
- struct ext4_sb_info *sbi = data; \
|
|
- int len; \
|
|
- *eof = 1; \
|
|
- if (off != 0) \
|
|
- return 0; \
|
|
- len = sprintf(page, "%ld\n", sbi->s_mb_##name); \
|
|
- *start = page; \
|
|
- return len; \
|
|
-}
|
|
-
|
|
-#define MB_PROC_VALUE_WRITE(name) \
|
|
-static int ext4_mb_write_##name(struct file *file, \
|
|
- const char __user *buf, unsigned long cnt, void *data) \
|
|
+ struct ext4_sb_info *sbi = m->private; \
|
|
+ \
|
|
+ seq_printf(m, "%ld\n", sbi->s_mb_##name); \
|
|
+ return 0; \
|
|
+} \
|
|
+ \
|
|
+static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
|
|
{ \
|
|
- struct ext4_sb_info *sbi = data; \
|
|
+ return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
|
|
+} \
|
|
+ \
|
|
+static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
|
|
+ const char __user *buf, size_t cnt, loff_t *ppos) \
|
|
+{ \
|
|
+ struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
|
|
char str[32]; \
|
|
long value; \
|
|
if (cnt >= sizeof(str)) \
|
|
@@ -2605,31 +2752,32 @@ static int ext4_mb_write_##name(struct file *file, \
|
|
return -ERANGE; \
|
|
sbi->s_mb_##name = value; \
|
|
return cnt; \
|
|
-}
|
|
+} \
|
|
+ \
|
|
+static const struct file_operations ext4_mb_##name##_proc_fops = { \
|
|
+ .owner = THIS_MODULE, \
|
|
+ .open = ext4_mb_##name##_proc_open, \
|
|
+ .read = seq_read, \
|
|
+ .llseek = seq_lseek, \
|
|
+ .release = single_release, \
|
|
+ .write = ext4_mb_##name##_proc_write, \
|
|
+};
|
|
|
|
-MB_PROC_VALUE_READ(stats);
|
|
-MB_PROC_VALUE_WRITE(stats);
|
|
-MB_PROC_VALUE_READ(max_to_scan);
|
|
-MB_PROC_VALUE_WRITE(max_to_scan);
|
|
-MB_PROC_VALUE_READ(min_to_scan);
|
|
-MB_PROC_VALUE_WRITE(min_to_scan);
|
|
-MB_PROC_VALUE_READ(order2_reqs);
|
|
-MB_PROC_VALUE_WRITE(order2_reqs);
|
|
-MB_PROC_VALUE_READ(stream_request);
|
|
-MB_PROC_VALUE_WRITE(stream_request);
|
|
-MB_PROC_VALUE_READ(group_prealloc);
|
|
-MB_PROC_VALUE_WRITE(group_prealloc);
|
|
+MB_PROC_FOPS(stats);
|
|
+MB_PROC_FOPS(max_to_scan);
|
|
+MB_PROC_FOPS(min_to_scan);
|
|
+MB_PROC_FOPS(order2_reqs);
|
|
+MB_PROC_FOPS(stream_request);
|
|
+MB_PROC_FOPS(group_prealloc);
|
|
|
|
#define MB_PROC_HANDLER(name, var) \
|
|
do { \
|
|
- proc = create_proc_entry(name, mode, sbi->s_mb_proc); \
|
|
+ proc = proc_create_data(name, mode, sbi->s_mb_proc, \
|
|
+ &ext4_mb_##var##_proc_fops, sbi); \
|
|
if (proc == NULL) { \
|
|
printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
|
|
goto err_out; \
|
|
} \
|
|
- proc->data = sbi; \
|
|
- proc->read_proc = ext4_mb_read_##var ; \
|
|
- proc->write_proc = ext4_mb_write_##var; \
|
|
} while (0)
|
|
|
|
static int ext4_mb_init_per_dev_proc(struct super_block *sb)
|
|
@@ -2639,6 +2787,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
|
|
struct proc_dir_entry *proc;
|
|
char devname[64];
|
|
|
|
+ if (proc_root_ext4 == NULL) {
|
|
+ sbi->s_mb_proc = NULL;
|
|
+ return -EINVAL;
|
|
+ }
|
|
bdevname(sb->s_bdev, devname);
|
|
sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
|
|
|
|
@@ -2747,7 +2899,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
|
|
|
|
|
|
err = -EIO;
|
|
- bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
|
|
+ bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
|
|
if (!bitmap_bh)
|
|
goto out_err;
|
|
|
|
@@ -2816,7 +2968,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
|
|
le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
|
|
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
|
|
spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
|
|
- percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
|
|
+
|
|
+ /*
|
|
+ * free blocks account has already be reduced/reserved
|
|
+ * at write_begin() time for delayed allocation
|
|
+ * do not double accounting
|
|
+ */
|
|
+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
|
|
+ percpu_counter_sub(&sbi->s_freeblocks_counter,
|
|
+ ac->ac_b_ex.fe_len);
|
|
+
|
|
+ if (sbi->s_log_groups_per_flex) {
|
|
+ ext4_group_t flex_group = ext4_flex_group(sbi,
|
|
+ ac->ac_b_ex.fe_group);
|
|
+ spin_lock(sb_bgl_lock(sbi, flex_group));
|
|
+ sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
|
|
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
|
|
+ }
|
|
|
|
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
|
|
if (err)
|
|
@@ -3096,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
|
|
struct ext4_prealloc_space *pa)
|
|
{
|
|
unsigned int len = ac->ac_o_ex.fe_len;
|
|
+
|
|
ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
|
|
&ac->ac_b_ex.fe_group,
|
|
&ac->ac_b_ex.fe_start);
|
|
@@ -3113,14 +3282,45 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
|
|
}
|
|
|
|
/*
|
|
+ * Return the prealloc space that has minimal distance
+ * from the goal block. @cpa is the prealloc
+ * space that has the currently known minimal distance
+ * from the goal block.
+ */
+static struct ext4_prealloc_space *
|
|
+ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
|
|
+ struct ext4_prealloc_space *pa,
|
|
+ struct ext4_prealloc_space *cpa)
|
|
+{
|
|
+ ext4_fsblk_t cur_distance, new_distance;
|
|
+
|
|
+ if (cpa == NULL) {
|
|
+ atomic_inc(&pa->pa_count);
|
|
+ return pa;
|
|
+ }
|
|
+ cur_distance = abs(goal_block - cpa->pa_pstart);
|
|
+ new_distance = abs(goal_block - pa->pa_pstart);
|
|
+
|
|
+ if (cur_distance < new_distance)
|
|
+ return cpa;
|
|
+
|
|
+ /* drop the previous reference */
|
|
+ atomic_dec(&cpa->pa_count);
|
|
+ atomic_inc(&pa->pa_count);
|
|
+ return pa;
|
|
+}
|
|
+
|
|
+/*
|
|
* search goal blocks in preallocated space
|
|
*/
|
|
static noinline_for_stack int
|
|
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
|
|
{
|
|
+ int order, i;
|
|
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
|
|
struct ext4_locality_group *lg;
|
|
- struct ext4_prealloc_space *pa;
|
|
+ struct ext4_prealloc_space *pa, *cpa = NULL;
|
|
+ ext4_fsblk_t goal_block;
|
|
|
|
/* only data can be preallocated */
|
|
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
|
|
@@ -3158,22 +3358,38 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
|
|
lg = ac->ac_lg;
|
|
if (lg == NULL)
|
|
return 0;
|
|
+ order = fls(ac->ac_o_ex.fe_len) - 1;
|
|
+ if (order > PREALLOC_TB_SIZE - 1)
|
|
+ /* The max size of hash table is PREALLOC_TB_SIZE */
|
|
+ order = PREALLOC_TB_SIZE - 1;
|
|
+
|
|
+ goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
|
|
+ ac->ac_g_ex.fe_start +
|
|
+ le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
|
|
+ /*
|
|
+ * search for the prealloc space that is having
|
|
+ * minimal distance from the goal block.
|
|
+ */
|
|
+ for (i = order; i < PREALLOC_TB_SIZE; i++) {
|
|
+ rcu_read_lock();
|
|
+ list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
|
|
+ pa_inode_list) {
|
|
+ spin_lock(&pa->pa_lock);
|
|
+ if (pa->pa_deleted == 0 &&
|
|
+ pa->pa_free >= ac->ac_o_ex.fe_len) {
|
|
|
|
- rcu_read_lock();
|
|
- list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) {
|
|
- spin_lock(&pa->pa_lock);
|
|
- if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
|
|
- atomic_inc(&pa->pa_count);
|
|
- ext4_mb_use_group_pa(ac, pa);
|
|
+ cpa = ext4_mb_check_group_pa(goal_block,
|
|
+ pa, cpa);
|
|
+ }
|
|
spin_unlock(&pa->pa_lock);
|
|
- ac->ac_criteria = 20;
|
|
- rcu_read_unlock();
|
|
- return 1;
|
|
}
|
|
- spin_unlock(&pa->pa_lock);
|
|
+ rcu_read_unlock();
|
|
+ }
|
|
+ if (cpa) {
|
|
+ ext4_mb_use_group_pa(ac, cpa);
|
|
+ ac->ac_criteria = 20;
|
|
+ return 1;
|
|
}
|
|
- rcu_read_unlock();
|
|
-
|
|
return 0;
|
|
}
|
|
|
|
@@ -3396,6 +3612,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
|
|
pa->pa_free = pa->pa_len;
|
|
atomic_set(&pa->pa_count, 1);
|
|
spin_lock_init(&pa->pa_lock);
|
|
+ INIT_LIST_HEAD(&pa->pa_inode_list);
|
|
pa->pa_deleted = 0;
|
|
pa->pa_linear = 1;
|
|
|
|
@@ -3416,10 +3633,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
|
|
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
|
|
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
|
|
|
|
- spin_lock(pa->pa_obj_lock);
|
|
- list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
|
|
- spin_unlock(pa->pa_obj_lock);
|
|
-
|
|
+ /*
|
|
+ * We will later add the new pa to the right bucket
|
|
+ * after updating the pa_free in ext4_mb_release_context
|
|
+ */
|
|
return 0;
|
|
}
|
|
|
|
@@ -3473,8 +3690,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
|
|
if (bit >= end)
|
|
break;
|
|
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
|
|
- if (next > end)
|
|
- next = end;
|
|
start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
|
|
le32_to_cpu(sbi->s_es->s_first_data_block);
|
|
mb_debug(" free preallocated %u/%u in group %u\n",
|
|
@@ -3569,22 +3784,25 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
|
|
if (list_empty(&grp->bb_prealloc_list))
|
|
return 0;
|
|
|
|
- bitmap_bh = read_block_bitmap(sb, group);
|
|
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
|
|
if (bitmap_bh == NULL) {
|
|
- /* error handling here */
|
|
- ext4_mb_release_desc(&e4b);
|
|
- BUG_ON(bitmap_bh == NULL);
|
|
+ ext4_error(sb, __func__, "Error in reading block "
|
|
+ "bitmap for %lu\n", group);
|
|
+ return 0;
|
|
}
|
|
|
|
err = ext4_mb_load_buddy(sb, group, &e4b);
|
|
- BUG_ON(err != 0); /* error handling here */
|
|
+ if (err) {
|
|
+ ext4_error(sb, __func__, "Error in loading buddy "
|
|
+ "information for %lu\n", group);
|
|
+ put_bh(bitmap_bh);
|
|
+ return 0;
|
|
+ }
|
|
|
|
if (needed == 0)
|
|
needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
|
|
|
|
- grp = ext4_get_group_info(sb, group);
|
|
INIT_LIST_HEAD(&list);
|
|
-
|
|
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
|
|
repeat:
|
|
ext4_lock_group(sb, group);
|
|
@@ -3741,13 +3959,18 @@ repeat:
|
|
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
|
|
|
|
err = ext4_mb_load_buddy(sb, group, &e4b);
|
|
- BUG_ON(err != 0); /* error handling here */
|
|
+ if (err) {
|
|
+ ext4_error(sb, __func__, "Error in loading buddy "
|
|
+ "information for %lu\n", group);
|
|
+ continue;
|
|
+ }
|
|
|
|
- bitmap_bh = read_block_bitmap(sb, group);
|
|
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
|
|
if (bitmap_bh == NULL) {
|
|
- /* error handling here */
|
|
+ ext4_error(sb, __func__, "Error in reading block "
|
|
+ "bitmap for %lu\n", group);
|
|
ext4_mb_release_desc(&e4b);
|
|
- BUG_ON(bitmap_bh == NULL);
|
|
+ continue;
|
|
}
|
|
|
|
ext4_lock_group(sb, group);
|
|
@@ -3950,22 +4173,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
|
|
|
|
}
|
|
|
|
+static noinline_for_stack void
|
|
+ext4_mb_discard_lg_preallocations(struct super_block *sb,
|
|
+ struct ext4_locality_group *lg,
|
|
+ int order, int total_entries)
|
|
+{
|
|
+ ext4_group_t group = 0;
|
|
+ struct ext4_buddy e4b;
|
|
+ struct list_head discard_list;
|
|
+ struct ext4_prealloc_space *pa, *tmp;
|
|
+ struct ext4_allocation_context *ac;
|
|
+
|
|
+ mb_debug("discard locality group preallocation\n");
|
|
+
|
|
+ INIT_LIST_HEAD(&discard_list);
|
|
+ ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
|
|
+
|
|
+ spin_lock(&lg->lg_prealloc_lock);
|
|
+ list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
|
|
+ pa_inode_list) {
|
|
+ spin_lock(&pa->pa_lock);
|
|
+ if (atomic_read(&pa->pa_count)) {
|
|
+ /*
|
|
+ * This is the pa that we just used
|
|
+ * for block allocation. So don't
|
|
+ * free that
|
|
+ */
|
|
+ spin_unlock(&pa->pa_lock);
|
|
+ continue;
|
|
+ }
|
|
+ if (pa->pa_deleted) {
|
|
+ spin_unlock(&pa->pa_lock);
|
|
+ continue;
|
|
+ }
|
|
+ /* only lg prealloc space */
|
|
+ BUG_ON(!pa->pa_linear);
|
|
+
|
|
+ /* seems this one can be freed ... */
|
|
+ pa->pa_deleted = 1;
|
|
+ spin_unlock(&pa->pa_lock);
|
|
+
|
|
+ list_del_rcu(&pa->pa_inode_list);
|
|
+ list_add(&pa->u.pa_tmp_list, &discard_list);
|
|
+
|
|
+ total_entries--;
|
|
+ if (total_entries <= 5) {
|
|
+ /*
|
|
+ * we want to keep only 5 entries
|
|
+ * allowing it to grow to 8. This
|
|
+ * mak sure we don't call discard
|
|
+ * soon for this list.
|
|
+ */
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ spin_unlock(&lg->lg_prealloc_lock);
|
|
+
|
|
+ list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
|
|
+
|
|
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
|
|
+ if (ext4_mb_load_buddy(sb, group, &e4b)) {
|
|
+ ext4_error(sb, __func__, "Error in loading buddy "
|
|
+ "information for %lu\n", group);
|
|
+ continue;
|
|
+ }
|
|
+ ext4_lock_group(sb, group);
|
|
+ list_del(&pa->pa_group_list);
|
|
+ ext4_mb_release_group_pa(&e4b, pa, ac);
|
|
+ ext4_unlock_group(sb, group);
|
|
+
|
|
+ ext4_mb_release_desc(&e4b);
|
|
+ list_del(&pa->u.pa_tmp_list);
|
|
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
|
|
+ }
|
|
+ if (ac)
|
|
+ kmem_cache_free(ext4_ac_cachep, ac);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * We have incremented pa_count. So it cannot be freed at this
|
|
+ * point. Also we hold lg_mutex. So no parallel allocation is
|
|
+ * possible from this lg. That means pa_free cannot be updated.
|
|
+ *
|
|
+ * A parallel ext4_mb_discard_group_preallocations is possible,
+ * which can cause the lg_prealloc_list to be updated.
+ */
|
|
+
|
|
+static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
|
|
+{
|
|
+ int order, added = 0, lg_prealloc_count = 1;
|
|
+ struct super_block *sb = ac->ac_sb;
|
|
+ struct ext4_locality_group *lg = ac->ac_lg;
|
|
+ struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
|
|
+
|
|
+ order = fls(pa->pa_free) - 1;
|
|
+ if (order > PREALLOC_TB_SIZE - 1)
|
|
+ /* The max size of hash table is PREALLOC_TB_SIZE */
|
|
+ order = PREALLOC_TB_SIZE - 1;
|
|
+ /* Add the prealloc space to lg */
|
|
+ rcu_read_lock();
|
|
+ list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
|
|
+ pa_inode_list) {
|
|
+		spin_lock(&tmp_pa->pa_lock);
+		if (tmp_pa->pa_deleted) {
+			spin_unlock(&tmp_pa->pa_lock);
+			continue;
+		}
+ if (!added && pa->pa_free < tmp_pa->pa_free) {
|
|
+ /* Add to the tail of the previous entry */
|
|
+ list_add_tail_rcu(&pa->pa_inode_list,
|
|
+ &tmp_pa->pa_inode_list);
|
|
+ added = 1;
|
|
+ /*
|
|
+ * we want to count the total
|
|
+ * number of entries in the list
|
|
+ */
|
|
+ }
|
|
+ spin_unlock(&tmp_pa->pa_lock);
|
|
+ lg_prealloc_count++;
|
|
+ }
|
|
+ if (!added)
|
|
+ list_add_tail_rcu(&pa->pa_inode_list,
|
|
+ &lg->lg_prealloc_list[order]);
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ /* Now trim the list to be not more than 8 elements */
|
|
+ if (lg_prealloc_count > 8) {
|
|
+ ext4_mb_discard_lg_preallocations(sb, lg,
|
|
+ order, lg_prealloc_count);
|
|
+ return;
|
|
+ }
|
|
+ return ;
|
|
+}
|
|
+
|
|
/*
|
|
* release all resource we used in allocation
|
|
*/
|
|
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
|
|
{
|
|
- if (ac->ac_pa) {
|
|
- if (ac->ac_pa->pa_linear) {
|
|
+ struct ext4_prealloc_space *pa = ac->ac_pa;
|
|
+ if (pa) {
|
|
+ if (pa->pa_linear) {
|
|
/* see comment in ext4_mb_use_group_pa() */
|
|
- spin_lock(&ac->ac_pa->pa_lock);
|
|
- ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
|
|
- ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
|
|
- ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
|
|
- ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
|
|
- spin_unlock(&ac->ac_pa->pa_lock);
|
|
+ spin_lock(&pa->pa_lock);
|
|
+ pa->pa_pstart += ac->ac_b_ex.fe_len;
|
|
+ pa->pa_lstart += ac->ac_b_ex.fe_len;
|
|
+ pa->pa_free -= ac->ac_b_ex.fe_len;
|
|
+ pa->pa_len -= ac->ac_b_ex.fe_len;
|
|
+ spin_unlock(&pa->pa_lock);
|
|
+ /*
|
|
+ * We want to add the pa to the right bucket.
|
|
+ * Remove it from the list and while adding
|
|
+ * make sure the list to which we are adding
|
|
+ * doesn't grow big.
|
|
+ */
|
|
+ if (likely(pa->pa_free)) {
|
|
+ spin_lock(pa->pa_obj_lock);
|
|
+ list_del_rcu(&pa->pa_inode_list);
|
|
+ spin_unlock(pa->pa_obj_lock);
|
|
+ ext4_mb_add_n_trim(ac);
|
|
+ }
|
|
}
|
|
- ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
|
|
+ ext4_mb_put_pa(ac, ac->ac_sb, pa);
|
|
}
|
|
if (ac->ac_bitmap_page)
|
|
page_cache_release(ac->ac_bitmap_page);
|
|
@@ -4011,10 +4380,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
|
|
sbi = EXT4_SB(sb);
|
|
|
|
if (!test_opt(sb, MBALLOC)) {
|
|
- block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
|
|
+ block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
|
|
&(ar->len), errp);
|
|
return block;
|
|
}
|
|
+ if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
|
|
+ /*
|
|
+ * With delalloc we already reserved the blocks
|
|
+ */
|
|
+ ar->len = ext4_has_free_blocks(sbi, ar->len);
|
|
+ }
|
|
+
|
|
+ if (ar->len == 0) {
|
|
+ *errp = -ENOSPC;
|
|
+ return 0;
|
|
+ }
|
|
|
|
while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
|
|
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
|
|
@@ -4026,10 +4406,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
|
|
}
|
|
inquota = ar->len;
|
|
|
|
+ if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
|
|
+ ar->flags |= EXT4_MB_DELALLOC_RESERVED;
|
|
+
|
|
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
|
|
if (!ac) {
|
|
+ ar->len = 0;
|
|
*errp = -ENOMEM;
|
|
- return 0;
|
|
+ goto out1;
|
|
}
|
|
|
|
ext4_mb_poll_new_transaction(sb, handle);
|
|
@@ -4037,12 +4421,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
|
|
*errp = ext4_mb_initialize_context(ac, ar);
|
|
if (*errp) {
|
|
ar->len = 0;
|
|
- goto out;
|
|
+ goto out2;
|
|
}
|
|
|
|
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
|
|
if (!ext4_mb_use_preallocated(ac)) {
|
|
-
|
|
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
|
|
ext4_mb_normalize_request(ac, ar);
|
|
repeat:
|
|
@@ -4085,11 +4468,12 @@ repeat:
|
|
|
|
ext4_mb_release_context(ac);
|
|
|
|
-out:
|
|
+out2:
|
|
+ kmem_cache_free(ext4_ac_cachep, ac);
|
|
+out1:
|
|
if (ar->len < inquota)
|
|
DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
|
|
|
|
- kmem_cache_free(ext4_ac_cachep, ac);
|
|
return block;
|
|
}
|
|
static void ext4_mb_poll_new_transaction(struct super_block *sb,
|
|
@@ -4242,12 +4626,16 @@ do_more:
|
|
overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
|
|
count -= overflow;
|
|
}
|
|
- bitmap_bh = read_block_bitmap(sb, block_group);
|
|
- if (!bitmap_bh)
|
|
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
|
|
+ if (!bitmap_bh) {
|
|
+ err = -EIO;
|
|
goto error_return;
|
|
+ }
|
|
gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
|
|
- if (!gdp)
|
|
+ if (!gdp) {
|
|
+ err = -EIO;
|
|
goto error_return;
|
|
+ }
|
|
|
|
if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
|
|
in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
|
|
@@ -4309,10 +4697,9 @@ do_more:
|
|
ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
|
|
} else {
|
|
ext4_lock_group(sb, block_group);
|
|
- err = mb_free_blocks(inode, &e4b, bit, count);
|
|
+ mb_free_blocks(inode, &e4b, bit, count);
|
|
ext4_mb_return_to_preallocation(inode, &e4b, block, count);
|
|
ext4_unlock_group(sb, block_group);
|
|
- BUG_ON(err != 0);
|
|
}
|
|
|
|
spin_lock(sb_bgl_lock(sbi, block_group));
|
|
@@ -4321,6 +4708,13 @@ do_more:
|
|
spin_unlock(sb_bgl_lock(sbi, block_group));
|
|
percpu_counter_add(&sbi->s_freeblocks_counter, count);
|
|
|
|
+ if (sbi->s_log_groups_per_flex) {
|
|
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
|
|
+ spin_lock(sb_bgl_lock(sbi, flex_group));
|
|
+ sbi->s_flex_groups[flex_group].free_blocks += count;
|
|
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
|
|
+ }
|
|
+
|
|
ext4_mb_release_desc(&e4b);
|
|
|
|
*freed += count;
|
|
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
|
|
index bfe6add..c7c9906 100644
|
|
--- a/fs/ext4/mballoc.h
|
|
+++ b/fs/ext4/mballoc.h
|
|
@@ -164,11 +164,17 @@ struct ext4_free_extent {
|
|
* Locality group:
|
|
* we try to group all related changes together
|
|
* so that writeback can flush/allocate them together as well
|
|
+ * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
+ * (512). We store prealloc space into the hash based on the pa_free blocks
+ * order value, i.e. fls(pa_free) - 1;
*/
|
|
+#define PREALLOC_TB_SIZE 10
|
|
struct ext4_locality_group {
|
|
/* for allocator */
|
|
- struct mutex lg_mutex; /* to serialize allocates */
|
|
- struct list_head lg_prealloc_list;/* list of preallocations */
|
|
+ /* to serialize allocates */
|
|
+ struct mutex lg_mutex;
|
|
+ /* list of preallocations */
|
|
+ struct list_head lg_prealloc_list[PREALLOC_TB_SIZE];
|
|
spinlock_t lg_prealloc_lock;
|
|
};
|
|
|
|
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
|
|
index b9e077b..46fc0b5 100644
|
|
--- a/fs/ext4/migrate.c
|
|
+++ b/fs/ext4/migrate.c
|
|
@@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
|
|
* credit. But below we try to not accumalate too much
|
|
* of them by restarting the journal.
|
|
*/
|
|
- needed = ext4_ext_calc_credits_for_insert(inode, path);
|
|
+ needed = ext4_ext_calc_credits_for_single_extent(inode,
|
|
+ lb->last_block - lb->first_block + 1, path);
|
|
|
|
/*
|
|
* Make sure the credit we accumalated is not really high
|
|
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
|
|
index ab16bea..387ad98 100644
|
|
--- a/fs/ext4/namei.c
|
|
+++ b/fs/ext4/namei.c
|
|
@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
|
|
struct inode *inode);
|
|
|
|
/*
|
|
+ * p is at least 6 bytes before the end of page
|
|
+ */
|
|
+static inline struct ext4_dir_entry_2 *
|
|
+ext4_next_entry(struct ext4_dir_entry_2 *p)
|
|
+{
|
|
+ return (struct ext4_dir_entry_2 *)((char *)p +
|
|
+ ext4_rec_len_from_disk(p->rec_len));
|
|
+}
|
|
+
|
|
+/*
|
|
* Future: use high four bits of block for coalesce-on-delete flags
|
|
* Mask them off for now.
|
|
*/
|
|
@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
|
|
{
|
|
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
|
|
EXT4_DIR_REC_LEN(2) - infosize;
|
|
- return 0? 20: entry_space / sizeof(struct dx_entry);
|
|
+ return entry_space / sizeof(struct dx_entry);
|
|
}
|
|
|
|
static inline unsigned dx_node_limit (struct inode *dir)
|
|
{
|
|
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
|
|
- return 0? 22: entry_space / sizeof(struct dx_entry);
|
|
+ return entry_space / sizeof(struct dx_entry);
|
|
}
|
|
|
|
/*
|
|
@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
|
|
|
|
|
|
/*
|
|
- * p is at least 6 bytes before the end of page
|
|
- */
|
|
-static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
|
|
-{
|
|
- return (struct ext4_dir_entry_2 *)((char *)p +
|
|
- ext4_rec_len_from_disk(p->rec_len));
|
|
-}
|
|
-
|
|
-/*
|
|
* This function fills a red-black tree with information from a
|
|
* directory block. It returns the number directory entries loaded
|
|
* into the tree. If there is an error it is returned in err.
|
|
@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
|
|
de = (struct ext4_dir_entry_2 *) bh->b_data;
|
|
top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
|
|
EXT4_DIR_REC_LEN(0));
|
|
- for (; de < top; de = ext4_next_entry(de))
|
|
- if (ext4_match (namelen, name, de)) {
|
|
- if (!ext4_check_dir_entry("ext4_find_entry",
|
|
- dir, de, bh,
|
|
- (block<<EXT4_BLOCK_SIZE_BITS(sb))
|
|
- +((char *)de - bh->b_data))) {
|
|
- brelse (bh);
|
|
+ for (; de < top; de = ext4_next_entry(de)) {
|
|
+ int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
|
|
+ + ((char *) de - bh->b_data);
|
|
+
|
|
+ if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
|
|
+ brelse(bh);
|
|
*err = ERR_BAD_DX_DIR;
|
|
goto errout;
|
|
}
|
|
- *res_dir = de;
|
|
- dx_release (frames);
|
|
- return bh;
|
|
+
|
|
+ if (ext4_match(namelen, name, de)) {
|
|
+ *res_dir = de;
|
|
+ dx_release(frames);
|
|
+ return bh;
|
|
+ }
|
|
}
|
|
brelse (bh);
|
|
/* Check to see if we should continue to search */
|
|
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
|
|
index 9ff7b1c..b3d3560 100644
|
|
--- a/fs/ext4/resize.c
|
|
+++ b/fs/ext4/resize.c
|
|
@@ -73,7 +73,7 @@ static int verify_group_input(struct super_block *sb,
|
|
"Inode bitmap not in group (block %llu)",
|
|
(unsigned long long)input->inode_bitmap);
|
|
else if (outside(input->inode_table, start, end) ||
|
|
- outside(itend - 1, start, end))
|
|
+ outside(itend - 1, start, end))
|
|
ext4_warning(sb, __func__,
|
|
"Inode table not in group (blocks %llu-%llu)",
|
|
(unsigned long long)input->inode_table, itend - 1);
|
|
@@ -104,7 +104,7 @@ static int verify_group_input(struct super_block *sb,
|
|
(unsigned long long)input->inode_bitmap,
|
|
start, metaend - 1);
|
|
else if (inside(input->inode_table, start, metaend) ||
|
|
- inside(itend - 1, start, metaend))
|
|
+ inside(itend - 1, start, metaend))
|
|
ext4_warning(sb, __func__,
|
|
"Inode table (%llu-%llu) overlaps"
|
|
"GDT table (%llu-%llu)",
|
|
@@ -158,9 +158,9 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
|
|
if (err) {
|
|
if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
|
|
return err;
|
|
- if ((err = ext4_journal_get_write_access(handle, bh)))
|
|
+ if ((err = ext4_journal_get_write_access(handle, bh)))
|
|
return err;
|
|
- }
|
|
+ }
|
|
|
|
return 0;
|
|
}
|
|
@@ -416,11 +416,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
|
|
"EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
|
|
gdb_num);
|
|
|
|
- /*
|
|
- * If we are not using the primary superblock/GDT copy don't resize,
|
|
- * because the user tools have no way of handling this. Probably a
|
|
- * bad time to do it anyways.
|
|
- */
|
|
+ /*
|
|
+ * If we are not using the primary superblock/GDT copy don't resize,
|
|
+ * because the user tools have no way of handling this. Probably a
|
|
+ * bad time to do it anyways.
|
|
+ */
|
|
if (EXT4_SB(sb)->s_sbh->b_blocknr !=
|
|
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
|
|
ext4_warning(sb, __func__,
|
|
@@ -507,14 +507,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
|
|
return 0;
|
|
|
|
exit_inode:
|
|
- //ext4_journal_release_buffer(handle, iloc.bh);
|
|
+ /* ext4_journal_release_buffer(handle, iloc.bh); */
|
|
brelse(iloc.bh);
|
|
exit_dindj:
|
|
- //ext4_journal_release_buffer(handle, dind);
|
|
+ /* ext4_journal_release_buffer(handle, dind); */
|
|
exit_primary:
|
|
- //ext4_journal_release_buffer(handle, *primary);
|
|
+ /* ext4_journal_release_buffer(handle, *primary); */
|
|
exit_sbh:
|
|
- //ext4_journal_release_buffer(handle, *primary);
|
|
+ /* ext4_journal_release_buffer(handle, *primary); */
|
|
exit_dind:
|
|
brelse(dind);
|
|
exit_bh:
|
|
@@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
|
|
|
|
if (reserved_gdb || gdb_off == 0) {
|
|
if (!EXT4_HAS_COMPAT_FEATURE(sb,
|
|
- EXT4_FEATURE_COMPAT_RESIZE_INODE)){
|
|
+ EXT4_FEATURE_COMPAT_RESIZE_INODE)
|
|
+ || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
|
|
ext4_warning(sb, __func__,
|
|
"No reserved GDT blocks, can't resize");
|
|
return -EPERM;
|
|
@@ -818,12 +819,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
|
|
if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
|
|
goto exit_journal;
|
|
|
|
- /*
|
|
- * We will only either add reserved group blocks to a backup group
|
|
- * or remove reserved blocks for the first group in a new group block.
|
|
- * Doing both would be mean more complex code, and sane people don't
|
|
- * use non-sparse filesystems anymore. This is already checked above.
|
|
- */
|
|
+ /*
|
|
+ * We will only either add reserved group blocks to a backup group
|
|
+ * or remove reserved blocks for the first group in a new group block.
|
|
+ * Doing both would be mean more complex code, and sane people don't
|
|
+ * use non-sparse filesystems anymore. This is already checked above.
|
|
+ */
|
|
if (gdb_off) {
|
|
primary = sbi->s_group_desc[gdb_num];
|
|
if ((err = ext4_journal_get_write_access(handle, primary)))
|
|
@@ -835,24 +836,24 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
|
|
} else if ((err = add_new_gdb(handle, inode, input, &primary)))
|
|
goto exit_journal;
|
|
|
|
- /*
|
|
- * OK, now we've set up the new group. Time to make it active.
|
|
- *
|
|
- * Current kernels don't lock all allocations via lock_super(),
|
|
- * so we have to be safe wrt. concurrent accesses the group
|
|
- * data. So we need to be careful to set all of the relevant
|
|
- * group descriptor data etc. *before* we enable the group.
|
|
- *
|
|
- * The key field here is sbi->s_groups_count: as long as
|
|
- * that retains its old value, nobody is going to access the new
|
|
- * group.
|
|
- *
|
|
- * So first we update all the descriptor metadata for the new
|
|
- * group; then we update the total disk blocks count; then we
|
|
- * update the groups count to enable the group; then finally we
|
|
- * update the free space counts so that the system can start
|
|
- * using the new disk blocks.
|
|
- */
|
|
+ /*
|
|
+ * OK, now we've set up the new group. Time to make it active.
|
|
+ *
|
|
+ * Current kernels don't lock all allocations via lock_super(),
|
|
+ * so we have to be safe wrt. concurrent accesses the group
|
|
+ * data. So we need to be careful to set all of the relevant
|
|
+ * group descriptor data etc. *before* we enable the group.
|
|
+ *
|
|
+ * The key field here is sbi->s_groups_count: as long as
|
|
+ * that retains its old value, nobody is going to access the new
|
|
+ * group.
|
|
+ *
|
|
+ * So first we update all the descriptor metadata for the new
|
|
+ * group; then we update the total disk blocks count; then we
|
|
+ * update the groups count to enable the group; then finally we
|
|
+ * update the free space counts so that the system can start
|
|
+ * using the new disk blocks.
|
|
+ */
|
|
|
|
/* Update group descriptor block for new group */
|
|
gdp = (struct ext4_group_desc *)((char *)primary->b_data +
|
|
@@ -866,6 +867,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
|
|
gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
|
|
|
|
/*
|
|
+ * We can allocate memory for mb_alloc based on the new group
|
|
+ * descriptor
|
|
+ */
|
|
+ if (test_opt(sb, MBALLOC)) {
|
|
+ err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
|
|
+ if (err)
|
|
+ goto exit_journal;
|
|
+ }
|
|
+ /*
|
|
* Make the new blocks and inodes valid next. We do this before
|
|
* increasing the group count so that once the group is enabled,
|
|
* all of its blocks and inodes are already valid.
|
|
@@ -937,7 +947,8 @@ exit_put:
|
|
return err;
|
|
} /* ext4_group_add */
|
|
|
|
-/* Extend the filesystem to the new number of blocks specified. This entry
|
|
+/*
|
|
+ * Extend the filesystem to the new number of blocks specified. This entry
|
|
* point is only used to extend the current filesystem to the end of the last
|
|
* existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
|
|
* for emergencies (because it has no dependencies on reserved blocks).
|
|
@@ -957,6 +968,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
|
|
handle_t *handle;
|
|
int err;
|
|
unsigned long freed_blocks;
|
|
+ ext4_group_t group;
|
|
+ struct ext4_group_info *grp;
|
|
|
|
/* We don't need to worry about locking wrt other resizers just
|
|
* yet: we're going to revalidate es->s_blocks_count after
|
|
@@ -988,7 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
|
|
}
|
|
|
|
/* Handle the remaining blocks in the last group only. */
|
|
- ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
|
|
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
|
|
|
|
if (last == 0) {
|
|
ext4_warning(sb, __func__,
|
|
@@ -1013,7 +1026,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
|
|
o_blocks_count + add, add);
|
|
|
|
/* See if the device is actually as big as what was requested */
|
|
- bh = sb_bread(sb, o_blocks_count + add -1);
|
|
+ bh = sb_bread(sb, o_blocks_count + add - 1);
|
|
if (!bh) {
|
|
ext4_warning(sb, __func__,
|
|
"can't read last block, resize aborted");
|
|
@@ -1060,6 +1073,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
|
|
o_blocks_count + add);
|
|
if ((err = ext4_journal_stop(handle)))
|
|
goto exit_put;
|
|
+
|
|
+ /*
|
|
+ * Mark mballoc pages as not up to date so that they will be updated
|
|
+ * next time they are loaded by ext4_mb_load_buddy.
|
|
+ */
|
|
+ if (test_opt(sb, MBALLOC)) {
|
|
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
|
|
+ struct inode *inode = sbi->s_buddy_cache;
|
|
+ int blocks_per_page;
|
|
+ int block;
|
|
+ int pnum;
|
|
+ struct page *page;
|
|
+
|
|
+ /* Set buddy page as not up to date */
|
|
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
|
|
+ block = group * 2;
|
|
+ pnum = block / blocks_per_page;
|
|
+ page = find_get_page(inode->i_mapping, pnum);
|
|
+ if (page != NULL) {
|
|
+ ClearPageUptodate(page);
|
|
+ page_cache_release(page);
|
|
+ }
|
|
+
|
|
+ /* Set bitmap page as not up to date */
|
|
+ block++;
|
|
+ pnum = block / blocks_per_page;
|
|
+ page = find_get_page(inode->i_mapping, pnum);
|
|
+ if (page != NULL) {
|
|
+ ClearPageUptodate(page);
|
|
+ page_cache_release(page);
|
|
+ }
|
|
+
|
|
+ /* Get the info on the last group */
|
|
+ grp = ext4_get_group_info(sb, group);
|
|
+
|
|
+ /* Update free blocks in group info */
|
|
+ ext4_mb_update_group_info(grp, add);
|
|
+ }
|
|
+
|
|
if (test_opt(sb, DEBUG))
|
|
printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
|
|
ext4_blocks_count(es));
|
|
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
|
|
index 02bf243..ed80f9f 100644
|
|
--- a/fs/ext4/super.c
|
|
+++ b/fs/ext4/super.c
|
|
@@ -49,20 +49,19 @@ static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
|
|
unsigned long journal_devnum);
|
|
static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
|
|
unsigned int);
|
|
-static void ext4_commit_super (struct super_block * sb,
|
|
- struct ext4_super_block * es,
|
|
- int sync);
|
|
-static void ext4_mark_recovery_complete(struct super_block * sb,
|
|
- struct ext4_super_block * es);
|
|
-static void ext4_clear_journal_err(struct super_block * sb,
|
|
- struct ext4_super_block * es);
|
|
+static void ext4_commit_super(struct super_block *sb,
|
|
+ struct ext4_super_block *es, int sync);
|
|
+static void ext4_mark_recovery_complete(struct super_block *sb,
|
|
+ struct ext4_super_block *es);
|
|
+static void ext4_clear_journal_err(struct super_block *sb,
|
|
+ struct ext4_super_block *es);
|
|
static int ext4_sync_fs(struct super_block *sb, int wait);
|
|
-static const char *ext4_decode_error(struct super_block * sb, int errno,
|
|
+static const char *ext4_decode_error(struct super_block *sb, int errno,
|
|
char nbuf[16]);
|
|
-static int ext4_remount (struct super_block * sb, int * flags, char * data);
|
|
-static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf);
|
|
+static int ext4_remount(struct super_block *sb, int *flags, char *data);
|
|
+static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
|
|
static void ext4_unlockfs(struct super_block *sb);
|
|
-static void ext4_write_super (struct super_block * sb);
|
|
+static void ext4_write_super(struct super_block *sb);
|
|
static void ext4_write_super_lockfs(struct super_block *sb);
|
|
|
|
|
|
@@ -211,15 +210,15 @@ static void ext4_handle_error(struct super_block *sb)
|
|
if (sb->s_flags & MS_RDONLY)
|
|
return;
|
|
|
|
- if (!test_opt (sb, ERRORS_CONT)) {
|
|
+ if (!test_opt(sb, ERRORS_CONT)) {
|
|
journal_t *journal = EXT4_SB(sb)->s_journal;
|
|
|
|
EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
|
|
if (journal)
|
|
jbd2_journal_abort(journal, -EIO);
|
|
}
|
|
- if (test_opt (sb, ERRORS_RO)) {
|
|
- printk (KERN_CRIT "Remounting filesystem read-only\n");
|
|
+ if (test_opt(sb, ERRORS_RO)) {
|
|
+ printk(KERN_CRIT "Remounting filesystem read-only\n");
|
|
sb->s_flags |= MS_RDONLY;
|
|
}
|
|
ext4_commit_super(sb, es, 1);
|
|
@@ -228,13 +227,13 @@ static void ext4_handle_error(struct super_block *sb)
|
|
sb->s_id);
|
|
}
|
|
|
|
-void ext4_error (struct super_block * sb, const char * function,
|
|
- const char * fmt, ...)
|
|
+void ext4_error(struct super_block *sb, const char *function,
|
|
+ const char *fmt, ...)
|
|
{
|
|
va_list args;
|
|
|
|
va_start(args, fmt);
|
|
- printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
|
|
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
|
|
vprintk(fmt, args);
|
|
printk("\n");
|
|
va_end(args);
|
|
@@ -242,7 +241,7 @@ void ext4_error (struct super_block * sb, const char * function,
|
|
ext4_handle_error(sb);
|
|
}
|
|
|
|
-static const char *ext4_decode_error(struct super_block * sb, int errno,
|
|
+static const char *ext4_decode_error(struct super_block *sb, int errno,
|
|
char nbuf[16])
|
|
{
|
|
char *errstr = NULL;
|
|
@@ -278,8 +277,7 @@ static const char *ext4_decode_error(struct super_block * sb, int errno,
|
|
/* __ext4_std_error decodes expected errors from journaling functions
|
|
* automatically and invokes the appropriate error response. */
|
|
|
|
-void __ext4_std_error (struct super_block * sb, const char * function,
|
|
- int errno)
|
|
+void __ext4_std_error(struct super_block *sb, const char *function, int errno)
|
|
{
|
|
char nbuf[16];
|
|
const char *errstr;
|
|
@@ -292,8 +290,8 @@ void __ext4_std_error (struct super_block * sb, const char * function,
|
|
return;
|
|
|
|
errstr = ext4_decode_error(sb, errno, nbuf);
|
|
- printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
|
|
- sb->s_id, function, errstr);
|
|
+ printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
|
|
+ sb->s_id, function, errstr);
|
|
|
|
ext4_handle_error(sb);
|
|
}
|
|
@@ -308,15 +306,15 @@ void __ext4_std_error (struct super_block * sb, const char * function,
|
|
* case we take the easy way out and panic immediately.
|
|
*/
|
|
|
|
-void ext4_abort (struct super_block * sb, const char * function,
|
|
- const char * fmt, ...)
|
|
+void ext4_abort(struct super_block *sb, const char *function,
|
|
+ const char *fmt, ...)
|
|
{
|
|
va_list args;
|
|
|
|
- printk (KERN_CRIT "ext4_abort called.\n");
|
|
+ printk(KERN_CRIT "ext4_abort called.\n");
|
|
|
|
va_start(args, fmt);
|
|
- printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
|
|
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
|
|
vprintk(fmt, args);
|
|
printk("\n");
|
|
va_end(args);
|
|
@@ -334,8 +332,8 @@ void ext4_abort (struct super_block * sb, const char * function,
|
|
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
|
|
}
|
|
|
|
-void ext4_warning (struct super_block * sb, const char * function,
|
|
- const char * fmt, ...)
|
|
+void ext4_warning(struct super_block *sb, const char *function,
|
|
+ const char *fmt, ...)
|
|
{
|
|
va_list args;
|
|
|
|
@@ -496,7 +494,7 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
|
|
}
|
|
}
|
|
|
|
-static void ext4_put_super (struct super_block * sb)
|
|
+static void ext4_put_super(struct super_block *sb)
|
|
{
|
|
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
|
struct ext4_super_block *es = sbi->s_es;
|
|
@@ -506,6 +504,7 @@ static void ext4_put_super (struct super_block * sb)
|
|
ext4_ext_release(sb);
|
|
ext4_xattr_put_super(sb);
|
|
jbd2_journal_destroy(sbi->s_journal);
|
|
+ sbi->s_journal = NULL;
|
|
if (!(sb->s_flags & MS_RDONLY)) {
|
|
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
|
|
es->s_state = cpu_to_le16(sbi->s_mount_state);
|
|
@@ -517,6 +516,7 @@ static void ext4_put_super (struct super_block * sb)
|
|
for (i = 0; i < sbi->s_gdb_count; i++)
|
|
brelse(sbi->s_group_desc[i]);
|
|
kfree(sbi->s_group_desc);
|
|
+ kfree(sbi->s_flex_groups);
|
|
percpu_counter_destroy(&sbi->s_freeblocks_counter);
|
|
percpu_counter_destroy(&sbi->s_freeinodes_counter);
|
|
percpu_counter_destroy(&sbi->s_dirs_counter);
|
|
@@ -568,9 +568,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
|
|
#endif
|
|
ei->i_block_alloc_info = NULL;
|
|
ei->vfs_inode.i_version = 1;
|
|
+ ei->vfs_inode.i_data.writeback_index = 0;
|
|
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
|
|
INIT_LIST_HEAD(&ei->i_prealloc_list);
|
|
spin_lock_init(&ei->i_prealloc_lock);
|
|
+ jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
|
|
+ ei->i_reserved_data_blocks = 0;
|
|
+ ei->i_reserved_meta_blocks = 0;
|
|
+ ei->i_allocated_meta_blocks = 0;
|
|
+ ei->i_delalloc_reserved_flag = 0;
|
|
+ spin_lock_init(&(ei->i_block_reservation_lock));
|
|
return &ei->vfs_inode;
|
|
}
|
|
|
|
@@ -635,9 +642,12 @@ static void ext4_clear_inode(struct inode *inode)
|
|
EXT4_I(inode)->i_block_alloc_info = NULL;
|
|
if (unlikely(rsv))
|
|
kfree(rsv);
|
|
+ jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
|
|
+ &EXT4_I(inode)->jinode);
|
|
}
|
|
|
|
-static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
|
|
+static inline void ext4_show_quota_options(struct seq_file *seq,
|
|
+ struct super_block *sb)
|
|
{
|
|
#if defined(CONFIG_QUOTA)
|
|
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
|
@@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
|
|
unsigned long def_mount_opts;
|
|
struct super_block *sb = vfs->mnt_sb;
|
|
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
|
- journal_t *journal = sbi->s_journal;
|
|
struct ext4_super_block *es = sbi->s_es;
|
|
|
|
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
|
|
@@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
|
|
seq_puts(seq, ",nomballoc");
|
|
if (test_opt(sb, I_VERSION))
|
|
seq_puts(seq, ",i_version");
|
|
+ if (!test_opt(sb, DELALLOC))
|
|
+ seq_puts(seq, ",nodelalloc");
|
|
+
|
|
|
|
if (sbi->s_stripe)
|
|
seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
|
|
@@ -810,8 +822,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
|
|
}
|
|
|
|
#ifdef CONFIG_QUOTA
|
|
-#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
|
|
-#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
|
|
+#define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group")
|
|
+#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
|
|
|
|
static int ext4_dquot_initialize(struct inode *inode, int type);
|
|
static int ext4_dquot_drop(struct inode *inode);
|
|
@@ -894,7 +906,7 @@ enum {
|
|
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
|
|
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
|
|
Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
|
|
- Opt_mballoc, Opt_nomballoc, Opt_stripe,
|
|
+ Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
|
|
};
|
|
|
|
static match_table_t tokens = {
|
|
@@ -953,6 +965,8 @@ static match_table_t tokens = {
|
|
{Opt_nomballoc, "nomballoc"},
|
|
{Opt_stripe, "stripe=%u"},
|
|
{Opt_resize, "resize"},
|
|
+ {Opt_delalloc, "delalloc"},
|
|
+ {Opt_nodelalloc, "nodelalloc"},
|
|
{Opt_err, NULL},
|
|
};
|
|
|
|
@@ -977,12 +991,12 @@ static ext4_fsblk_t get_sb_block(void **data)
|
|
return sb_block;
|
|
}
|
|
|
|
-static int parse_options (char *options, struct super_block *sb,
|
|
- unsigned int *inum, unsigned long *journal_devnum,
|
|
- ext4_fsblk_t *n_blocks_count, int is_remount)
|
|
+static int parse_options(char *options, struct super_block *sb,
|
|
+ unsigned int *inum, unsigned long *journal_devnum,
|
|
+ ext4_fsblk_t *n_blocks_count, int is_remount)
|
|
{
|
|
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
|
- char * p;
|
|
+ char *p;
|
|
substring_t args[MAX_OPT_ARGS];
|
|
int data_opt = 0;
|
|
int option;
|
|
@@ -990,11 +1004,12 @@ static int parse_options (char *options, struct super_block *sb,
|
|
int qtype, qfmt;
|
|
char *qname;
|
|
#endif
|
|
+ ext4_fsblk_t last_block;
|
|
|
|
if (!options)
|
|
return 1;
|
|
|
|
- while ((p = strsep (&options, ",")) != NULL) {
|
|
+ while ((p = strsep(&options, ",")) != NULL) {
|
|
int token;
|
|
if (!*p)
|
|
continue;
|
|
@@ -1002,16 +1017,16 @@ static int parse_options (char *options, struct super_block *sb,
|
|
token = match_token(p, tokens, args);
|
|
switch (token) {
|
|
case Opt_bsd_df:
|
|
- clear_opt (sbi->s_mount_opt, MINIX_DF);
|
|
+ clear_opt(sbi->s_mount_opt, MINIX_DF);
|
|
break;
|
|
case Opt_minix_df:
|
|
- set_opt (sbi->s_mount_opt, MINIX_DF);
|
|
+ set_opt(sbi->s_mount_opt, MINIX_DF);
|
|
break;
|
|
case Opt_grpid:
|
|
- set_opt (sbi->s_mount_opt, GRPID);
|
|
+ set_opt(sbi->s_mount_opt, GRPID);
|
|
break;
|
|
case Opt_nogrpid:
|
|
- clear_opt (sbi->s_mount_opt, GRPID);
|
|
+ clear_opt(sbi->s_mount_opt, GRPID);
|
|
break;
|
|
case Opt_resuid:
|
|
if (match_int(&args[0], &option))
|
|
@@ -1028,41 +1043,41 @@ static int parse_options (char *options, struct super_block *sb,
|
|
/* *sb_block = match_int(&args[0]); */
|
|
break;
|
|
case Opt_err_panic:
|
|
- clear_opt (sbi->s_mount_opt, ERRORS_CONT);
|
|
- clear_opt (sbi->s_mount_opt, ERRORS_RO);
|
|
- set_opt (sbi->s_mount_opt, ERRORS_PANIC);
|
|
+ clear_opt(sbi->s_mount_opt, ERRORS_CONT);
|
|
+ clear_opt(sbi->s_mount_opt, ERRORS_RO);
|
|
+ set_opt(sbi->s_mount_opt, ERRORS_PANIC);
|
|
break;
|
|
case Opt_err_ro:
|
|
- clear_opt (sbi->s_mount_opt, ERRORS_CONT);
|
|
- clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
|
|
- set_opt (sbi->s_mount_opt, ERRORS_RO);
|
|
+ clear_opt(sbi->s_mount_opt, ERRORS_CONT);
|
|
+ clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
|
|
+ set_opt(sbi->s_mount_opt, ERRORS_RO);
|
|
break;
|
|
case Opt_err_cont:
|
|
- clear_opt (sbi->s_mount_opt, ERRORS_RO);
|
|
- clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
|
|
- set_opt (sbi->s_mount_opt, ERRORS_CONT);
|
|
+ clear_opt(sbi->s_mount_opt, ERRORS_RO);
|
|
+ clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
|
|
+ set_opt(sbi->s_mount_opt, ERRORS_CONT);
|
|
break;
|
|
case Opt_nouid32:
|
|
- set_opt (sbi->s_mount_opt, NO_UID32);
|
|
+ set_opt(sbi->s_mount_opt, NO_UID32);
|
|
break;
|
|
case Opt_nocheck:
|
|
- clear_opt (sbi->s_mount_opt, CHECK);
|
|
+ clear_opt(sbi->s_mount_opt, CHECK);
|
|
break;
|
|
case Opt_debug:
|
|
- set_opt (sbi->s_mount_opt, DEBUG);
|
|
+ set_opt(sbi->s_mount_opt, DEBUG);
|
|
break;
|
|
case Opt_oldalloc:
|
|
- set_opt (sbi->s_mount_opt, OLDALLOC);
|
|
+ set_opt(sbi->s_mount_opt, OLDALLOC);
|
|
break;
|
|
case Opt_orlov:
|
|
- clear_opt (sbi->s_mount_opt, OLDALLOC);
|
|
+ clear_opt(sbi->s_mount_opt, OLDALLOC);
|
|
break;
|
|
#ifdef CONFIG_EXT4DEV_FS_XATTR
|
|
case Opt_user_xattr:
|
|
- set_opt (sbi->s_mount_opt, XATTR_USER);
|
|
+ set_opt(sbi->s_mount_opt, XATTR_USER);
|
|
break;
|
|
case Opt_nouser_xattr:
|
|
- clear_opt (sbi->s_mount_opt, XATTR_USER);
|
|
+ clear_opt(sbi->s_mount_opt, XATTR_USER);
|
|
break;
|
|
#else
|
|
case Opt_user_xattr:
|
|
@@ -1100,7 +1115,7 @@ static int parse_options (char *options, struct super_block *sb,
|
|
"journal on remount\n");
|
|
return 0;
|
|
}
|
|
- set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
|
|
+ set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
|
|
break;
|
|
case Opt_journal_inum:
|
|
if (is_remount) {
|
|
@@ -1130,7 +1145,7 @@ static int parse_options (char *options, struct super_block *sb,
|
|
set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
|
|
break;
|
|
case Opt_noload:
|
|
- set_opt (sbi->s_mount_opt, NOLOAD);
|
|
+ set_opt(sbi->s_mount_opt, NOLOAD);
|
|
break;
|
|
case Opt_commit:
|
|
if (match_int(&args[0], &option))
|
|
@@ -1309,15 +1324,39 @@ set_qf_format:
|
|
clear_opt(sbi->s_mount_opt, NOBH);
|
|
break;
|
|
case Opt_extents:
|
|
- set_opt (sbi->s_mount_opt, EXTENTS);
|
|
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
|
|
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
|
|
+ ext4_warning(sb, __func__,
|
|
+ "extents feature not enabled "
|
|
+ "on this filesystem, use tune2fs\n");
|
|
+ return 0;
|
|
+ }
|
|
+ set_opt(sbi->s_mount_opt, EXTENTS);
|
|
break;
|
|
case Opt_noextents:
|
|
- clear_opt (sbi->s_mount_opt, EXTENTS);
|
|
+ /*
|
|
+ * When e2fsprogs support resizing an already existing
|
|
+ * ext3 file system to greater than 2**32 we need to
|
|
+ * add support to block allocator to handle growing
|
|
+ * already existing block mapped inode so that blocks
|
|
+ * allocated for them fall within 2**32
|
|
+ */
|
|
+ last_block = ext4_blocks_count(sbi->s_es) - 1;
|
|
+ if (last_block > 0xffffffffULL) {
|
|
+ printk(KERN_ERR "EXT4-fs: Filesystem too "
|
|
+ "large to mount with "
|
|
+ "-o noextents options\n");
|
|
+ return 0;
|
|
+ }
|
|
+ clear_opt(sbi->s_mount_opt, EXTENTS);
|
|
break;
|
|
case Opt_i_version:
|
|
set_opt(sbi->s_mount_opt, I_VERSION);
|
|
sb->s_flags |= MS_I_VERSION;
|
|
break;
|
|
+ case Opt_nodelalloc:
|
|
+ clear_opt(sbi->s_mount_opt, DELALLOC);
|
|
+ break;
|
|
case Opt_mballoc:
|
|
set_opt(sbi->s_mount_opt, MBALLOC);
|
|
break;
|
|
@@ -1331,10 +1370,13 @@ set_qf_format:
|
|
return 0;
|
|
sbi->s_stripe = option;
|
|
break;
|
|
+ case Opt_delalloc:
|
|
+ set_opt(sbi->s_mount_opt, DELALLOC);
|
|
+ break;
|
|
default:
|
|
- printk (KERN_ERR
|
|
- "EXT4-fs: Unrecognized mount option \"%s\" "
|
|
- "or missing value\n", p);
|
|
+ printk(KERN_ERR
|
|
+ "EXT4-fs: Unrecognized mount option \"%s\" "
|
|
+ "or missing value\n", p);
|
|
return 0;
|
|
}
|
|
}
|
|
@@ -1381,31 +1423,31 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
|
|
int res = 0;
|
|
|
|
if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
|
|
- printk (KERN_ERR "EXT4-fs warning: revision level too high, "
|
|
- "forcing read-only mode\n");
|
|
+ printk(KERN_ERR "EXT4-fs warning: revision level too high, "
|
|
+ "forcing read-only mode\n");
|
|
res = MS_RDONLY;
|
|
}
|
|
if (read_only)
|
|
return res;
|
|
if (!(sbi->s_mount_state & EXT4_VALID_FS))
|
|
- printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
|
|
- "running e2fsck is recommended\n");
|
|
+ printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
|
|
+ "running e2fsck is recommended\n");
|
|
else if ((sbi->s_mount_state & EXT4_ERROR_FS))
|
|
- printk (KERN_WARNING
|
|
- "EXT4-fs warning: mounting fs with errors, "
- "running e2fsck is recommended\n");
+ printk(KERN_WARNING
+ "EXT4-fs warning: mounting fs with errors, "
+ "running e2fsck is recommended\n");
else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
le16_to_cpu(es->s_mnt_count) >=
(unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
- printk (KERN_WARNING
- "EXT4-fs warning: maximal mount count reached, "
- "running e2fsck is recommended\n");
+ printk(KERN_WARNING
+ "EXT4-fs warning: maximal mount count reached, "
+ "running e2fsck is recommended\n");
else if (le32_to_cpu(es->s_checkinterval) &&
(le32_to_cpu(es->s_lastcheck) +
le32_to_cpu(es->s_checkinterval) <= get_seconds()))
- printk (KERN_WARNING
- "EXT4-fs warning: checktime reached, "
- "running e2fsck is recommended\n");
+ printk(KERN_WARNING
+ "EXT4-fs warning: checktime reached, "
+ "running e2fsck is recommended\n");
#if 0
/* @@@ We _will_ want to clear the valid bit if we find
* inconsistencies, to force a fsck at reboot. But for
@@ -1443,6 +1485,53 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
return res;
}
+static int ext4_fill_flex_info(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = NULL;
+ struct buffer_head *bh;
+ ext4_group_t flex_group_count;
+ ext4_group_t flex_group;
+ int groups_per_flex = 0;
+ __u64 block_bitmap = 0;
+ int i;
+
+ if (!sbi->s_es->s_log_groups_per_flex) {
+ sbi->s_log_groups_per_flex = 0;
+ return 1;
+ }
+
+ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+
+ flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
+ groups_per_flex;
+ sbi->s_flex_groups = kzalloc(flex_group_count *
+ sizeof(struct flex_groups), GFP_KERNEL);
+ if (sbi->s_flex_groups == NULL) {
+ printk(KERN_ERR "EXT4-fs: not enough memory for "
+ "%lu flex groups\n", flex_group_count);
+ goto failed;
+ }
+
+ gdp = ext4_get_group_desc(sb, 1, &bh);
+ block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
+
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ gdp = ext4_get_group_desc(sb, i, &bh);
+
+ flex_group = ext4_flex_group(sbi, i);
+ sbi->s_flex_groups[flex_group].free_inodes +=
+ le16_to_cpu(gdp->bg_free_inodes_count);
+ sbi->s_flex_groups[flex_group].free_blocks +=
+ le16_to_cpu(gdp->bg_free_blocks_count);
+ }
+
+ return 1;
+failed:
+ return 0;
+}
+
__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
struct ext4_group_desc *gdp)
{
@@ -1507,16 +1596,14 @@ static int ext4_check_descriptors(struct super_block *sb)
(EXT4_BLOCKS_PER_GROUP(sb) - 1);
block_bitmap = ext4_block_bitmap(sb, gdp);
- if (block_bitmap < first_block || block_bitmap > last_block)
- {
+ if (block_bitmap < first_block || block_bitmap > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Block bitmap for group %lu not in group "
"(block %llu)!", i, block_bitmap);
return 0;
}
inode_bitmap = ext4_inode_bitmap(sb, gdp);
- if (inode_bitmap < first_block || inode_bitmap > last_block)
- {
+ if (inode_bitmap < first_block || inode_bitmap > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Inode bitmap for group %lu not in group "
"(block %llu)!", i, inode_bitmap);
@@ -1524,26 +1611,28 @@ static int ext4_check_descriptors(struct super_block *sb)
}
inode_table = ext4_inode_table(sb, gdp);
if (inode_table < first_block ||
- inode_table + sbi->s_itb_per_group - 1 > last_block)
- {
+ inode_table + sbi->s_itb_per_group - 1 > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Inode table for group %lu not in group "
"(block %llu)!", i, inode_table);
return 0;
}
+ spin_lock(sb_bgl_lock(sbi, i));
if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Checksum for group %lu failed (%u!=%u)\n",
i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
gdp)), le16_to_cpu(gdp->bg_checksum));
- return 0;
+ if (!(sb->s_flags & MS_RDONLY))
+ return 0;
}
+ spin_unlock(sb_bgl_lock(sbi, i));
if (!flexbg_flag)
first_block += EXT4_BLOCKS_PER_GROUP(sb);
}
ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
- sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb));
+ sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
return 1;
}
@@ -1564,8 +1653,8 @@ static int ext4_check_descriptors(struct super_block *sb)
* e2fsck was run on this filesystem, and it must have already done the orphan
* inode cleanup for us, so we can safely abort without any further action.
*/
-static void ext4_orphan_cleanup (struct super_block * sb,
- struct ext4_super_block * es)
+static void ext4_orphan_cleanup(struct super_block *sb,
+ struct ext4_super_block *es)
{
unsigned int s_flags = sb->s_flags;
int nr_orphans = 0, nr_truncates = 0;
@@ -1642,7 +1731,7 @@ static void ext4_orphan_cleanup (struct super_block * sb,
iput(inode); /* The delete magic happens here! */
}
-#define PLURAL(x) (x), ((x)==1) ? "" : "s"
+#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
if (nr_orphans)
printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
@@ -1809,12 +1898,12 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
return 0;
}
-static int ext4_fill_super (struct super_block *sb, void *data, int silent)
|
|
- __releases(kernel_sem)
|
|
- __acquires(kernel_sem)
|
|
+static int ext4_fill_super(struct super_block *sb, void *data, int silent)
|
|
+ __releases(kernel_lock)
|
|
+ __acquires(kernel_lock)
|
|
|
|
{
|
|
- struct buffer_head * bh;
|
|
+ struct buffer_head *bh;
|
|
struct ext4_super_block *es = NULL;
|
|
struct ext4_sb_info *sbi;
|
|
ext4_fsblk_t block;
|
|
@@ -1851,11 +1940,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
|
|
goto out_fail;
|
|
}
|
|
|
|
- if (!sb_set_blocksize(sb, blocksize)) {
|
|
- printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
|
|
- goto out_fail;
|
|
- }
|
|
-
|
|
/*
|
|
* The ext4 superblock will not be buffer aligned for other than 1kB
|
|
* block sizes. We need to calculate the offset from buffer start.
|
|
@@ -1868,7 +1952,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
|
|
}
|
|
|
|
if (!(bh = sb_bread(sb, logical_sb_block))) {
|
|
- printk (KERN_ERR "EXT4-fs: unable to read superblock\n");
|
|
+ printk(KERN_ERR "EXT4-fs: unable to read superblock\n");
|
|
goto out_fail;
|
|
}
|
|
/*
|
|
@@ -1919,17 +2003,30 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
|
|
|
|
/*
|
|
* turn on extents feature by default in ext4 filesystem
|
|
- * User -o noextents to turn it off
|
|
+ * only if feature flag already set by mkfs or tune2fs.
|
|
+ * Use -o noextents to turn it off
|
|
*/
|
|
- set_opt(sbi->s_mount_opt, EXTENTS);
|
|
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
|
|
+ set_opt(sbi->s_mount_opt, EXTENTS);
|
|
+ else
|
|
+ ext4_warning(sb, __func__,
|
|
+ "extents feature not enabled on this filesystem, "
|
|
+ "use tune2fs.\n");
|
|
/*
|
|
- * turn on mballoc feature by default in ext4 filesystem
|
|
- * User -o nomballoc to turn it off
|
|
+ * turn on mballoc code by default in ext4 filesystem
|
|
+ * Use -o nomballoc to turn it off
|
|
*/
|
|
set_opt(sbi->s_mount_opt, MBALLOC);
|
|
|
|
- if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
|
|
- NULL, 0))
|
|
+ /*
|
|
+ * enable delayed allocation by default
|
|
+ * Use -o nodelalloc to turn it off
|
|
+ */
|
|
+ set_opt(sbi->s_mount_opt, DELALLOC);
|
|
+
|
|
+
|
|
+ if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum,
|
|
+ NULL, 0))
|
|
goto failed_mount;
|
|
|
|
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
|
|
@@ -2004,7 +2101,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
|
|
goto failed_mount;
|
|
}
|
|
|
|
- brelse (bh);
|
|
+ brelse(bh);
|
|
logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
|
|
offset = do_div(logical_sb_block, blocksize);
|
|
bh = sb_bread(sb, logical_sb_block);
|
|
@@ -2016,8 +2113,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
|
|
es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
|
|
sbi->s_es = es;
|
|
if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
|
|
- printk (KERN_ERR
|
|
- "EXT4-fs: Magic mismatch, very weird !\n");
|
|
+ printk(KERN_ERR
|
|
+ "EXT4-fs: Magic mismatch, very weird !\n");
|
|
goto failed_mount;
|
|
}
|
|
}
|
|
@@ -2034,9 +2131,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
|
|
if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
|
|
(!is_power_of_2(sbi->s_inode_size)) ||
|
|
(sbi->s_inode_size > blocksize)) {
|
|
- printk (KERN_ERR
|
|
- "EXT4-fs: unsupported inode size: %d\n",
|
|
- sbi->s_inode_size);
|
|
+ printk(KERN_ERR
|
|
+ "EXT4-fs: unsupported inode size: %d\n",
|
|
+ sbi->s_inode_size);
|
|
goto failed_mount;
|
|
}
|
|
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
|
|
@@ -2068,20 +2165,20 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
|
|
sbi->s_mount_state = le16_to_cpu(es->s_state);
|
|
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
|
|
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
|
|
- for (i=0; i < 4; i++)
|
|
+ for (i = 0; i < 4; i++)
|
|
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
|
|
sbi->s_def_hash_version = es->s_def_hash_version;
|
|
|
|
if (sbi->s_blocks_per_group > blocksize * 8) {
|
|
- printk (KERN_ERR
|
|
- "EXT4-fs: #blocks per group too big: %lu\n",
|
|
- sbi->s_blocks_per_group);
|
|
+ printk(KERN_ERR
|
|
+ "EXT4-fs: #blocks per group too big: %lu\n",
|
|
+ sbi->s_blocks_per_group);
|
|
goto failed_mount;
|
|
}
|
|
if (sbi->s_inodes_per_group > blocksize * 8) {
|
|
- printk (KERN_ERR
|
|
- "EXT4-fs: #inodes per group too big: %lu\n",
|
|
- sbi->s_inodes_per_group);
|
|
+ printk(KERN_ERR
|
|
+ "EXT4-fs: #inodes per group too big: %lu\n",
|
|
+ sbi->s_inodes_per_group);
|
|
goto failed_mount;
|
|
}
|
|
|
|
@@ -2115,10 +2212,10 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
|
|
sbi->s_groups_count = blocks_count;
|
|
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
|
|
EXT4_DESC_PER_BLOCK(sb);
|
|
- sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
|
|
+ sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
|
|
GFP_KERNEL);
|
|
if (sbi->s_group_desc == NULL) {
|
|
- printk (KERN_ERR "EXT4-fs: not enough memory\n");
|
|
+ printk(KERN_ERR "EXT4-fs: not enough memory\n");
|
|
goto failed_mount;
|
|
}
|
|
|
|
@@ -2128,16 +2225,24 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
|
|
block = descriptor_loc(sb, logical_sb_block, i);
|
|
sbi->s_group_desc[i] = sb_bread(sb, block);
|
|
if (!sbi->s_group_desc[i]) {
|
|
- printk (KERN_ERR "EXT4-fs: "
|
|
- "can't read group descriptor %d\n", i);
|
|
+ printk(KERN_ERR "EXT4-fs: "
|
|
+ "can't read group descriptor %d\n", i);
|
|
db_count = i;
|
|
goto failed_mount2;
|
|
}
|
|
}
|
|
- if (!ext4_check_descriptors (sb)) {
|
|
+ if (!ext4_check_descriptors(sb)) {
|
|
printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
|
|
goto failed_mount2;
|
|
}
|
|
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
|
|
+ if (!ext4_fill_flex_info(sb)) {
|
|
+ printk(KERN_ERR
|
|
+ "EXT4-fs: unable to initialize "
|
|
+ "flex_bg meta info!\n");
|
|
+ goto failed_mount2;
|
|
+ }
|
|
+
|
|
sbi->s_gdb_count = db_count;
|
|
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
|
|
spin_lock_init(&sbi->s_next_gen_lock);
|
|
@@ -2202,11 +2307,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
|
|
EXT4_SB(sb)->s_journal->j_failed_commit) {
|
|
printk(KERN_CRIT "EXT4-fs error (device %s): "
|
|
"ext4_fill_super: Journal transaction "
|
|
- "%u is corrupt\n", sb->s_id,
|
|
+ "%u is corrupt\n", sb->s_id,
|
|
EXT4_SB(sb)->s_journal->j_failed_commit);
|
|
- if (test_opt (sb, ERRORS_RO)) {
|
|
- printk (KERN_CRIT
|
|
- "Mounting filesystem read-only\n");
|
|
+ if (test_opt(sb, ERRORS_RO)) {
|
|
+ printk(KERN_CRIT
|
|
+ "Mounting filesystem read-only\n");
|
|
sb->s_flags |= MS_RDONLY;
|
|
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
|
|
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
|
|
@@ -2226,9 +2331,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
|
|
goto failed_mount3;
|
|
} else {
|
|
if (!silent)
|
|
- printk (KERN_ERR
|
|
- "ext4: No journal on filesystem on %s\n",
|
|
- sb->s_id);
|
|
+ printk(KERN_ERR
|
|
+ "ext4: No journal on filesystem on %s\n",
|
|
+ sb->s_id);
|
|
goto failed_mount3;
|
|
}
|
|
|
|
@@ -2312,7 +2417,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
|
|
goto failed_mount4;
|
|
}
|
|
|
|
- ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
|
|
+ ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY);
|
|
|
|
/* determine the minimum size of new large inodes, if present */
|
|
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
|
|
@@ -2351,12 +2456,19 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
|
|
ext4_orphan_cleanup(sb, es);
|
|
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
|
|
if (needs_recovery)
|
|
- printk (KERN_INFO "EXT4-fs: recovery complete.\n");
|
|
+ printk(KERN_INFO "EXT4-fs: recovery complete.\n");
|
|
ext4_mark_recovery_complete(sb, es);
|
|
- printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
|
|
- test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
|
|
- test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
|
|
- "writeback");
|
|
+ printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
|
|
+ test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
|
|
+ test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
|
|
+ "writeback");
|
|
+
|
|
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
|
|
+ printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
|
|
+ "requested data journaling mode\n");
|
|
+ clear_opt(sbi->s_mount_opt, DELALLOC);
|
|
+ } else if (test_opt(sb, DELALLOC))
|
|
+ printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
|
|
|
|
ext4_ext_init(sb);
|
|
ext4_mb_init(sb, needs_recovery);
|
|
@@ -2372,6 +2484,7 @@ cantfind_ext4:
|
|
|
|
failed_mount4:
|
|
jbd2_journal_destroy(sbi->s_journal);
|
|
+ sbi->s_journal = NULL;
|
|
failed_mount3:
|
|
percpu_counter_destroy(&sbi->s_freeblocks_counter);
|
|
percpu_counter_destroy(&sbi->s_freeinodes_counter);
|
|
@@ -2461,14 +2574,14 @@ static journal_t *ext4_get_journal(struct super_block *sb,
|
|
static journal_t *ext4_get_dev_journal(struct super_block *sb,
|
|
dev_t j_dev)
|
|
{
|
|
- struct buffer_head * bh;
|
|
+ struct buffer_head *bh;
|
|
journal_t *journal;
|
|
ext4_fsblk_t start;
|
|
ext4_fsblk_t len;
|
|
int hblock, blocksize;
|
|
ext4_fsblk_t sb_block;
|
|
unsigned long offset;
|
|
- struct ext4_super_block * es;
|
|
+ struct ext4_super_block *es;
|
|
struct block_device *bdev;
|
|
|
|
bdev = ext4_blkdev_get(j_dev);
|
|
@@ -2583,8 +2696,8 @@ static int ext4_load_journal(struct super_block *sb,
|
|
"unavailable, cannot proceed.\n");
|
|
return -EROFS;
|
|
}
|
|
- printk (KERN_INFO "EXT4-fs: write access will "
|
|
- "be enabled during recovery.\n");
|
|
+ printk(KERN_INFO "EXT4-fs: write access will "
|
|
+ "be enabled during recovery.\n");
|
|
}
|
|
}
|
|
|
|
@@ -2637,8 +2750,8 @@ static int ext4_load_journal(struct super_block *sb,
|
|
return 0;
|
|
}
|
|
|
|
-static int ext4_create_journal(struct super_block * sb,
|
|
- struct ext4_super_block * es,
|
|
+static int ext4_create_journal(struct super_block *sb,
|
|
+ struct ext4_super_block *es,
|
|
unsigned int journal_inum)
|
|
{
|
|
journal_t *journal;
|
|
@@ -2679,9 +2792,8 @@ static int ext4_create_journal(struct super_block * sb,
|
|
return 0;
|
|
}
|
|
|
|
-static void ext4_commit_super (struct super_block * sb,
|
|
- struct ext4_super_block * es,
|
|
- int sync)
|
|
+static void ext4_commit_super(struct super_block *sb,
|
|
+ struct ext4_super_block *es, int sync)
|
|
{
|
|
struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
|
|
|
|
@@ -2702,8 +2814,8 @@ static void ext4_commit_super (struct super_block * sb,
|
|
* remounting) the filesystem readonly, then we will end up with a
|
|
* consistent fs on disk. Record that fact.
|
|
*/
|
|
-static void ext4_mark_recovery_complete(struct super_block * sb,
|
|
- struct ext4_super_block * es)
|
|
+static void ext4_mark_recovery_complete(struct super_block *sb,
|
|
+ struct ext4_super_block *es)
|
|
{
|
|
journal_t *journal = EXT4_SB(sb)->s_journal;
|
|
|
|
@@ -2725,8 +2837,8 @@ static void ext4_mark_recovery_complete(struct super_block * sb,
|
|
* has recorded an error from a previous lifetime, move that error to the
|
|
* main filesystem now.
|
|
*/
|
|
-static void ext4_clear_journal_err(struct super_block * sb,
|
|
- struct ext4_super_block * es)
|
|
+static void ext4_clear_journal_err(struct super_block *sb,
|
|
+ struct ext4_super_block *es)
|
|
{
|
|
journal_t *journal;
|
|
int j_errno;
|
|
@@ -2751,7 +2863,7 @@ static void ext4_clear_journal_err(struct super_block * sb,
|
|
|
|
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
|
|
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
|
|
- ext4_commit_super (sb, es, 1);
|
|
+ ext4_commit_super(sb, es, 1);
|
|
|
|
jbd2_journal_clear_err(journal);
|
|
}
|
|
@@ -2784,7 +2896,7 @@ int ext4_force_commit(struct super_block *sb)
|
|
* This implicitly triggers the writebehind on sync().
|
|
*/
|
|
|
|
-static void ext4_write_super (struct super_block * sb)
|
|
+static void ext4_write_super(struct super_block *sb)
|
|
{
|
|
if (mutex_trylock(&sb->s_lock) != 0)
|
|
BUG();
|
|
@@ -2840,13 +2952,14 @@ static void ext4_unlockfs(struct super_block *sb)
|
|
}
|
|
}
|
|
|
|
-static int ext4_remount (struct super_block * sb, int * flags, char * data)
|
|
+static int ext4_remount(struct super_block *sb, int *flags, char *data)
|
|
{
|
|
- struct ext4_super_block * es;
|
|
+ struct ext4_super_block *es;
|
|
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
|
ext4_fsblk_t n_blocks_count = 0;
|
|
unsigned long old_sb_flags;
|
|
struct ext4_mount_options old_opts;
|
|
+ ext4_group_t g;
|
|
int err;
|
|
#ifdef CONFIG_QUOTA
|
|
int i;
|
|
@@ -2925,6 +3038,26 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
|
|
}
|
|
|
|
/*
|
|
+ * Make sure the group descriptor checksums
|
|
+ * are sane. If they aren't, refuse to
|
|
+ * remount r/w.
|
|
+ */
|
|
+ for (g = 0; g < sbi->s_groups_count; g++) {
|
|
+ struct ext4_group_desc *gdp =
|
|
+ ext4_get_group_desc(sb, g, NULL);
|
|
+
|
|
+ if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
|
|
+ printk(KERN_ERR
|
|
+ "EXT4-fs: ext4_remount: "
|
|
+ "Checksum for group %lu failed (%u!=%u)\n",
|
|
+ g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
|
|
+ le16_to_cpu(gdp->bg_checksum));
|
|
+ err = -EINVAL;
|
|
+ goto restore_opts;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /*
|
|
* If we have an unprocessed orphan list hanging
|
|
* around from a previously readonly bdev mount,
|
|
* require a full umount/remount for now.
|
|
@@ -2949,7 +3082,7 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
|
|
sbi->s_mount_state = le16_to_cpu(es->s_state);
|
|
if ((err = ext4_group_extend(sb, es, n_blocks_count)))
|
|
goto restore_opts;
|
|
- if (!ext4_setup_super (sb, es, 0))
|
|
+ if (!ext4_setup_super(sb, es, 0))
|
|
sb->s_flags &= ~MS_RDONLY;
|
|
}
|
|
}
|
|
@@ -2979,7 +3112,7 @@ restore_opts:
|
|
return err;
|
|
}
|
|
|
|
-static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
|
|
+static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
|
|
{
|
|
struct super_block *sb = dentry->d_sb;
|
|
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
|
@@ -3217,12 +3350,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
|
|
}
|
|
/* Journaling quota? */
|
|
if (EXT4_SB(sb)->s_qf_names[type]) {
|
|
- /* Quotafile not of fs root? */
|
|
+ /* Quotafile not in fs root? */
|
|
if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
|
|
printk(KERN_WARNING
|
|
"EXT4-fs: Quota file not on filesystem root. "
|
|
"Journaled quota will not work.\n");
|
|
- }
|
|
+ }
|
|
|
|
/*
|
|
* When we journal data on quota file, we have to flush journal to see
|
|
@@ -3325,7 +3458,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
|
|
err = ext4_journal_dirty_metadata(handle, bh);
|
|
else {
|
|
/* Always do at least ordered writes for quotas */
|
|
- err = ext4_journal_dirty_data(handle, bh);
|
|
+ err = ext4_jbd2_file_inode(handle, inode);
|
|
mark_buffer_dirty(bh);
|
|
}
|
|
brelse(bh);
|
|
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
|
|
index ff08633..8954208 100644
|
|
--- a/fs/ext4/xattr.c
|
|
+++ b/fs/ext4/xattr.c
|
|
@@ -810,7 +810,7 @@ inserted:
|
|
/* We need to allocate a new block */
|
|
ext4_fsblk_t goal = ext4_group_first_block_no(sb,
|
|
EXT4_I(inode)->i_block_group);
|
|
- ext4_fsblk_t block = ext4_new_block(handle, inode,
|
|
+ ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
|
|
goal, &error);
|
|
if (error)
|
|
goto cleanup;
|
|
@@ -1512,7 +1512,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
|
|
char *name = entry->e_name;
|
|
int n;
|
|
|
|
- for (n=0; n < entry->e_name_len; n++) {
|
|
+ for (n = 0; n < entry->e_name_len; n++) {
|
|
hash = (hash << NAME_HASH_SHIFT) ^
|
|
(hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
|
|
*name++;
|
|
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
|
|
index fff3338..ac1a52c 100644
|
|
--- a/fs/ext4/xattr_trusted.c
|
|
+++ b/fs/ext4/xattr_trusted.c
|
|
@@ -13,13 +13,11 @@
|
|
#include "ext4.h"
|
|
#include "xattr.h"
|
|
|
|
-#define XATTR_TRUSTED_PREFIX "trusted."
|
|
-
|
|
static size_t
|
|
ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
|
|
const char *name, size_t name_len)
|
|
{
|
|
- const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
|
|
+ const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
|
|
const size_t total_len = prefix_len + name_len + 1;
|
|
|
|
if (!capable(CAP_SYS_ADMIN))
|
|
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
|
|
index 67be723..d91aa61 100644
|
|
--- a/fs/ext4/xattr_user.c
|
|
+++ b/fs/ext4/xattr_user.c
|
|
@@ -12,13 +12,11 @@
|
|
#include "ext4.h"
|
|
#include "xattr.h"
|
|
|
|
-#define XATTR_USER_PREFIX "user."
|
|
-
|
|
static size_t
|
|
ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
|
|
const char *name, size_t name_len)
|
|
{
|
|
- const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
|
|
+ const size_t prefix_len = XATTR_USER_PREFIX_LEN;
|
|
const size_t total_len = prefix_len + name_len + 1;
|
|
|
|
if (!test_opt(inode->i_sb, XATTR_USER))
|
|
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
|
|
index 6914598..91389c8 100644
|
|
--- a/fs/jbd2/checkpoint.c
|
|
+++ b/fs/jbd2/checkpoint.c
|
|
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
|
|
|
|
J_ASSERT(transaction->t_state == T_FINISHED);
|
|
J_ASSERT(transaction->t_buffers == NULL);
|
|
- J_ASSERT(transaction->t_sync_datalist == NULL);
|
|
J_ASSERT(transaction->t_forget == NULL);
|
|
J_ASSERT(transaction->t_iobuf_list == NULL);
|
|
J_ASSERT(transaction->t_shadow_list == NULL);
|
|
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
|
|
index a2ed72f..adf0395 100644
|
|
--- a/fs/jbd2/commit.c
|
|
+++ b/fs/jbd2/commit.c
|
|
@@ -22,6 +22,8 @@
|
|
#include <linux/pagemap.h>
|
|
#include <linux/jiffies.h>
|
|
#include <linux/crc32.h>
|
|
+#include <linux/writeback.h>
|
|
+#include <linux/backing-dev.h>
|
|
|
|
/*
|
|
* Default IO end handler for temporary BJ_IO buffer_heads.
|
|
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
|
|
}
|
|
|
|
/*
|
|
- * When an ext3-ordered file is truncated, it is possible that many pages are
|
|
- * not sucessfully freed, because they are attached to a committing transaction.
|
|
+ * When an ext4 file is truncated, it is possible that some pages are not
|
|
+ * successfully freed, because they are attached to a committing transaction.
|
|
* After the transaction commits, these pages are left on the LRU, with no
|
|
* ->mapping, and with attached buffers. These pages are trivially reclaimable
|
|
* by the VM, but their apparent absence upsets the VM accounting, and it makes
|
|
@@ -80,21 +82,6 @@ nope:
|
|
}
|
|
|
|
/*
|
|
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
|
|
- * held. For ranking reasons we must trylock. If we lose, schedule away and
|
|
- * return 0. j_list_lock is dropped in this case.
|
|
- */
|
|
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
|
|
-{
|
|
- if (!jbd_trylock_bh_state(bh)) {
|
|
- spin_unlock(&journal->j_list_lock);
|
|
- schedule();
|
|
- return 0;
|
|
- }
|
|
- return 1;
|
|
-}
|
|
-
|
|
-/*
|
|
* Done it all: now submit the commit record. We should have
|
|
* cleaned up our previous buffers by now, so if we are in abort
|
|
* mode we can now just skip the rest of the journal write
|
|
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
|
|
struct buffer_head *bh;
|
|
int ret;
|
|
int barrier_done = 0;
|
|
+ struct timespec now = current_kernel_time();
|
|
|
|
if (is_journal_aborted(journal))
|
|
return 0;
|
|
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
|
|
tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
|
|
tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
|
|
tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
|
|
+ tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
|
|
+ tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
|
|
|
|
if (JBD2_HAS_COMPAT_FEATURE(journal,
|
|
JBD2_FEATURE_COMPAT_CHECKSUM)) {
|
|
@@ -197,159 +187,114 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
|
|
}
|
|
|
|
/*
|
|
- * Wait for all submitted IO to complete.
|
|
+ * write the filemap data using writepage() address_space_operations.
|
|
+ * We don't do block allocation here even for delalloc. We don't
|
|
+ * use writepages() because with delayed allocation we may be doing
|
|
+ * block allocation in writepages().
|
|
*/
|
|
-static int journal_wait_on_locked_list(journal_t *journal,
|
|
- transaction_t *commit_transaction)
|
|
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
|
|
{
|
|
- int ret = 0;
|
|
- struct journal_head *jh;
|
|
-
|
|
- while (commit_transaction->t_locked_list) {
|
|
- struct buffer_head *bh;
|
|
-
|
|
- jh = commit_transaction->t_locked_list->b_tprev;
|
|
- bh = jh2bh(jh);
|
|
- get_bh(bh);
|
|
- if (buffer_locked(bh)) {
|
|
- spin_unlock(&journal->j_list_lock);
|
|
- wait_on_buffer(bh);
|
|
- if (unlikely(!buffer_uptodate(bh)))
|
|
- ret = -EIO;
|
|
- spin_lock(&journal->j_list_lock);
|
|
- }
|
|
- if (!inverted_lock(journal, bh)) {
|
|
- put_bh(bh);
|
|
- spin_lock(&journal->j_list_lock);
|
|
- continue;
|
|
- }
|
|
- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
|
|
- __jbd2_journal_unfile_buffer(jh);
|
|
- jbd_unlock_bh_state(bh);
|
|
- jbd2_journal_remove_journal_head(bh);
|
|
- put_bh(bh);
|
|
- } else {
|
|
- jbd_unlock_bh_state(bh);
|
|
- }
|
|
- put_bh(bh);
|
|
- cond_resched_lock(&journal->j_list_lock);
|
|
- }
|
|
+ int ret;
|
|
+ struct writeback_control wbc = {
|
|
+ .sync_mode = WB_SYNC_ALL,
|
|
+ .nr_to_write = mapping->nrpages * 2,
|
|
+ .range_start = 0,
|
|
+ .range_end = i_size_read(mapping->host),
|
|
+ .for_writepages = 1,
|
|
+ };
|
|
+
|
|
+ ret = generic_writepages(mapping, &wbc);
|
|
return ret;
|
|
- }
|
|
+}
|
|
|
|
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
|
|
+/*
|
|
+ * Submit all the data buffers of inode associated with the transaction to
|
|
+ * disk.
|
|
+ *
|
|
+ * We are in a committing transaction. Therefore no new inode can be added to
|
|
+ * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
|
|
+ * operate on from being released while we write out pages.
|
|
+ */
|
|
+static int journal_submit_data_buffers(journal_t *journal,
|
|
+ transaction_t *commit_transaction)
|
|
{
|
|
- int i;
|
|
+ struct jbd2_inode *jinode;
|
|
+ int err, ret = 0;
|
|
+ struct address_space *mapping;
|
|
|
|
- for (i = 0; i < bufs; i++) {
|
|
- wbuf[i]->b_end_io = end_buffer_write_sync;
|
|
- /* We use-up our safety reference in submit_bh() */
|
|
- submit_bh(WRITE, wbuf[i]);
|
|
+ spin_lock(&journal->j_list_lock);
|
|
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
|
|
+ mapping = jinode->i_vfs_inode->i_mapping;
|
|
+ jinode->i_flags |= JI_COMMIT_RUNNING;
|
|
+ spin_unlock(&journal->j_list_lock);
|
|
+ /*
|
|
+ * submit the inode data buffers. We use writepage
|
|
+ * instead of writepages. Because writepages can do
|
|
+ * block allocation with delalloc. We need to write
|
|
+ * only allocated blocks here.
|
|
+ */
|
|
+ err = journal_submit_inode_data_buffers(mapping);
|
|
+ if (!ret)
|
|
+ ret = err;
|
|
+ spin_lock(&journal->j_list_lock);
|
|
+ J_ASSERT(jinode->i_transaction == commit_transaction);
|
|
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
|
|
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
|
|
}
|
|
+ spin_unlock(&journal->j_list_lock);
|
|
+ return ret;
|
|
}
|
|
|
|
/*
|
|
- * Submit all the data buffers to disk
|
|
+ * Wait for data submitted for writeout, refile inodes to proper
|
|
+ * transaction if needed.
|
|
+ *
|
|
*/
|
|
-static void journal_submit_data_buffers(journal_t *journal,
|
|
- transaction_t *commit_transaction)
|
|
+static int journal_finish_inode_data_buffers(journal_t *journal,
|
|
+ transaction_t *commit_transaction)
|
|
{
|
|
- struct journal_head *jh;
|
|
- struct buffer_head *bh;
|
|
- int locked;
|
|
- int bufs = 0;
|
|
- struct buffer_head **wbuf = journal->j_wbuf;
|
|
+ struct jbd2_inode *jinode, *next_i;
|
|
+ int err, ret = 0;
|
|
|
|
- /*
|
|
- * Whenever we unlock the journal and sleep, things can get added
|
|
- * onto ->t_sync_datalist, so we have to keep looping back to
|
|
- * write_out_data until we *know* that the list is empty.
|
|
- *
|
|
- * Cleanup any flushed data buffers from the data list. Even in
|
|
- * abort mode, we want to flush this out as soon as possible.
|
|
- */
|
|
-write_out_data:
|
|
- cond_resched();
|
|
+ /* For locking, see the comment in journal_submit_data_buffers() */
|
|
spin_lock(&journal->j_list_lock);
|
|
-
|
|
- while (commit_transaction->t_sync_datalist) {
|
|
- jh = commit_transaction->t_sync_datalist;
|
|
- bh = jh2bh(jh);
|
|
- locked = 0;
|
|
-
|
|
- /* Get reference just to make sure buffer does not disappear
|
|
- * when we are forced to drop various locks */
|
|
- get_bh(bh);
|
|
- /* If the buffer is dirty, we need to submit IO and hence
|
|
- * we need the buffer lock. We try to lock the buffer without
|
|
- * blocking. If we fail, we need to drop j_list_lock and do
|
|
- * blocking lock_buffer().
|
|
- */
|
|
- if (buffer_dirty(bh)) {
|
|
- if (test_set_buffer_locked(bh)) {
|
|
- BUFFER_TRACE(bh, "needs blocking lock");
|
|
- spin_unlock(&journal->j_list_lock);
|
|
- /* Write out all data to prevent deadlocks */
|
|
- journal_do_submit_data(wbuf, bufs);
|
|
- bufs = 0;
|
|
- lock_buffer(bh);
|
|
- spin_lock(&journal->j_list_lock);
|
|
- }
|
|
- locked = 1;
|
|
- }
|
|
- /* We have to get bh_state lock. Again out of order, sigh. */
|
|
- if (!inverted_lock(journal, bh)) {
|
|
- jbd_lock_bh_state(bh);
|
|
- spin_lock(&journal->j_list_lock);
|
|
- }
|
|
- /* Someone already cleaned up the buffer? */
|
|
- if (!buffer_jbd(bh)
|
|
- || jh->b_transaction != commit_transaction
|
|
- || jh->b_jlist != BJ_SyncData) {
|
|
- jbd_unlock_bh_state(bh);
|
|
- if (locked)
|
|
- unlock_buffer(bh);
|
|
- BUFFER_TRACE(bh, "already cleaned up");
|
|
- put_bh(bh);
|
|
- continue;
|
|
- }
|
|
- if (locked && test_clear_buffer_dirty(bh)) {
|
|
- BUFFER_TRACE(bh, "needs writeout, adding to array");
|
|
- wbuf[bufs++] = bh;
|
|
- __jbd2_journal_file_buffer(jh, commit_transaction,
|
|
- BJ_Locked);
|
|
- jbd_unlock_bh_state(bh);
|
|
- if (bufs == journal->j_wbufsize) {
|
|
- spin_unlock(&journal->j_list_lock);
|
|
- journal_do_submit_data(wbuf, bufs);
|
|
- bufs = 0;
|
|
- goto write_out_data;
|
|
- }
|
|
- } else if (!locked && buffer_locked(bh)) {
|
|
- __jbd2_journal_file_buffer(jh, commit_transaction,
|
|
- BJ_Locked);
|
|
- jbd_unlock_bh_state(bh);
|
|
- put_bh(bh);
|
|
- } else {
|
|
- BUFFER_TRACE(bh, "writeout complete: unfile");
|
|
- __jbd2_journal_unfile_buffer(jh);
|
|
- jbd_unlock_bh_state(bh);
|
|
- if (locked)
|
|
- unlock_buffer(bh);
|
|
- jbd2_journal_remove_journal_head(bh);
|
|
- /* Once for our safety reference, once for
|
|
- * jbd2_journal_remove_journal_head() */
|
|
- put_bh(bh);
|
|
- put_bh(bh);
|
|
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
|
|
+ jinode->i_flags |= JI_COMMIT_RUNNING;
|
|
+ spin_unlock(&journal->j_list_lock);
|
|
+ err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
|
|
+ if (err) {
|
|
+ /*
|
|
+ * Because AS_EIO is cleared by
|
|
+ * wait_on_page_writeback_range(), set it again so
|
|
+ * that user process can get -EIO from fsync().
|
|
+ */
|
|
+ set_bit(AS_EIO,
|
|
+ &jinode->i_vfs_inode->i_mapping->flags);
|
|
+
|
|
+ if (!ret)
|
|
+ ret = err;
|
|
}
|
|
+ spin_lock(&journal->j_list_lock);
|
|
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
|
|
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
|
|
+ }
|
|
|
|
- if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
|
|
- spin_unlock(&journal->j_list_lock);
|
|
- goto write_out_data;
|
|
+ /* Now refile inode to proper lists */
|
|
+ list_for_each_entry_safe(jinode, next_i,
|
|
+ &commit_transaction->t_inode_list, i_list) {
|
|
+ list_del(&jinode->i_list);
|
|
+ if (jinode->i_next_transaction) {
|
|
+ jinode->i_transaction = jinode->i_next_transaction;
|
|
+ jinode->i_next_transaction = NULL;
|
|
+ list_add(&jinode->i_list,
|
|
+ &jinode->i_transaction->t_inode_list);
|
|
+ } else {
|
|
+ jinode->i_transaction = NULL;
|
|
}
|
|
}
|
|
spin_unlock(&journal->j_list_lock);
|
|
- journal_do_submit_data(wbuf, bufs);
|
|
+
|
|
+ return ret;
|
|
}
|
|
|
|
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
|
|
@@ -524,21 +469,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
|
|
* Now start flushing things to disk, in the order they appear
|
|
* on the transaction lists. Data blocks go first.
|
|
*/
|
|
- err = 0;
|
|
- journal_submit_data_buffers(journal, commit_transaction);
|
|
-
|
|
- /*
|
|
- * Wait for all previously submitted IO to complete if commit
|
|
- * record is to be written synchronously.
|
|
- */
|
|
- spin_lock(&journal->j_list_lock);
|
|
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
|
|
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
|
|
- err = journal_wait_on_locked_list(journal,
|
|
- commit_transaction);
|
|
-
|
|
- spin_unlock(&journal->j_list_lock);
|
|
-
|
|
+ err = journal_submit_data_buffers(journal, commit_transaction);
|
|
if (err)
|
|
jbd2_journal_abort(journal, err);
|
|
|
|
@@ -547,16 +478,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
|
|
jbd_debug(3, "JBD: commit phase 2\n");
|
|
|
|
/*
|
|
- * If we found any dirty or locked buffers, then we should have
|
|
- * looped back up to the write_out_data label. If there weren't
|
|
- * any then journal_clean_data_list should have wiped the list
|
|
- * clean by now, so check that it is in fact empty.
|
|
- */
|
|
- J_ASSERT (commit_transaction->t_sync_datalist == NULL);
|
|
-
|
|
- jbd_debug (3, "JBD: commit phase 3\n");
|
|
-
|
|
- /*
|
|
* Way to go: we have now written out all of the data for a
|
|
* transaction! Now comes the tricky part: we need to write out
|
|
* metadata. Loop over the transaction's entire buffer list:
|
|
@@ -574,6 +495,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
|
|
J_ASSERT(commit_transaction->t_nr_buffers <=
|
|
commit_transaction->t_outstanding_credits);
|
|
|
|
+ err = 0;
|
|
descriptor = NULL;
|
|
bufs = 0;
|
|
while (commit_transaction->t_buffers) {
|
|
@@ -748,13 +670,23 @@ start_journal_io:
|
|
&cbh, crc32_sum);
|
|
if (err)
|
|
__jbd2_journal_abort_hard(journal);
|
|
+ }
|
|
|
|
- spin_lock(&journal->j_list_lock);
|
|
- err = journal_wait_on_locked_list(journal,
|
|
- commit_transaction);
|
|
- spin_unlock(&journal->j_list_lock);
|
|
- if (err)
|
|
- __jbd2_journal_abort_hard(journal);
|
|
+ /*
|
|
+ * This is the right place to wait for data buffers both for ASYNC
|
|
+ * and !ASYNC commit. If commit is ASYNC, we need to wait only after
|
|
+ * the commit block went to disk (which happens above). If commit is
|
|
+ * SYNC, we need to wait for data buffers before we start writing
|
|
+ * commit block, which happens below in such setting.
|
|
+ */
|
|
+ err = journal_finish_inode_data_buffers(journal, commit_transaction);
|
|
+ if (err) {
|
|
+ char b[BDEVNAME_SIZE];
|
|
+
|
|
+ printk(KERN_WARNING
|
|
+ "JBD2: Detected IO errors while flushing file data "
|
|
+ "on %s\n", bdevname(journal->j_fs_dev, b));
|
|
+ err = 0;
|
|
}
|
|
|
|
/* Lo and behold: we have just managed to send a transaction to
|
|
@@ -768,7 +700,7 @@ start_journal_io:
|
|
so we incur less scheduling load.
|
|
*/
|
|
|
|
- jbd_debug(3, "JBD: commit phase 4\n");
|
|
+ jbd_debug(3, "JBD: commit phase 3\n");
|
|
|
|
/*
|
|
* akpm: these are BJ_IO, and j_list_lock is not needed.
|
|
@@ -827,7 +759,7 @@ wait_for_iobuf:
|
|
|
|
J_ASSERT (commit_transaction->t_shadow_list == NULL);
|
|
|
|
- jbd_debug(3, "JBD: commit phase 5\n");
|
|
+ jbd_debug(3, "JBD: commit phase 4\n");
|
|
|
|
/* Here we wait for the revoke record and descriptor record buffers */
|
|
wait_for_ctlbuf:
|
|
@@ -854,7 +786,7 @@ wait_for_iobuf:
|
|
/* AKPM: bforget here */
|
|
}
|
|
|
|
- jbd_debug(3, "JBD: commit phase 6\n");
|
|
+ jbd_debug(3, "JBD: commit phase 5\n");
|
|
|
|
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
|
|
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
|
|
@@ -874,9 +806,9 @@ wait_for_iobuf:
|
|
transaction can be removed from any checkpoint list it was on
|
|
before. */
|
|
|
|
- jbd_debug(3, "JBD: commit phase 7\n");
|
|
+ jbd_debug(3, "JBD: commit phase 6\n");
|
|
|
|
- J_ASSERT(commit_transaction->t_sync_datalist == NULL);
|
|
+ J_ASSERT(list_empty(&commit_transaction->t_inode_list));
|
|
J_ASSERT(commit_transaction->t_buffers == NULL);
|
|
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
|
|
J_ASSERT(commit_transaction->t_iobuf_list == NULL);
|
|
@@ -997,7 +929,7 @@ restart_loop:
|
|
|
|
/* Done with this transaction! */
|
|
|
|
- jbd_debug(3, "JBD: commit phase 8\n");
|
|
+ jbd_debug(3, "JBD: commit phase 7\n");
|
|
|
|
J_ASSERT(commit_transaction->t_state == T_COMMIT);
|
|
|
|
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
|
|
index 2e24567..8207a01 100644
|
|
--- a/fs/jbd2/journal.c
|
|
+++ b/fs/jbd2/journal.c
|
|
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
|
|
EXPORT_SYMBOL(jbd2_journal_get_write_access);
|
|
EXPORT_SYMBOL(jbd2_journal_get_create_access);
|
|
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
|
|
-EXPORT_SYMBOL(jbd2_journal_dirty_data);
|
|
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
|
|
EXPORT_SYMBOL(jbd2_journal_release_buffer);
|
|
EXPORT_SYMBOL(jbd2_journal_forget);
|
|
@@ -69,7 +68,6 @@ EXPORT_SYMBOL(jbd2_journal_set_features);
|
|
EXPORT_SYMBOL(jbd2_journal_create);
|
|
EXPORT_SYMBOL(jbd2_journal_load);
|
|
EXPORT_SYMBOL(jbd2_journal_destroy);
|
|
-EXPORT_SYMBOL(jbd2_journal_update_superblock);
|
|
EXPORT_SYMBOL(jbd2_journal_abort);
|
|
EXPORT_SYMBOL(jbd2_journal_errno);
|
|
EXPORT_SYMBOL(jbd2_journal_ack_err);
|
|
@@ -82,6 +80,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
|
|
EXPORT_SYMBOL(jbd2_journal_invalidatepage);
|
|
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
|
|
EXPORT_SYMBOL(jbd2_journal_force_commit);
|
|
+EXPORT_SYMBOL(jbd2_journal_file_inode);
|
|
+EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
|
|
+EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
|
|
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
|
|
|
|
static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
|
|
static void __journal_abort_soft (journal_t *journal, int errno);
|
|
@@ -2195,6 +2197,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
|
|
}
|
|
|
|
/*
|
|
+ * Initialize jbd inode head
|
|
+ */
|
|
+void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
|
|
+{
|
|
+ jinode->i_transaction = NULL;
|
|
+ jinode->i_next_transaction = NULL;
|
|
+ jinode->i_vfs_inode = inode;
|
|
+ jinode->i_flags = 0;
|
|
+ INIT_LIST_HEAD(&jinode->i_list);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Function to be called before we start removing inode from memory (i.e.,
|
|
+ * clear_inode() is a fine place to be called from). It removes inode from
|
|
+ * transaction's lists.
|
|
+ */
|
|
+void jbd2_journal_release_jbd_inode(journal_t *journal,
|
|
+ struct jbd2_inode *jinode)
|
|
+{
|
|
+ int writeout = 0;
|
|
+
|
|
+ if (!journal)
|
|
+ return;
|
|
+restart:
|
|
+ spin_lock(&journal->j_list_lock);
|
|
+ /* Is commit writing out inode - we have to wait */
|
|
+ if (jinode->i_flags & JI_COMMIT_RUNNING) {
|
|
+ wait_queue_head_t *wq;
|
|
+ DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
|
|
+ wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
|
|
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
|
|
+ spin_unlock(&journal->j_list_lock);
|
|
+ schedule();
|
|
+ finish_wait(wq, &wait.wait);
|
|
+ goto restart;
|
|
+ }
|
|
+
|
|
+ /* Do we need to wait for data writeback? */
|
|
+ if (journal->j_committing_transaction == jinode->i_transaction)
|
|
+ writeout = 1;
|
|
+ if (jinode->i_transaction) {
|
|
+ list_del(&jinode->i_list);
|
|
+ jinode->i_transaction = NULL;
|
|
+ }
|
|
+ spin_unlock(&journal->j_list_lock);
|
|
+}
|
|
+
|
|
+/*
|
|
* debugfs tunables
|
|
*/
|
|
#ifdef CONFIG_JBD2_DEBUG
|
|
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
|
|
index d6e006e..4f7cadb 100644
|
|
--- a/fs/jbd2/transaction.c
|
|
+++ b/fs/jbd2/transaction.c
|
|
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
|
|
* new transaction and we can't block without protecting against other
|
|
* processes trying to touch the journal while it is in transition.
|
|
*
|
|
- * Called under j_state_lock
|
|
*/
|
|
|
|
static transaction_t *
|
|
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
|
|
transaction->t_tid = journal->j_transaction_sequence++;
|
|
transaction->t_expires = jiffies + journal->j_commit_interval;
|
|
spin_lock_init(&transaction->t_handle_lock);
|
|
+ INIT_LIST_HEAD(&transaction->t_inode_list);
|
|
|
|
/* Set up the commit timer for the new transaction. */
|
|
journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
|
|
@@ -943,183 +943,6 @@ out:
|
|
}
|
|
|
|
/**
|
|
- * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
|
|
- * needs to be flushed before we can commit the
|
|
- * current transaction.
|
|
- * @handle: transaction
|
|
- * @bh: bufferhead to mark
|
|
- *
|
|
- * The buffer is placed on the transaction's data list and is marked as
|
|
- * belonging to the transaction.
|
|
- *
|
|
- * Returns error number or 0 on success.
|
|
- *
|
|
- * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
|
|
- * by kswapd.
|
|
- */
|
|
-int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
|
|
-{
|
|
- journal_t *journal = handle->h_transaction->t_journal;
|
|
- int need_brelse = 0;
|
|
- struct journal_head *jh;
|
|
-
|
|
- if (is_handle_aborted(handle))
|
|
- return 0;
|
|
-
|
|
- jh = jbd2_journal_add_journal_head(bh);
|
|
- JBUFFER_TRACE(jh, "entry");
|
|
-
|
|
- /*
|
|
- * The buffer could *already* be dirty. Writeout can start
|
|
- * at any time.
|
|
- */
|
|
- jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
|
|
-
|
|
- /*
|
|
- * What if the buffer is already part of a running transaction?
|
|
- *
|
|
- * There are two cases:
|
|
- * 1) It is part of the current running transaction. Refile it,
|
|
- * just in case we have allocated it as metadata, deallocated
|
|
- * it, then reallocated it as data.
|
|
- * 2) It is part of the previous, still-committing transaction.
|
|
- * If all we want to do is to guarantee that the buffer will be
|
|
- * written to disk before this new transaction commits, then
|
|
- * being sure that the *previous* transaction has this same
|
|
- * property is sufficient for us! Just leave it on its old
|
|
- * transaction.
|
|
- *
|
|
- * In case (2), the buffer must not already exist as metadata
|
|
- * --- that would violate write ordering (a transaction is free
|
|
- * to write its data at any point, even before the previous
|
|
- * committing transaction has committed). The caller must
|
|
- * never, ever allow this to happen: there's nothing we can do
|
|
- * about it in this layer.
|
|
- */
|
|
- jbd_lock_bh_state(bh);
|
|
- spin_lock(&journal->j_list_lock);
|
|
-
|
|
- /* Now that we have bh_state locked, are we really still mapped? */
|
|
- if (!buffer_mapped(bh)) {
|
|
- JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
|
|
- goto no_journal;
|
|
- }
|
|
-
|
|
- if (jh->b_transaction) {
|
|
- JBUFFER_TRACE(jh, "has transaction");
|
|
- if (jh->b_transaction != handle->h_transaction) {
|
|
- JBUFFER_TRACE(jh, "belongs to older transaction");
|
|
- J_ASSERT_JH(jh, jh->b_transaction ==
|
|
- journal->j_committing_transaction);
|
|
-
|
|
- /* @@@ IS THIS TRUE ? */
|
|
- /*
|
|
- * Not any more. Scenario: someone does a write()
|
|
- * in data=journal mode. The buffer's transaction has
|
|
- * moved into commit. Then someone does another
|
|
- * write() to the file. We do the frozen data copyout
|
|
- * and set b_next_transaction to point to j_running_t.
|
|
- * And while we're in that state, someone does a
|
|
- * writepage() in an attempt to pageout the same area
|
|
- * of the file via a shared mapping. At present that
|
|
- * calls jbd2_journal_dirty_data(), and we get right here.
|
|
- * It may be too late to journal the data. Simply
|
|
- * falling through to the next test will suffice: the
|
|
- * data will be dirty and wil be checkpointed. The
|
|
- * ordering comments in the next comment block still
|
|
- * apply.
|
|
- */
|
|
- //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
|
|
-
|
|
- /*
|
|
- * If we're journalling data, and this buffer was
|
|
- * subject to a write(), it could be metadata, forget
|
|
- * or shadow against the committing transaction. Now,
|
|
- * someone has dirtied the same darn page via a mapping
|
|
- * and it is being writepage()'d.
|
|
- * We *could* just steal the page from commit, with some
|
|
- * fancy locking there. Instead, we just skip it -
|
|
- * don't tie the page's buffers to the new transaction
|
|
- * at all.
|
|
- * Implication: if we crash before the writepage() data
|
|
- * is written into the filesystem, recovery will replay
|
|
- * the write() data.
|
|
- */
|
|
- if (jh->b_jlist != BJ_None &&
|
|
- jh->b_jlist != BJ_SyncData &&
|
|
- jh->b_jlist != BJ_Locked) {
|
|
- JBUFFER_TRACE(jh, "Not stealing");
|
|
- goto no_journal;
|
|
- }
|
|
-
|
|
- /*
|
|
- * This buffer may be undergoing writeout in commit. We
|
|
- * can't return from here and let the caller dirty it
|
|
- * again because that can cause the write-out loop in
|
|
- * commit to never terminate.
|
|
- */
|
|
- if (buffer_dirty(bh)) {
|
|
- get_bh(bh);
|
|
- spin_unlock(&journal->j_list_lock);
|
|
- jbd_unlock_bh_state(bh);
|
|
- need_brelse = 1;
|
|
- sync_dirty_buffer(bh);
|
|
- jbd_lock_bh_state(bh);
|
|
- spin_lock(&journal->j_list_lock);
|
|
- /* Since we dropped the lock... */
|
|
- if (!buffer_mapped(bh)) {
|
|
- JBUFFER_TRACE(jh, "buffer got unmapped");
|
|
- goto no_journal;
|
|
- }
|
|
- /* The buffer may become locked again at any
|
|
- time if it is redirtied */
|
|
- }
|
|
-
|
|
- /* journal_clean_data_list() may have got there first */
|
|
- if (jh->b_transaction != NULL) {
|
|
- JBUFFER_TRACE(jh, "unfile from commit");
|
|
- __jbd2_journal_temp_unlink_buffer(jh);
|
|
- /* It still points to the committing
|
|
- * transaction; move it to this one so
|
|
- * that the refile assert checks are
|
|
- * happy. */
|
|
- jh->b_transaction = handle->h_transaction;
|
|
- }
|
|
- /* The buffer will be refiled below */
|
|
-
|
|
- }
|
|
- /*
|
|
- * Special case --- the buffer might actually have been
|
|
- * allocated and then immediately deallocated in the previous,
|
|
- * committing transaction, so might still be left on that
|
|
- * transaction's metadata lists.
|
|
- */
|
|
- if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
|
|
- JBUFFER_TRACE(jh, "not on correct data list: unfile");
|
|
- J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
|
|
- __jbd2_journal_temp_unlink_buffer(jh);
|
|
- jh->b_transaction = handle->h_transaction;
|
|
- JBUFFER_TRACE(jh, "file as data");
|
|
- __jbd2_journal_file_buffer(jh, handle->h_transaction,
|
|
- BJ_SyncData);
|
|
- }
|
|
- } else {
|
|
- JBUFFER_TRACE(jh, "not on a transaction");
|
|
- __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
|
|
- }
|
|
-no_journal:
|
|
- spin_unlock(&journal->j_list_lock);
|
|
- jbd_unlock_bh_state(bh);
|
|
- if (need_brelse) {
|
|
- BUFFER_TRACE(bh, "brelse");
|
|
- __brelse(bh);
|
|
- }
|
|
- JBUFFER_TRACE(jh, "exit");
|
|
- jbd2_journal_put_journal_head(jh);
|
|
- return 0;
|
|
-}
|
|
-
|
|
-/**
|
|
* int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
|
|
* @handle: transaction to add buffer to.
|
|
* @bh: buffer to mark
|
|
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
|
|
* Remove a buffer from the appropriate transaction list.
|
|
*
|
|
* Note that this function can *change* the value of
|
|
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
|
|
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
|
|
- * is holding onto a copy of one of thee pointers, it could go bad.
|
|
- * Generally the caller needs to re-read the pointer from the transaction_t.
|
|
+ * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
|
|
+ * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
|
|
+ * of these pointers, it could go bad. Generally the caller needs to re-read
|
|
+ * the pointer from the transaction_t.
|
|
*
|
|
* Called under j_list_lock. The journal may not be locked.
|
|
*/
|
|
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
|
|
switch (jh->b_jlist) {
|
|
case BJ_None:
|
|
return;
|
|
- case BJ_SyncData:
|
|
- list = &transaction->t_sync_datalist;
|
|
- break;
|
|
case BJ_Metadata:
|
|
transaction->t_nr_buffers--;
|
|
J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
|
|
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
|
|
case BJ_Reserved:
|
|
list = &transaction->t_reserved_list;
|
|
break;
|
|
- case BJ_Locked:
|
|
- list = &transaction->t_locked_list;
|
|
- break;
|
|
}
|
|
|
|
__blist_del_buffer(list, jh);
|
|
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
|
|
goto out;
|
|
|
|
spin_lock(&journal->j_list_lock);
|
|
- if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
|
|
- if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
|
|
- /* A written-back ordered data buffer */
|
|
- JBUFFER_TRACE(jh, "release data");
|
|
- __jbd2_journal_unfile_buffer(jh);
|
|
- jbd2_journal_remove_journal_head(bh);
|
|
- __brelse(bh);
|
|
- }
|
|
- } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
|
|
+ if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
|
|
/* written-back checkpointed metadata buffer */
|
|
if (jh->b_jlist == BJ_None) {
|
|
JBUFFER_TRACE(jh, "remove from checkpoint list");
|
|
@@ -1656,12 +1465,43 @@ out:
|
|
return;
|
|
}
|
|
|
|
+/*
|
|
+ * jbd2_journal_try_to_free_buffers() could race with
|
|
+ * jbd2_journal_commit_transaction(). The later might still hold the
|
|
+ * reference count to the buffers when inspecting them on
|
|
+ * t_syncdata_list or t_locked_list.
|
|
+ *
|
|
+ * jbd2_journal_try_to_free_buffers() will call this function to
|
|
+ * wait for the current transaction to finish syncing data buffers, before
|
|
+ * try to free that buffer.
|
|
+ *
|
|
+ * Called with journal->j_state_lock hold.
|
|
+ */
|
|
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
|
|
+{
|
|
+ transaction_t *transaction;
|
|
+ tid_t tid;
|
|
+
|
|
+ spin_lock(&journal->j_state_lock);
|
|
+ transaction = journal->j_committing_transaction;
|
|
+
|
|
+ if (!transaction) {
|
|
+ spin_unlock(&journal->j_state_lock);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ tid = transaction->t_tid;
|
|
+ spin_unlock(&journal->j_state_lock);
|
|
+ jbd2_log_wait_commit(journal, tid);
|
|
+}
|
|
|
|
/**
|
|
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
|
|
* @journal: journal for operation
|
|
* @page: to try and free
|
|
- * @unused_gfp_mask: unused
|
|
+ * @gfp_mask: we use the mask to detect how hard should we try to release
|
|
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
|
|
+ * release the buffers.
|
|
*
|
|
*
|
|
* For all the buffers on this page,
|
|
@@ -1690,9 +1530,11 @@ out:
|
|
* journal_try_to_free_buffer() is changing its state. But that
|
|
* cannot happen because we never reallocate freed data as metadata
|
|
* while the data is part of a transaction. Yes?
|
|
+ *
|
|
+ * Return 0 on failure, 1 on success
|
|
*/
|
|
int jbd2_journal_try_to_free_buffers(journal_t *journal,
|
|
- struct page *page, gfp_t unused_gfp_mask)
|
|
+ struct page *page, gfp_t gfp_mask)
|
|
{
|
|
struct buffer_head *head;
|
|
struct buffer_head *bh;
|
|
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
|
|
/*
|
|
* We take our own ref against the journal_head here to avoid
|
|
* having to add tons of locking around each instance of
|
|
- * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
|
|
+ * jbd2_journal_remove_journal_head() and
|
|
+ * jbd2_journal_put_journal_head().
|
|
*/
|
|
jh = jbd2_journal_grab_journal_head(bh);
|
|
if (!jh)
|
|
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
|
|
if (buffer_jbd(bh))
|
|
goto busy;
|
|
} while ((bh = bh->b_this_page) != head);
|
|
+
|
|
ret = try_to_free_buffers(page);
|
|
+
|
|
+ /*
|
|
+ * There are a number of places where jbd2_journal_try_to_free_buffers()
|
|
+ * could race with jbd2_journal_commit_transaction(), the later still
|
|
+ * holds the reference to the buffers to free while processing them.
|
|
+ * try_to_free_buffers() failed to free those buffers. Some of the
|
|
+ * caller of releasepage() request page buffers to be dropped, otherwise
|
|
+ * treat the fail-to-free as errors (such as generic_file_direct_IO())
|
|
+ *
|
|
+ * So, if the caller of try_to_release_page() wants the synchronous
|
|
+ * behaviour(i.e make sure buffers are dropped upon return),
|
|
+ * let's wait for the current transaction to finish flush of
|
|
+ * dirty data buffers, then try to free those buffers again,
|
|
+ * with the journal locked.
|
|
+ */
|
|
+	if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+		jbd2_journal_wait_for_transaction_sync_data(journal);
+		ret = try_to_free_buffers(page);
+	}
+
 busy:
 	return ret;
 }
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 	if (!buffer_jbd(bh))
 		goto zap_buffer_unlocked;

+	/* OK, we have data buffer in journaled mode */
 	spin_lock(&journal->j_state_lock);
 	jbd_lock_bh_state(bh);
 	spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 		}
 	} else if (transaction == journal->j_committing_transaction) {
 		JBUFFER_TRACE(jh, "on committing transaction");
-		if (jh->b_jlist == BJ_Locked) {
-			/*
-			 * The buffer is on the committing transaction's locked
-			 * list. We have the buffer locked, so I/O has
-			 * completed. So we can nail the buffer now.
-			 */
-			may_free = __dispose_buffer(jh, transaction);
-			goto zap_buffer;
-		}
 		/*
 		 * If it is committing, we simply cannot touch it. We
 		 * can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
 		J_ASSERT_JH(jh, !jh->b_committed_data);
 		J_ASSERT_JH(jh, !jh->b_frozen_data);
 		return;
-	case BJ_SyncData:
-		list = &transaction->t_sync_datalist;
-		break;
 	case BJ_Metadata:
 		transaction->t_nr_buffers++;
 		list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
-	case BJ_Locked:
-		list = &transaction->t_locked_list;
-		break;
 	}

 	__blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
 	spin_unlock(&journal->j_list_lock);
 	__brelse(bh);
 }
+
+/*
+ * File the inode in the inode list of the handle's transaction.
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+
+	if (is_handle_aborted(handle))
+		return -EIO;
+
+	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+			transaction->t_tid);
+
+	/*
+	 * First check whether the inode isn't already on the transaction's
+	 * lists without taking the lock. Note that this check is safe
+	 * without the lock as we cannot race with somebody removing the
+	 * inode from the transaction. The reason is that we remove the
+	 * inode from a transaction only in journal_release_jbd_inode() and
+	 * when we commit the transaction. We are guarded from the first
+	 * case by holding a reference to the inode. We are safe against the
+	 * second case because if jinode->i_transaction == transaction, the
+	 * commit code cannot touch the transaction because we hold a
+	 * reference to it, and if jinode->i_next_transaction == transaction,
+	 * the commit code will only file the inode where we want it.
+	 */
+	if (jinode->i_transaction == transaction ||
+	    jinode->i_next_transaction == transaction)
+		return 0;
+
+	spin_lock(&journal->j_list_lock);
+
+	if (jinode->i_transaction == transaction ||
+	    jinode->i_next_transaction == transaction)
+		goto done;
+
+	/* On some different transaction's list - should be
+	 * the committing one */
+	if (jinode->i_transaction) {
+		J_ASSERT(jinode->i_next_transaction == NULL);
+		J_ASSERT(jinode->i_transaction ==
+					journal->j_committing_transaction);
+		jinode->i_next_transaction = transaction;
+		goto done;
+	}
+	/* Not on any transaction list... */
+	J_ASSERT(!jinode->i_next_transaction);
+	jinode->i_transaction = transaction;
+	list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+	spin_unlock(&journal->j_list_lock);
+
+	return 0;
+}
+
+/*
+ * This function must be called before truncation of an inode that is
+ * journaled in ordered mode. It starts writeout of the truncated part
+ * in case the inode is on the committing transaction, so that the
+ * ordered mode consistency guarantees are preserved.
+ */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+					loff_t new_size)
+{
+	journal_t *journal;
+	transaction_t *commit_trans;
+	int ret = 0;
+
+	if (!inode->i_transaction && !inode->i_next_transaction)
+		goto out;
+	journal = inode->i_transaction->t_journal;
+	spin_lock(&journal->j_state_lock);
+	commit_trans = journal->j_committing_transaction;
+	spin_unlock(&journal->j_state_lock);
+	if (inode->i_transaction == commit_trans) {
+		ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+			new_size, LLONG_MAX);
+		if (ret)
+			jbd2_journal_abort(journal, ret);
+	}
+out:
+	return ret;
+}
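With BJ_SyncData and the per-buffer ordered-data lists gone, ordered mode is driven entirely through the jbd2_inode hooks above: the filesystem files the inode into the running transaction when it dirties page-cache data under a handle, and it must start writeout of the soon-to-be-truncated range if the committing transaction still references the inode. A minimal sketch of how a caller might wire up the two hooks follows; the helper names (example_*, get_jbd2_inode()) are illustrative and not part of this patchset.

/*
 * Illustrative only -- not part of the patch.  Assumes the filesystem
 * embeds a struct jbd2_inode in its in-core inode, reachable through a
 * hypothetical get_jbd2_inode() helper, and runs in data=ordered mode.
 */
static int example_order_data(handle_t *handle, struct inode *inode)
{
	/*
	 * Tag the inode on the running transaction's t_inode_list so the
	 * commit code flushes its dirty pages before committing
	 * (replaces the old per-buffer jbd2_journal_dirty_data() calls).
	 */
	return jbd2_journal_file_inode(handle, get_jbd2_inode(inode));
}

static int example_truncate_begin(struct inode *inode, loff_t new_size)
{
	/*
	 * If the committing transaction still owns this inode, write out
	 * the pages beyond new_size first so the ordered-mode guarantees
	 * survive the truncate.
	 */
	return jbd2_journal_begin_ordered_truncate(get_jbd2_inode(inode),
						   new_size);
}

This is the pairing the fs/ext4 changes earlier in this patchset use in their write-out and truncate paths.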
diff --git a/fs/mpage.c b/fs/mpage.c
index 235e4d3..dbcc7af 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
 	bio_put(bio);
 }

-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+struct bio *mpage_bio_submit(int rw, struct bio *bio)
 {
 	bio->bi_end_io = mpage_end_io_read;
 	if (rw == WRITE)
@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
 	submit_bio(rw, bio);
 	return NULL;
 }
+EXPORT_SYMBOL(mpage_bio_submit);

 static struct bio *
 mpage_alloc(struct block_device *bdev,
@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
  * written, so it can intelligently allocate a suitably-sized BIO. For now,
  * just allocate full-size (16-page) BIOs.
  */
-struct mpage_data {
-	struct bio *bio;
-	sector_t last_block_in_bio;
-	get_block_t *get_block;
-	unsigned use_writepage;
-};

-static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
-			void *data)
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+		      void *data)
 {
 	struct mpage_data *mpd = data;
 	struct bio *bio = mpd->bio;
@@ -651,6 +646,7 @@ out:
 	mpd->bio = bio;
 	return ret;
 }
+EXPORT_SYMBOL(__mpage_writepage);

 /**
  * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d8e2762..97f992a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1740,6 +1740,8 @@ extern int wait_on_page_writeback_range(struct address_space *mapping,
 				pgoff_t start, pgoff_t end);
 extern int __filemap_fdatawrite_range(struct address_space *mapping,
 				loff_t start, loff_t end, int sync_mode);
+extern int filemap_fdatawrite_range(struct address_space *mapping,
+				loff_t start, loff_t end);

 extern long do_fsync(struct file *file, int datasync);
 extern void sync_supers(void);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index d147f0f..3dd2090 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -168,6 +168,8 @@ struct commit_header {
 	unsigned char h_chksum_size;
 	unsigned char h_padding[2];
 	__be32 h_chksum[JBD2_CHECKSUM_BYTES];
+	__be64 h_commit_sec;
+	__be32 h_commit_nsec;
 };

 /*
@@ -379,6 +381,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
 	bit_spin_unlock(BH_JournalHead, &bh->b_state);
 }

+/* Flags in jbd2_inode->i_flags */
+#define __JI_COMMIT_RUNNING 0
+/* Commit of the inode data in progress. We use this flag to protect us from
+ * concurrent deletion of the inode. We cannot use a reference to the inode
+ * for this since we cannot afford doing the last iput() on behalf of kjournald.
+ */
+#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
+
+/**
+ * struct jbd2_inode is the structure linking inodes in ordered mode
+ * present in a transaction so that we can sync them during commit.
+ */
+struct jbd2_inode {
+	/* Which transaction does this inode belong to? Either the running
+	 * transaction or the committing one. [j_list_lock] */
+	transaction_t *i_transaction;
+
+	/* Pointer to the running transaction modifying inode's data in case
+	 * there is already a committing transaction touching it. [j_list_lock] */
+	transaction_t *i_next_transaction;
+
+	/* List of inodes in the i_transaction [j_list_lock] */
+	struct list_head i_list;
+
+	/* VFS inode this inode belongs to [constant during the lifetime
+	 * of the structure] */
+	struct inode *i_vfs_inode;
+
+	/* Flags of inode [j_list_lock] */
+	unsigned int i_flags;
+};
+
 struct jbd2_revoke_table_s;

 /**
@@ -509,24 +543,12 @@ struct transaction_s
 	struct journal_head *t_reserved_list;

 	/*
-	 * Doubly-linked circular list of all buffers under writeout during
-	 * commit [j_list_lock]
-	 */
-	struct journal_head *t_locked_list;
-
-	/*
 	 * Doubly-linked circular list of all metadata buffers owned by this
 	 * transaction [j_list_lock]
 	 */
 	struct journal_head *t_buffers;

 	/*
-	 * Doubly-linked circular list of all data buffers still to be
-	 * flushed before this transaction can be committed [j_list_lock]
-	 */
-	struct journal_head *t_sync_datalist;
-
-	/*
 	 * Doubly-linked circular list of all forget buffers (superseded
 	 * buffers which we can un-checkpoint once this transaction commits)
 	 * [j_list_lock]
@@ -565,6 +587,12 @@ struct transaction_s
 	struct journal_head *t_log_list;

 	/*
+	 * List of inodes whose data we've modified in data=ordered mode.
+	 * [j_list_lock]
+	 */
+	struct list_head t_inode_list;
+
+	/*
 	 * Protects info related to handles
 	 */
 	spinlock_t t_handle_lock;
@@ -1004,7 +1032,6 @@ extern int jbd2_journal_extend (handle_t *, int nblocks);
 extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
 extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
 extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
-extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *);
 extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
 extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *);
 extern int jbd2_journal_forget (handle_t *, struct buffer_head *);
@@ -1044,6 +1071,10 @@ extern void jbd2_journal_ack_err (journal_t *);
 extern int jbd2_journal_clear_err (journal_t *);
 extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
 extern int jbd2_journal_force_commit(journal_t *);
+extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
+extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size);
+extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
+extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);

 /*
  * journal_head management
@@ -1179,15 +1210,13 @@ static inline int jbd_space_needed(journal_t *journal)

 /* journaling buffer types */
 #define BJ_None 0 /* Not journaled */
-#define BJ_SyncData 1 /* Normal data: flush before commit */
-#define BJ_Metadata 2 /* Normal journaled metadata */
-#define BJ_Forget 3 /* Buffer superseded by this transaction */
-#define BJ_IO 4 /* Buffer is for temporary IO use */
-#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */
-#define BJ_LogCtl 6 /* Buffer contains log descriptors */
-#define BJ_Reserved 7 /* Buffer is reserved for access by journal */
-#define BJ_Locked 8 /* Locked for I/O during commit */
-#define BJ_Types 9
+#define BJ_Metadata 1 /* Normal journaled metadata */
+#define BJ_Forget 2 /* Buffer superseded by this transaction */
+#define BJ_IO 3 /* Buffer is for temporary IO use */
+#define BJ_Shadow 4 /* Buffer contents being shadowed to the log */
+#define BJ_LogCtl 5 /* Buffer contains log descriptors */
+#define BJ_Reserved 6 /* Buffer is reserved for access by journal */
+#define BJ_Types 7

 extern int jbd_blocks_per_page(struct inode *inode);

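The struct jbd2_inode defined above is meant to be embedded in the filesystem's in-core inode and registered with jbd2 once per inode lifetime. A rough sketch of that life cycle follows, using a hypothetical example_inode_info container; only the two jbd2_journal_*_jbd_inode() calls are API from this patch.

/* Illustrative embedding -- the container struct and helpers are made up. */
struct example_inode_info {
	struct jbd2_inode jinode;	/* handed to jbd2 */
	struct inode vfs_inode;		/* the VFS inode proper */
};

static void example_inode_init(struct example_inode_info *ei)
{
	/* Bind the jbd2_inode to its VFS inode once, at inode set-up. */
	jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
}

static void example_inode_destroy(journal_t *journal,
				  struct example_inode_info *ei)
{
	/*
	 * Waits for a commit that is currently writing the inode's data
	 * (JI_COMMIT_RUNNING) and unlinks the jbd2_inode from the
	 * transaction's t_inode_list before the memory goes away.
	 */
	jbd2_journal_release_jbd_inode(journal, &ei->jinode);
}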
diff --git a/include/linux/mpage.h b/include/linux/mpage.h
index 068a0c9..5c42821 100644
--- a/include/linux/mpage.h
+++ b/include/linux/mpage.h
@@ -11,11 +11,21 @@
  */
 #ifdef CONFIG_BLOCK

+struct mpage_data {
+	struct bio *bio;
+	sector_t last_block_in_bio;
+	get_block_t *get_block;
+	unsigned use_writepage;
+};
+
 struct writeback_control;

+struct bio *mpage_bio_submit(int rw, struct bio *bio);
 int mpage_readpages(struct address_space *mapping, struct list_head *pages,
 				unsigned nr_pages, get_block_t get_block);
 int mpage_readpage(struct page *page, get_block_t get_block);
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+		      void *data);
 int mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block);
 int mpage_writepage(struct page *page, get_block_t *get_block,
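Exporting struct mpage_data, __mpage_writepage() and mpage_bio_submit() lets a filesystem drive the generic mpage block-mapping writeout from its own ->writepages() via write_cache_pages(), rather than going through mpage_writepages() as a whole. A sketch of that pattern is below; example_writepages() and example_get_block() are placeholder names, not part of the patch.

/* Sketch of a ->writepages() built on the newly exported helpers. */
static int example_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	int ret;
	struct mpage_data mpd = {
		.bio = NULL,
		.last_block_in_bio = 0,
		.get_block = example_get_block,	/* placeholder callback */
		.use_writepage = 1,
	};

	/* Let the generic walker call __mpage_writepage() per dirty page. */
	ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);

	/* Submit whatever partially built bio is still pending. */
	if (mpd.bio)
		mpage_bio_submit(WRITE, mpd.bio);
	return ret;
}

This mirrors what mpage_writepages() does internally and is the hook the delayed-allocation writeback changes in fs/ext4/inode.c earlier in this patchset build on.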
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 9007ccd..2083888 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
-s64 __percpu_counter_sum(struct percpu_counter *fbc);
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set);

 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
@@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)

 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
 {
-	s64 ret = __percpu_counter_sum(fbc);
+	s64 ret = __percpu_counter_sum(fbc, 0);
 	return ret < 0 ? 0 : ret;
 }

+static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
+{
+	return __percpu_counter_sum(fbc, 1);
+}
+
+
 static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
 {
-	return __percpu_counter_sum(fbc);
+	return __percpu_counter_sum(fbc, 0);
 }

 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
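percpu_counter_sum_and_set() is the same slow path as percpu_counter_sum(), except that while fbc->lock is held it also folds the per-cpu deltas back into fbc->count and zeroes them, so a subsequent cheap percpu_counter_read() returns the freshly computed value. A hedged sketch of the intended use, loosely modelled on the free-blocks check in the ext4 portion of this patchset (the helper name and threshold are illustrative):

/*
 * Sketch: take the slow-but-exact path only when the fast read is too
 * close to the limit to be trusted.  The threshold is illustrative.
 */
static int example_has_free_blocks(struct percpu_counter *free_blocks,
				   s64 nblocks)
{
	s64 free = percpu_counter_read(free_blocks);

	if (free - nblocks < FBC_BATCH * num_online_cpus())
		/* Fold the per-cpu deltas into ->count and re-read. */
		free = percpu_counter_sum_and_set(free_blocks);

	return free >= nblocks;
}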
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index f462439..0d8573e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,6 +63,7 @@ struct writeback_control {
 	unsigned for_writepages:1; /* This is a writepages() call */
 	unsigned range_cyclic:1; /* range_start is cyclic */
 	unsigned more_io:1; /* more io to be dispatched */
+	unsigned range_cont:1;
 };

 /*
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 1191744..4a8ba4b 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
 * Add up all the per-cpu counts, return the result. This is a more accurate
 * but much slower version of percpu_counter_read_positive()
 */
-s64 __percpu_counter_sum(struct percpu_counter *fbc)
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
 {
 	s64 ret;
 	int cpu;
@@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
 	for_each_online_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
+		if (set)
+			*pcount = 0;
 	}
+	if (set)
+		fbc->count = ret;
+
 	spin_unlock(&fbc->lock);
 	return ret;
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6a7d3..65d9d9e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_fdatawrite);

-static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 				loff_t end)
 {
 	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 }
+EXPORT_SYMBOL(filemap_fdatawrite_range);

 /**
  * filemap_flush - mostly a non-blocking flush
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 789b6ad..ded57d5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -956,6 +956,9 @@ retry:
 	}
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
+
+	if (wbc->range_cont)
+		wbc->range_start = index << PAGE_CACHE_SHIFT;
 	return ret;
 }
 EXPORT_SYMBOL(write_cache_pages);