NAK: [PATCH 3/3 Xenial SRU] ext4: more efficient SEEK_DATA implementation

Kamal Mostafa kamal at canonical.com
Tue Jul 19 18:15:22 UTC 2016


Fails to build in xenial:

  CC      fs/ext4/file.o
fs/ext4/file.c: In function ‘ext4_seek_data’:
fs/ext4/file.c:597:4: error: implicit declaration of function ‘inode_unlock’ [-Werror=implicit-function-declaration]
    inode_unlock(inode);
    ^

(That doesn't get introduced until v4.5 by
"5955102 wrappers for ->i_mutex access")

 -Kamal

On Mon, Jul 18, 2016 at 08:38:29AM -0600, Tim Gardner wrote:
> From: Jan Kara <jack at suse.cz>
> 
> BugLink: http://bugs.launchpad.net/bugs/1602524
> 
> Using SEEK_DATA in a huge sparse file can easily lead to sotflockups as
> ext4_seek_data() iterates hole block-by-block. Fix the problem by using
> returned hole size from ext4_map_blocks() and thus skip the hole in one
> go.
> 
> Update also SEEK_HOLE implementation to follow the same pattern as
> SEEK_DATA to make future maintenance easier.
> 
> Furthermore we add cond_resched() to both ext4_seek_data() and
> ext4_seek_hole() to avoid softlockups in case evil user creates huge
> fragmented file and we have to go through lots of extents.
> 
> Signed-off-by: Jan Kara <jack at suse.cz>
> Signed-off-by: Theodore Ts'o <tytso at mit.edu>
> (back ported from commit 2d90c160e5f1d784e180f1e1458d56eee4d7f4f4)
> Signed-off-by: Tim Gardner <tim.gardner at canonical.com>
> 
> Conflicts:
> 	fs/ext4/ext4.h
> ---
>  fs/ext4/ext4.h  |  5 +++
>  fs/ext4/file.c  | 97 +++++++++++++++++++++------------------------------------
>  fs/ext4/inode.c | 67 +++++++++++++++++++++++++++++++++++++++
>  3 files changed, 108 insertions(+), 61 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index b7e921d..6148573 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2520,6 +2520,11 @@ extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
>  extern qsize_t *ext4_get_reserved_space(struct inode *inode);
>  extern void ext4_da_update_reserve_space(struct inode *inode,
>  					int used, int quota_claim);
> +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
> +			      ext4_fsblk_t pblk, ext4_lblk_t len);
> +extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
> +				unsigned int map_len,
> +				struct extent_status *result);
>  
>  /* indirect.c */
>  extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 0d24ebc..2753bed 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -435,7 +435,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
>   */
>  static int ext4_find_unwritten_pgoff(struct inode *inode,
>  				     int whence,
> -				     struct ext4_map_blocks *map,
> +				     ext4_lblk_t end_blk,
>  				     loff_t *offset)
>  {
>  	struct pagevec pvec;
> @@ -450,7 +450,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
>  	blkbits = inode->i_sb->s_blocksize_bits;
>  	startoff = *offset;
>  	lastoff = startoff;
> -	endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
> +	endoff = (loff_t)end_blk << blkbits;
>  
>  	index = startoff >> PAGE_CACHE_SHIFT;
>  	end = endoff >> PAGE_CACHE_SHIFT;
> @@ -568,12 +568,11 @@ out:
>  static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
>  {
>  	struct inode *inode = file->f_mapping->host;
> -	struct ext4_map_blocks map;
>  	struct extent_status es;
>  	ext4_lblk_t start, last, end;
>  	loff_t dataoff, isize;
>  	int blkbits;
> -	int ret = 0;
> +	int ret;
>  
>  	mutex_lock(&inode->i_mutex);
>  
> @@ -590,41 +589,32 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
>  	dataoff = offset;
>  
>  	do {
> -		map.m_lblk = last;
> -		map.m_len = end - last + 1;
> -		ret = ext4_map_blocks(NULL, inode, &map, 0);
> -		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
> -			if (last != start)
> -				dataoff = (loff_t)last << blkbits;
> -			break;
> +		ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
> +		if (ret <= 0) {
> +			/* No extent found -> no data */
> +			if (ret == 0)
> +				ret = -ENXIO;
> +			inode_unlock(inode);
> +			return ret;
>  		}
>  
> -		/*
> -		 * If there is a delay extent at this offset,
> -		 * it will be as a data.
> -		 */
> -		ext4_es_find_delayed_extent_range(inode, last, last, &es);
> -		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
> -			if (last != start)
> -				dataoff = (loff_t)last << blkbits;
> +		last = es.es_lblk;
> +		if (last != start)
> +			dataoff = (loff_t)last << blkbits;
> +		if (!ext4_es_is_unwritten(&es))
>  			break;
> -		}
>  
>  		/*
>  		 * If there is a unwritten extent at this offset,
>  		 * it will be as a data or a hole according to page
>  		 * cache that has data or not.
>  		 */
> -		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> -			int unwritten;
> -			unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
> -							      &map, &dataoff);
> -			if (unwritten)
> -				break;
> -		}
> -
> -		last++;
> +		if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
> +					      es.es_lblk + es.es_len, &dataoff))
> +			break;
> +		last += es.es_len;
>  		dataoff = (loff_t)last << blkbits;
> +		cond_resched();
>  	} while (last <= end);
>  
>  	mutex_unlock(&inode->i_mutex);
> @@ -641,12 +631,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
>  static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
>  {
>  	struct inode *inode = file->f_mapping->host;
> -	struct ext4_map_blocks map;
>  	struct extent_status es;
>  	ext4_lblk_t start, last, end;
>  	loff_t holeoff, isize;
>  	int blkbits;
> -	int ret = 0;
> +	int ret;
>  
>  	mutex_lock(&inode->i_mutex);
>  
> @@ -663,44 +652,30 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
>  	holeoff = offset;
>  
>  	do {
> -		map.m_lblk = last;
> -		map.m_len = end - last + 1;
> -		ret = ext4_map_blocks(NULL, inode, &map, 0);
> -		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
> -			last += ret;
> -			holeoff = (loff_t)last << blkbits;
> -			continue;
> +		ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
> +		if (ret < 0) {
> +			inode_unlock(inode);
> +			return ret;
>  		}
> -
> -		/*
> -		 * If there is a delay extent at this offset,
> -		 * we will skip this extent.
> -		 */
> -		ext4_es_find_delayed_extent_range(inode, last, last, &es);
> -		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
> -			last = es.es_lblk + es.es_len;
> -			holeoff = (loff_t)last << blkbits;
> -			continue;
> +		/* Found a hole? */
> +		if (ret == 0 || es.es_lblk > last) {
> +			if (last != start)
> +				holeoff = (loff_t)last << blkbits;
> +			break;
>  		}
> -
>  		/*
>  		 * If there is a unwritten extent at this offset,
>  		 * it will be as a data or a hole according to page
>  		 * cache that has data or not.
>  		 */
> -		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> -			int unwritten;
> -			unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
> -							      &map, &holeoff);
> -			if (!unwritten) {
> -				last += ret;
> -				holeoff = (loff_t)last << blkbits;
> -				continue;
> -			}
> -		}
> +		if (ext4_es_is_unwritten(&es) &&
> +		    ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
> +					      last + es.es_len, &holeoff))
> +			break;
>  
> -		/* find a hole */
> -		break;
> +		last += es.es_len;
> +		holeoff = (loff_t)last << blkbits;
> +		cond_resched();
>  	} while (last <= end);
>  
>  	mutex_unlock(&inode->i_mutex);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index f2736bc..42658bc 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5434,3 +5434,70 @@ int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>  
>  	return err;
>  }
> +
> +/*
> + * Find the first extent at or after @lblk in an inode that is not a hole.
> + * Search for @map_len blocks at most. The extent is returned in @result.
> + *
> + * The function returns 1 if we found an extent. The function returns 0 in
> + * case there is no extent at or after @lblk and in that case also sets
> + * @result->es_len to 0. In case of error, the error code is returned.
> + */
> +int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
> +			 unsigned int map_len, struct extent_status *result)
> +{
> +	struct ext4_map_blocks map;
> +	struct extent_status es = {};
> +	int ret;
> +
> +	map.m_lblk = lblk;
> +	map.m_len = map_len;
> +
> +	/*
> +	 * For non-extent based files this loop may iterate several times since
> +	 * we do not determine full hole size.
> +	 */
> +	while (map.m_len > 0) {
> +		ret = ext4_map_blocks(NULL, inode, &map, 0);
> +		if (ret < 0)
> +			return ret;
> +		/* There's extent covering m_lblk? Just return it. */
> +		if (ret > 0) {
> +			int status;
> +
> +			ext4_es_store_pblock(result, map.m_pblk);
> +			result->es_lblk = map.m_lblk;
> +			result->es_len = map.m_len;
> +			if (map.m_flags & EXT4_MAP_UNWRITTEN)
> +				status = EXTENT_STATUS_UNWRITTEN;
> +			else
> +				status = EXTENT_STATUS_WRITTEN;
> +			ext4_es_store_status(result, status);
> +			return 1;
> +		}
> +		ext4_es_find_delayed_extent_range(inode, map.m_lblk,
> +						  map.m_lblk + map.m_len - 1,
> +						  &es);
> +		/* Is delalloc data before next block in extent tree? */
> +		if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
> +			ext4_lblk_t offset = 0;
> +
> +			if (es.es_lblk < lblk)
> +				offset = lblk - es.es_lblk;
> +			result->es_lblk = es.es_lblk + offset;
> +			ext4_es_store_pblock(result,
> +					     ext4_es_pblock(&es) + offset);
> +			result->es_len = es.es_len - offset;
> +			ext4_es_store_status(result, ext4_es_status(&es));
> +
> +			return 1;
> +		}
> +		/* There's a hole at m_lblk, advance us after it */
> +		map.m_lblk += map.m_len;
> +		map_len -= map.m_len;
> +		map.m_len = map_len;
> +		cond_resched();
> +	}
> +	result->es_len = 0;
> +	return 0;
> +}
> -- 
> 1.9.1
> 
> 
> -- 
> kernel-team mailing list
> kernel-team at lists.ubuntu.com
> https://lists.ubuntu.com/mailman/listinfo/kernel-team




More information about the kernel-team mailing list