Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Implement allocation size ranges and use for gang leaves
When forced to resort to ganging, ZFS currently allocates three child
blocks, each one third of the size of the original. This is true
regardless of whether larger allocations could be made, which would
allow us to have fewer gang leaves. This improves performance when
fragmentation is high enough to require ganging, but not so high that
all the free ranges are only just big enough to hold a third of the
recordsize. This is also useful for improving the behavior of a future
change to allow larger gang headers.

We add the ability for the allocation codepath to allocate a range of
sizes instead of a single fixed size. We then use this to pre-allocate
the DVAs for the gang children. If those allocations fail, we fall back
to the normal write path, which will likely re-gang.

Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
  • Loading branch information
pcd1193182 committed Apr 23, 2025
commit 5605513384147e1b2894d524511022001a6a948e
8 changes: 7 additions & 1 deletion include/sys/metaslab.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ extern "C" {

typedef struct metaslab_ops {
const char *msop_name;
uint64_t (*msop_alloc)(metaslab_t *, uint64_t);
uint64_t (*msop_alloc)(metaslab_t *, uint64_t, uint64_t, uint64_t *);
} metaslab_ops_t;


Expand Down Expand Up @@ -82,6 +82,9 @@ uint64_t metaslab_largest_allocatable(metaslab_t *);

int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int,
uint64_t, blkptr_t *, int, zio_alloc_list_t *, int, const void *);
int metaslab_alloc_range(spa_t *, metaslab_class_t *, uint64_t, uint64_t,
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *,
int, const void *, uint64_t *);
int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
Expand All @@ -95,6 +98,7 @@ void metaslab_check_free(spa_t *, const blkptr_t *);

void metaslab_stat_init(void);
void metaslab_stat_fini(void);
void metaslab_trace_move(zio_alloc_list_t *, zio_alloc_list_t *);
void metaslab_trace_init(zio_alloc_list_t *);
void metaslab_trace_fini(zio_alloc_list_t *);

Expand Down Expand Up @@ -127,6 +131,8 @@ uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_increment_all(spa_t *, blkptr_t *, int, int,
uint64_t, const void *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, int, int, uint64_t,
const void *);
void metaslab_recalculate_weight_and_sort(metaslab_t *);
Expand Down
2 changes: 2 additions & 0 deletions include/sys/vdev.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ extern void vdev_space_update(vdev_t *vd,

extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space);

extern uint64_t vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize,
uint64_t txg);
extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize,
uint64_t txg);
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
Expand Down
2 changes: 1 addition & 1 deletion include/sys/vdev_draid.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ extern int vdev_draid_generate_perms(const draid_map_t *, uint8_t **);
*/
extern boolean_t vdev_draid_readable(vdev_t *, uint64_t);
extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t);
extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t);
extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t, uint64_t);
extern void vdev_draid_map_alloc_empty(zio_t *, struct raidz_row *);
extern int vdev_draid_map_verify_empty(zio_t *, struct raidz_row *);
extern nvlist_t *vdev_draid_read_config_spare(vdev_t *);
Expand Down
4 changes: 3 additions & 1 deletion include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ typedef const struct vdev_ops {
vdev_fini_func_t *vdev_op_fini;
vdev_open_func_t *vdev_op_open;
vdev_close_func_t *vdev_op_close;
vdev_asize_func_t *vdev_op_asize;
vdev_asize_func_t *vdev_op_psize_to_asize;
vdev_asize_func_t *vdev_op_asize_to_psize;
vdev_min_asize_func_t *vdev_op_min_asize;
vdev_min_alloc_func_t *vdev_op_min_alloc;
vdev_io_start_func_t *vdev_op_io_start;
Expand Down Expand Up @@ -615,6 +616,7 @@ extern vdev_ops_t vdev_indirect_ops;
*/
extern void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs);
extern uint64_t vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg);
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg);
extern uint64_t vdev_default_min_asize(vdev_t *vd);
extern uint64_t vdev_get_min_asize(vdev_t *vd);
Expand Down
1 change: 1 addition & 0 deletions include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ typedef uint64_t zio_flag_t;
#define ZIO_FLAG_REEXECUTED (1ULL << 30)
#define ZIO_FLAG_DELEGATED (1ULL << 31)
#define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 32)
#define ZIO_FLAG_PREALLOCATED (1ULL << 33)

#define ZIO_ALLOCATOR_NONE (-1)
#define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE)
Expand Down
32 changes: 16 additions & 16 deletions lib/libzfs/libzfs_dataset.c
Original file line number Diff line number Diff line change
Expand Up @@ -5416,12 +5416,12 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
* +-------+-------+-------+-------+-------+
*
* Above, notice that the 4k block required one sector for parity and another
* for data. vdev_raidz_asize() will return 8k and as such the pool's allocated
* and free properties will be adjusted by 8k. The dataset will not be charged
* 8k. Rather, it will be charged a value that is scaled according to the
* overhead of the 128k block on the same vdev. This 8k allocation will be
* charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as
* calculated in the 128k block example above.
* for data. vdev_raidz_psize_to_asize() will return 8k and as such the pool's
* allocated and free properties will be adjusted by 8k. The dataset will not
* be charged 8k. Rather, it will be charged a value that is scaled according
* to the overhead of the 128k block on the same vdev. This 8k allocation will
* be charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is
* as calculated in the 128k block example above.
*
* Every raidz allocation is sized to be a multiple of nparity+1 sectors. That
* is, every raidz1 allocation will be a multiple of 2 sectors, raidz2
Expand Down Expand Up @@ -5468,7 +5468,7 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
* not necessarily equal to "blksize", due to RAIDZ deflation.
*/
static uint64_t
vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
vdev_raidz_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
uint64_t blksize)
{
uint64_t asize, ndata;
Expand All @@ -5488,7 +5488,7 @@ vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
* size.
*/
static uint64_t
vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
vdev_draid_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
uint64_t blksize)
{
ASSERT3U(ndisks, >, nparity);
Expand Down Expand Up @@ -5548,12 +5548,12 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
continue;

/* allocation size for the "typical" 128k block */
tsize = vdev_raidz_asize(ndisks, nparity, ashift,
SPA_OLD_MAXBLOCKSIZE);
tsize = vdev_raidz_psize_to_asize(ndisks, nparity,
ashift, SPA_OLD_MAXBLOCKSIZE);

/* allocation size for the blksize block */
asize = vdev_raidz_asize(ndisks, nparity, ashift,
blksize);
asize = vdev_raidz_psize_to_asize(ndisks, nparity,
ashift, blksize);
} else {
uint64_t ndata;

Expand All @@ -5562,12 +5562,12 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
continue;

/* allocation size for the "typical" 128k block */
tsize = vdev_draid_asize(ndata + nparity, nparity,
ashift, SPA_OLD_MAXBLOCKSIZE);
tsize = vdev_draid_psize_to_asize(ndata + nparity,
nparity, ashift, SPA_OLD_MAXBLOCKSIZE);

/* allocation size for the blksize block */
asize = vdev_draid_asize(ndata + nparity, nparity,
ashift, blksize);
asize = vdev_draid_psize_to_asize(ndata + nparity,
nparity, ashift, blksize);
}

/*
Expand Down
3 changes: 2 additions & 1 deletion module/os/freebsd/zfs/vdev_geom.c
Original file line number Diff line number Diff line change
Expand Up @@ -1276,7 +1276,8 @@ vdev_ops_t vdev_disk_ops = {
.vdev_op_fini = NULL,
.vdev_op_open = vdev_geom_open,
.vdev_op_close = vdev_geom_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_psize_to_asize = vdev_default_asize,
.vdev_op_asize_to_psize = vdev_default_psize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_geom_io_start,
Expand Down
3 changes: 2 additions & 1 deletion module/os/linux/zfs/vdev_disk.c
Original file line number Diff line number Diff line change
Expand Up @@ -1554,7 +1554,8 @@ vdev_ops_t vdev_disk_ops = {
.vdev_op_fini = NULL,
.vdev_op_open = vdev_disk_open,
.vdev_op_close = vdev_disk_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_asize_to_psize = vdev_default_psize,
.vdev_op_psize_to_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_disk_io_start,
Expand Down
Loading
Loading