Skip to content

Commit a7e8aae

Browse files
keithbuschvincentkfu
authored andcommitted
fio: add fdp support for io_uring_cmd nvme engine
Add support for NVMe TP4146 Flexible Data Placement, allowing placement identifiers in write commands. The user can enable this with the new "fdp=1" parameter for fio's io_uring_cmd ioengine. By default, the fio jobs will cycle through all the namespace's available placement identifiers for write commands. The user can limit which placement identifiers can be used with an additional parameter, "fdp_pli=<list,>", which can be used to separate write intensive jobs from less intensive ones. Setting up your namespace for FDP is outside the scope of 'fio', so this assumes the namespace is already properly configured for the mode. Link: https://lore.kernel.org/fio/CAKi7+wfX-eaUD5pky5cJ824uCzsQ4sPYMZdp3AuCUZOA1TQrYw@mail.gmail.com/T/#m056018eb07229bed00d4e589f9760b2a2aa009fc Based-on-a-patch-by: Ankit Kumar <[email protected]> Signed-off-by: Keith Busch <[email protected]> Reviewed-by: Damien Le Moal <[email protected]> [Vincent: fold in sfree fix from Ankit] Signed-off-by: Vincent Fu <[email protected]>
1 parent d08dbc0 commit a7e8aae

18 files changed

+366
-4
lines changed

Diff for: HOWTO.rst

+12
Original file line numberDiff line numberDiff line change
@@ -2423,6 +2423,18 @@ with the caveat that when used on the command line, they must come after the
24232423
For direct I/O, requests will only succeed if cache invalidation isn't required,
24242424
file blocks are fully allocated and the disk request could be issued immediately.
24252425

2426+
.. option:: fdp=bool : [io_uring_cmd]
2427+
2428+
Enable Flexible Data Placement mode for write commands.
2429+
2430+
.. option:: fdp_pli=str : [io_uring_cmd]
2431+
2432+
Select which Placement ID Index/Indices this job is allowed to use for
2433+
writes. By default, the job will cycle through all available Placement
2434+
IDs, so use this to isolate these identifiers to specific jobs. If you
2435+
want fio to use placement identifiers only at indices 0, 2 and 5, specify
2436+
``fdp_pli=0,2,5``.
2437+
24262438
.. option:: cpuload=int : [cpuio]
24272439

24282440
Attempt to use the specified percentage of CPU cycles. This is a mandatory

Diff for: Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ SOURCE := $(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
6262
gettime-thread.c helpers.c json.c idletime.c td_error.c \
6363
profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
6464
workqueue.c rate-submit.c optgroup.c helper_thread.c \
65-
steadystate.c zone-dist.c zbd.c dedupe.c
65+
steadystate.c zone-dist.c zbd.c dedupe.c fdp.c
6666

6767
ifdef CONFIG_LIBHDFS
6868
HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE)

Diff for: cconv.c

+10
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,11 @@ int convert_thread_options_to_cpu(struct thread_options *o,
349349

350350
for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++)
351351
o->merge_blktrace_iters[i].u.f = fio_uint64_to_double(le64_to_cpu(top->merge_blktrace_iters[i].u.i));
352+
353+
o->fdp = le32_to_cpu(top->fdp);
354+
o->fdp_nrpli = le32_to_cpu(top->fdp_nrpli);
355+
for (i = 0; i < o->fdp_nrpli; i++)
356+
o->fdp_plis[i] = le32_to_cpu(top->fdp_plis[i]);
352357
#if 0
353358
uint8_t cpumask[FIO_TOP_STR_MAX];
354359
uint8_t verify_cpumask[FIO_TOP_STR_MAX];
@@ -638,6 +643,11 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
638643

639644
for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++)
640645
top->merge_blktrace_iters[i].u.i = __cpu_to_le64(fio_double_to_uint64(o->merge_blktrace_iters[i].u.f));
646+
647+
top->fdp = cpu_to_le32(o->fdp);
648+
top->fdp_nrpli = cpu_to_le32(o->fdp_nrpli);
649+
for (i = 0; i < o->fdp_nrpli; i++)
650+
top->fdp_plis[i] = cpu_to_le32(o->fdp_plis[i]);
641651
#if 0
642652
uint8_t cpumask[FIO_TOP_STR_MAX];
643653
uint8_t verify_cpumask[FIO_TOP_STR_MAX];

Diff for: engines/io_uring.c

+24
Original file line numberDiff line numberDiff line change
@@ -1262,6 +1262,29 @@ static int fio_ioring_cmd_get_max_open_zones(struct thread_data *td,
12621262
return fio_nvme_get_max_open_zones(td, f, max_open_zones);
12631263
}
12641264

1265+
static int fio_ioring_cmd_fetch_ruhs(struct thread_data *td, struct fio_file *f,
1266+
struct fio_ruhs_info *fruhs_info)
1267+
{
1268+
struct nvme_fdp_ruh_status *ruhs;
1269+
int bytes, ret, i;
1270+
1271+
bytes = sizeof(*ruhs) + 128 * sizeof(struct nvme_fdp_ruh_status_desc);
1272+
ruhs = scalloc(1, bytes);
1273+
if (!ruhs)
1274+
return -ENOMEM;
1275+
1276+
ret = fio_nvme_iomgmt_ruhs(td, f, ruhs, bytes);
1277+
if (ret)
1278+
goto free;
1279+
1280+
fruhs_info->nr_ruhs = le16_to_cpu(ruhs->nruhsd);
1281+
for (i = 0; i < fruhs_info->nr_ruhs; i++)
1282+
fruhs_info->plis[i] = le16_to_cpu(ruhs->ruhss[i].pid);
1283+
free:
1284+
sfree(ruhs);
1285+
return ret;
1286+
}
1287+
12651288
static struct ioengine_ops ioengine_uring = {
12661289
.name = "io_uring",
12671290
.version = FIO_IOOPS_VERSION,
@@ -1307,6 +1330,7 @@ static struct ioengine_ops ioengine_uring_cmd = {
13071330
.get_max_open_zones = fio_ioring_cmd_get_max_open_zones,
13081331
.options = options,
13091332
.option_struct_size = sizeof(struct ioring_options),
1333+
.fdp_fetch_ruhs = fio_ioring_cmd_fetch_ruhs,
13101334
};
13111335

13121336
static void fio_init fio_ioring_register(void)

Diff for: engines/nvme.c

+39-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
2828
cmd->cdw10 = slba & 0xffffffff;
2929
cmd->cdw11 = slba >> 32;
3030
/* cdw12 represent number of lba's for read/write */
31-
cmd->cdw12 = nlb;
31+
cmd->cdw12 = nlb | (io_u->dtype << 20);
32+
cmd->cdw13 = io_u->dspec << 16;
3233
if (iov) {
3334
iov->iov_base = io_u->xfer_buf;
3435
iov->iov_len = io_u->xfer_buflen;
@@ -345,3 +346,40 @@ int fio_nvme_get_max_open_zones(struct thread_data *td, struct fio_file *f,
345346
close(fd);
346347
return ret;
347348
}
349+
350+
/*
 * Issue an NVMe I/O management receive passthrough command on fd for
 * namespace nsid, asking the device to fill data (data_len bytes).
 *
 * Returns whatever ioctl(NVME_IOCTL_IO_CMD) returns.
 */
static inline int nvme_fdp_reclaim_unit_handle_status(int fd, __u32 nsid,
						      __u32 data_len, void *data)
{
	struct nvme_passthru_cmd cmd = { 0 };

	cmd.opcode = nvme_cmd_io_mgmt_recv;
	cmd.nsid = nsid;
	cmd.addr = (__u64)(uintptr_t)data;
	cmd.data_len = data_len;
	/* NOTE(review): presumably selects the "RUH status" operation - confirm against TP4146 */
	cmd.cdw10 = 1;
	/* Transfer length expressed in dwords, zero-based. */
	cmd.cdw11 = (data_len >> 2) - 1;

	return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
}
364+
365+
/*
 * Open f->file_name and fetch the reclaim unit handle status of the
 * namespace recorded in the file's engine data into ruhs (bytes long).
 *
 * Returns 0 on success, -errno if the device node cannot be opened, or
 * -ENOTSUP if the passthrough command fails for any reason. errno is left
 * set to the positive form of the returned code (0 on success).
 *
 * The result is kept in a local variable rather than read back from errno
 * after close(), because close() may itself modify errno and would
 * otherwise change the value returned to the caller.
 */
int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f,
			 struct nvme_fdp_ruh_status *ruhs, __u32 bytes)
{
	struct nvme_data *data = FILE_ENG_DATA(f);
	int fd, ret, err = 0;

	fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
	if (fd < 0)
		return -errno;

	ret = nvme_fdp_reclaim_unit_handle_status(fd, data->nsid, bytes, ruhs);
	if (ret) {
		log_err("%s: nvme_fdp_reclaim_unit_handle_status failed, err=%d\n",
			f->file_name, ret);
		err = ENOTSUP;
	}

	close(fd);

	/* Publish the result only after close() so it cannot be clobbered. */
	errno = err;
	return -err;
}

Diff for: engines/nvme.h

+18
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ enum nvme_admin_opcode {
6767
enum nvme_io_opcode {
6868
nvme_cmd_write = 0x01,
6969
nvme_cmd_read = 0x02,
70+
nvme_cmd_io_mgmt_recv = 0x12,
7071
nvme_zns_cmd_mgmt_send = 0x79,
7172
nvme_zns_cmd_mgmt_recv = 0x7a,
7273
};
@@ -192,6 +193,23 @@ struct nvme_zone_report {
192193
struct nvme_zns_desc entries[];
193194
};
194195

196+
/*
 * One reclaim unit handle status descriptor as returned by the device in
 * the trailing array of struct nvme_fdp_ruh_status. The declared fields add
 * up to 32 bytes. The io_uring engine converts pid with le16_to_cpu(), so
 * multi-byte fields are presumably little-endian on the wire - confirm
 * against TP4146.
 */
struct nvme_fdp_ruh_status_desc {
197+
__u16 pid; /* placement identifier; copied into fio_ruhs_info.plis[] */
198+
__u16 ruhid; /* presumably the reclaim unit handle identifier - TODO confirm */
199+
__u32 earutr; /* not used by fio in this change; meaning per TP4146 - TODO confirm */
200+
__u64 ruamw; /* not used by fio in this change; meaning per TP4146 - TODO confirm */
201+
__u8 rsvd16[16]; /* reserved/padding bytes */
202+
};
203+
204+
/*
 * Response buffer layout for the reclaim unit handle status query: a
 * 16-byte header followed by a variable number of descriptors.
 */
struct nvme_fdp_ruh_status {
205+
__u8 rsvd0[14]; /* reserved/padding bytes */
206+
__le16 nruhsd; /* little-endian count used as the loop bound over ruhss[] */
207+
struct nvme_fdp_ruh_status_desc ruhss[]; /* descriptors; capacity is set by the allocator */
208+
};
209+
210+
/*
 * Fetch the reclaim unit handle status for file f into ruhs, a buffer of
 * 'bytes' bytes supplied by the caller. Returns 0 on success or a negative
 * errno-style value on failure.
 */
int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f,
211+
struct nvme_fdp_ruh_status *ruhs, __u32 bytes);
212+
195213
int fio_nvme_get_info(struct fio_file *f, __u32 *nsid, __u32 *lba_sz,
196214
__u64 *nlba);
197215

Diff for: examples/uring-cmd-fdp.fio

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# io_uring_cmd I/O engine for nvme-ns generic character device with FDP enabled
2+
# This assumes the namespace is already configured with FDP support and has at
3+
# least 8 available reclaim units.
4+
#
5+
# Each job targets different ranges of LBAs with different placement
6+
# identifiers, and has different write intensity.
7+
8+
[global]
9+
filename=/dev/ng0n1
10+
ioengine=io_uring_cmd
11+
cmd_type=nvme
12+
iodepth=32
13+
bs=4K
14+
fdp=1
15+
time_based=1
16+
runtime=1000
17+
18+
[write-heavy]
19+
rw=randrw
20+
rwmixwrite=90
21+
fdp_pli=0,1,2,3
22+
offset=0%
23+
size=30%
24+
25+
[write-mid]
26+
rw=randrw
27+
rwmixwrite=30
28+
fdp_pli=4,5
29+
offset=30%
30+
size=30%
31+
32+
[write-light]
33+
rw=randrw
34+
rwmixwrite=10
35+
fdp_pli=6
36+
offset=60%
37+
size=30%

Diff for: fdp.c

+119
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
/*
2+
* Note: This is similar to a very basic setup
3+
* of ZBD devices
4+
*
5+
* Specify fdp=1 (With char devices /dev/ng0n1)
6+
*/
7+
8+
#include <errno.h>
9+
#include <string.h>
10+
#include <stdlib.h>
11+
#include <unistd.h>
12+
#include "file.h"
13+
#include "fio.h"
14+
15+
#include "pshared.h"
16+
#include "fdp.h"
17+
18+
static int fdp_ruh_info(struct thread_data *td, struct fio_file *f,
19+
struct fio_ruhs_info *ruhs)
20+
{
21+
int ret = -EINVAL;
22+
23+
if (td->io_ops && td->io_ops->fdp_fetch_ruhs) {
24+
ret = td->io_ops->fdp_fetch_ruhs(td, f, ruhs);
25+
if (ret < 0) {
26+
td_verror(td, errno, "fdp fetch ruhs failed");
27+
log_err("%s: fdp fetch ruhs failed (%d)\n",
28+
f->file_name, errno);
29+
}
30+
} else
31+
log_err("%s: engine (%s) lacks fetch ruhs\n",
32+
f->file_name, td->io_ops->name);
33+
34+
return ret;
35+
}
36+
37+
static int init_ruh_info(struct thread_data *td, struct fio_file *f)
38+
{
39+
struct fio_ruhs_info *ruhs, *tmp;
40+
int i, ret;
41+
42+
ruhs = scalloc(1, sizeof(*ruhs) + 128 * sizeof(*ruhs->plis));
43+
if (!ruhs)
44+
return -ENOMEM;
45+
46+
ret = fdp_ruh_info(td, f, ruhs);
47+
if (ret) {
48+
log_info("fio: ruh info failed for %s (%d)\n",
49+
f->file_name, -ret);
50+
goto out;
51+
}
52+
53+
if (ruhs->nr_ruhs > 128)
54+
ruhs->nr_ruhs = 128;
55+
56+
if (td->o.fdp_nrpli == 0) {
57+
f->ruhs_info = ruhs;
58+
return 0;
59+
}
60+
61+
for (i = 0; i < td->o.fdp_nrpli; i++) {
62+
if (td->o.fdp_plis[i] > ruhs->nr_ruhs) {
63+
ret = -EINVAL;
64+
goto out;
65+
}
66+
}
67+
68+
tmp = scalloc(1, sizeof(*tmp) + ruhs->nr_ruhs * sizeof(*tmp->plis));
69+
if (!tmp) {
70+
ret = -ENOMEM;
71+
goto out;
72+
}
73+
74+
tmp->nr_ruhs = td->o.fdp_nrpli;
75+
for (i = 0; i < td->o.fdp_nrpli; i++)
76+
tmp->plis[i] = ruhs->plis[td->o.fdp_plis[i]];
77+
f->ruhs_info = tmp;
78+
out:
79+
sfree(ruhs);
80+
return ret;
81+
}
82+
83+
int fdp_init(struct thread_data *td)
84+
{
85+
struct fio_file *f;
86+
int i, ret = 0;
87+
88+
for_each_file(td, f, i) {
89+
ret = init_ruh_info(td, f);
90+
if (ret)
91+
break;
92+
}
93+
return ret;
94+
}
95+
96+
void fdp_free_ruhs_info(struct fio_file *f)
97+
{
98+
if (!f->ruhs_info)
99+
return;
100+
sfree(f->ruhs_info);
101+
f->ruhs_info = NULL;
102+
}
103+
104+
void fdp_fill_dspec_data(struct thread_data *td, struct io_u *io_u)
105+
{
106+
struct fio_file *f = io_u->file;
107+
struct fio_ruhs_info *ruhs = f->ruhs_info;
108+
int dspec;
109+
110+
if (!ruhs || io_u->ddir != DDIR_WRITE) {
111+
io_u->dtype = 0;
112+
io_u->dspec = 0;
113+
return;
114+
}
115+
116+
dspec = ruhs->plis[ruhs->pli_loc++ % ruhs->nr_ruhs];
117+
io_u->dtype = 2;
118+
io_u->dspec = dspec;
119+
}

Diff for: fdp.h

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#ifndef FIO_FDP_H
2+
#define FIO_FDP_H
3+
4+
#include "io_u.h"
5+
6+
/*
 * Per-file list of placement identifiers that writes may be tagged with.
 * Allocated with room for the trailing plis[] array by its creator.
 */
struct fio_ruhs_info {
7+
uint32_t nr_ruhs; /* number of valid entries in plis[] */
8+
uint32_t pli_loc; /* running counter; taken modulo nr_ruhs to pick the next entry */
9+
uint16_t plis[]; /* placement identifiers, in selection order */
10+
};
11+
12+
int fdp_init(struct thread_data *td);
13+
void fdp_free_ruhs_info(struct fio_file *f);
14+
void fdp_fill_dspec_data(struct thread_data *td, struct io_u *io_u);
15+
16+
#endif /* FIO_FDP_H */

Diff for: file.h

+3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
/* Forward declarations */
1414
struct zoned_block_device_info;
15+
struct fio_ruhs_info;	/* matches the type of fio_file.ruhs_info */
1516

1617
/*
1718
* The type of object we are working on
@@ -101,6 +102,8 @@ struct fio_file {
101102
uint64_t file_offset;
102103
uint64_t io_size;
103104

105+
struct fio_ruhs_info *ruhs_info;
106+
104107
/*
105108
* Zoned block device information. See also zonemode=zbd.
106109
*/

Diff for: filesetup.c

+9
Original file line numberDiff line numberDiff line change
@@ -1407,6 +1407,12 @@ int setup_files(struct thread_data *td)
14071407

14081408
td_restore_runstate(td, old_state);
14091409

1410+
if (td->o.fdp) {
1411+
err = fdp_init(td);
1412+
if (err)
1413+
goto err_out;
1414+
}
1415+
14101416
return 0;
14111417

14121418
err_offset:
@@ -1584,6 +1590,8 @@ void fio_file_free(struct fio_file *f)
15841590
{
15851591
if (fio_file_axmap(f))
15861592
axmap_free(f->io_axmap);
1593+
if (f->ruhs_info)
1594+
sfree(f->ruhs_info);
15871595
if (!fio_file_smalloc(f)) {
15881596
free(f->file_name);
15891597
free(f);
@@ -1617,6 +1625,7 @@ void close_and_free_files(struct thread_data *td)
16171625
}
16181626

16191627
zbd_close_file(f);
1628+
fdp_free_ruhs_info(f);
16201629
fio_file_free(f);
16211630
}
16221631

0 commit comments

Comments
 (0)