Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multi-file support for XRP #1

Open
wants to merge 7 commits into
base: xrp
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions arch/x86/entry/syscalls/syscall_32.tbl
Original file line number Diff line number Diff line change
Expand Up @@ -447,3 +447,6 @@
440 i386 process_madvise sys_process_madvise
441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
442 i386 mount_setattr sys_mount_setattr
444 i386 print_xrp_stats sys_print_xrp_stats
445 i386 read_xrp sys_read_xrp
446 i386 test_xrp sys_test_xrp
2 changes: 1 addition & 1 deletion arch/x86/entry/syscalls/syscall_64.tbl
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@
442 common mount_setattr sys_mount_setattr

444 common print_xrp_stats sys_print_xrp_stats
445 common read_xrp sys_read_xrp
445 common read_xrp sys_read_xrp
446 common test_xrp sys_test_xrp

#
Expand Down
52 changes: 48 additions & 4 deletions drivers/nvme/host/pci.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
#include <linux/pci-p2pdma.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>

#include "trace.h"
#include "nvme.h"
Expand Down Expand Up @@ -1066,42 +1069,67 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
struct bpf_xrp_kern ebpf_context;
u32 ebpf_return;
loff_t file_offset, data_len;
struct files_struct *files_struct;
struct file *file;
struct inode *inode;
s32 fd;
struct fdtable *fdt;
u64 disk_offset;
ktime_t ebpf_start;
ktime_t resubmit_start = ktime_get();

struct xrp_mapping mapping;
ktime_t extent_lookup_start;

fd = req->bio->xrp_cur_fd;
files_struct = req->bio->xrp_fdtable;
fdt = files_fdtable(files_struct);

if (!fd_is_open(fd, fdt)) {
printk("nvme_handle_cqe: bad file descriptor given %d, dump context\n", fd);
ebpf_dump_page((uint8_t *) ebpf_context.scratch, 4096);
if (!nvme_try_complete_req(req, cqe->status, cqe->result))
nvme_pci_complete_rq(req);
return;
}

file = get_file(files_lookup_fd_rcu(files_struct, fd));
inode = file->f_inode;

/* verify version number */
if (req->bio->xrp_count > 1
&& req->bio->xrp_inode->i_op == &ext4_file_inode_operations) {
&& inode->i_op == &ext4_file_inode_operations) {
file_offset = req->bio->xrp_file_offset;
data_len = 512;

extent_lookup_start = ktime_get();
xrp_retrieve_mapping(req->bio->xrp_inode, file_offset, data_len, &mapping);
xrp_retrieve_mapping(inode, file_offset, data_len, &mapping);
atomic_long_add(ktime_sub(ktime_get(), extent_lookup_start), &xrp_extent_lookup_time);
atomic_long_inc(&xrp_extent_lookup_count);
if (!mapping.exist || mapping.len < data_len || mapping.address & 0x1ff) {
printk("nvme_handle_cqe: failed to retrieve address mapping during verification with logical address 0x%llx, dump context\n", file_offset);
ebpf_dump_page((uint8_t *) ebpf_context.scratch, 4096);
if (!nvme_try_complete_req(req, cqe->status, cqe->result))
nvme_pci_complete_rq(req);
fput(file);
return;
} else if (mapping.version != req->bio->xrp_extent_version) {
printk("nvme_handle_cqe: version mismatch with logical address 0x%llx (expected %lld, got %lld), dump context\n",
file_offset, req->bio->xrp_extent_version, mapping.version);
ebpf_dump_page((uint8_t *) ebpf_context.scratch, 4096);
if (!nvme_try_complete_req(req, cqe->status, cqe->result))
nvme_pci_complete_rq(req);
fput(file);
return;
}
}
fput(file);

memset(&ebpf_context, 0, sizeof(struct bpf_xrp_kern));
ebpf_context.data = page_address(bio_page(req->bio));
ebpf_context.scratch = page_address(req->bio->xrp_scratch_page);
ebpf_context.cur_addr = req->bio->xrp_file_offset;
ebpf_context.cur_fd = req->bio->xrp_cur_fd;
ebpf_start = ktime_get();
ebpf_prog = req->bio->xrp_bpf_prog;
ebpf_return = BPF_PROG_RUN(ebpf_prog, &ebpf_context);
Expand Down Expand Up @@ -1132,20 +1160,35 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
return;
}
/* address mapping */
fd = ebpf_context.fd_arr[0];
file_offset = ebpf_context.next_addr[0];
data_len = 512;
// FIXME: support variable data_len and more than one next_addr
req->bio->xrp_file_offset = file_offset;
if (req->bio->xrp_inode->i_op == &ext4_file_inode_operations) {
req->bio->xrp_cur_fd = fd;

if (!fd_is_open(fd, fdt)) {
printk("nvme_handle_cqe: bad file descriptor given %d, dump context\n", fd);
ebpf_dump_page((uint8_t *) ebpf_context.scratch, 4096);
if (!nvme_try_complete_req(req, cqe->status, cqe->result))
nvme_pci_complete_rq(req);
return;
}

file = get_file(files_lookup_fd_rcu(files_struct, fd));
inode = file->f_inode;

if (inode->i_op == &ext4_file_inode_operations) {
extent_lookup_start = ktime_get();
xrp_retrieve_mapping(req->bio->xrp_inode, file_offset, data_len, &mapping);
xrp_retrieve_mapping(inode, file_offset, data_len, &mapping);
atomic_long_add(ktime_sub(ktime_get(), extent_lookup_start), &xrp_extent_lookup_time);
atomic_long_inc(&xrp_extent_lookup_count);
if (!mapping.exist || mapping.len < data_len || mapping.address & 0x1ff) {
printk("nvme_handle_cqe: failed to retrieve address mapping with logical address 0x%llx, dump context\n", file_offset);
ebpf_dump_page((uint8_t *) ebpf_context.scratch, 4096);
if (!nvme_try_complete_req(req, cqe->status, cqe->result))
nvme_pci_complete_rq(req);
fput(file);
return;
} else {
req->bio->xrp_extent_version = mapping.version;
Expand All @@ -1155,6 +1198,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
/* no address translation, use direct map */
disk_offset = file_offset;
}
fput(file);
nvme_req(req)->cmd = req->xrp_command;
req->bio->xrp_count += 1;
req->bio->bi_iter.bi_sector = (disk_offset >> 9) + req->bio->xrp_partition_start_sector;
Expand Down
8 changes: 6 additions & 2 deletions fs/block_dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -268,9 +268,11 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
ret = bio.bi_iter.bi_size;

bio.xrp_enabled = iocb->xrp_enabled;
bio.xrp_inode = file->f_inode;
bio.xrp_partition_start_sector = 0;
bio.xrp_count = 1;
bio.xrp_fdtable = current->files; // TODO: investigate locking required
bio.xrp_cur_fd = iocb->xrp_cur_fd;
bio.xrp_file_offset = iocb->xrp_file_offset;
if (bio.xrp_enabled) {
if (get_user_pages_fast(iocb->xrp_scratch_buf, 1, FOLL_WRITE, &bio.xrp_scratch_page) != 1) {
printk("__blkdev_direct_IO_simple: failed to get scratch page\n");
Expand Down Expand Up @@ -458,9 +460,11 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}

bio->xrp_enabled = iocb->xrp_enabled;
bio->xrp_inode = file->f_inode;
bio->xrp_partition_start_sector = 0;
bio->xrp_count = 1;
bio->xrp_fdtable = current->files;
bio->xrp_cur_fd = iocb->xrp_cur_fd;
bio->xrp_file_offset = iocb->xrp_file_offset;
if (bio->xrp_enabled) {
if (get_user_pages_fast(iocb->xrp_scratch_buf, 1, FOLL_WRITE, &bio->xrp_scratch_page) != 1) {
printk("__blkdev_direct_IO: failed to get scratch page\n");
Expand Down
2 changes: 2 additions & 0 deletions fs/io_uring.c
Original file line number Diff line number Diff line change
Expand Up @@ -2743,6 +2743,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
kiocb->xrp_enabled = true;
kiocb->xrp_scratch_buf = (char __user *) sqe->scratch;
kiocb->xrp_bpf_fd = (unsigned int) sqe->bpf_fd;
kiocb->xrp_cur_fd = (unsigned int) sqe->fd;
kiocb->xrp_file_offset = sqe->off;
}

req->rw.addr = READ_ONCE(sqe->addr);
Expand Down
15 changes: 15 additions & 0 deletions fs/ioctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -949,6 +949,21 @@ static bool xrp_is_valid_access(int off, int size, enum bpf_access_type type, co
info->reg_type = PTR_TO_MEM;
info->mem_size = PAGE_SIZE;
break;
case bpf_ctx_range(struct bpf_xrp, fd_arr):
size_of_field = sizeof_field(struct bpf_xrp, fd_arr);
if (!bpf_ctx_narrow_access_ok(off, size, size_of_field))
return false;
break;
case bpf_ctx_range(struct bpf_xrp, cur_addr):
size_of_field = sizeof_field(struct bpf_xrp, cur_addr);
if (type != BPF_READ || !bpf_ctx_narrow_access_ok(off, size, size_of_field))
return false;
break;
case bpf_ctx_range(struct bpf_xrp, cur_fd):
size_of_field = sizeof_field(struct bpf_xrp, cur_fd);
if (type != BPF_READ || !bpf_ctx_narrow_access_ok(off, size, size_of_field))
return false;
break;
default:
return false;
}
Expand Down
4 changes: 3 additions & 1 deletion fs/iomap/direct-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -334,9 +334,11 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
}

bio->xrp_enabled = dio->iocb->xrp_enabled;
bio->xrp_inode = dio->iocb->ki_filp->f_inode;
bio->xrp_partition_start_sector = 0;
bio->xrp_count = 1;
bio->xrp_fdtable = current->files;
bio->xrp_cur_fd = dio->iocb->xrp_cur_fd;
bio->xrp_file_offset = dio->iocb->xrp_file_offset;
if (bio->xrp_enabled) {
if (get_user_pages_fast(dio->iocb->xrp_scratch_buf, 1, FOLL_WRITE, &bio->xrp_scratch_page) != 1) {
printk("iomap_dio_bio_actor: failed to get scratch page\n");
Expand Down
11 changes: 7 additions & 4 deletions fs/read_write.c
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,8 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
return ret;
}

static ssize_t new_sync_read_xrp(struct file *filp, char __user *data_buf, size_t len, loff_t *ppos, unsigned int bpf_fd, char __user *scratch_buf)
static ssize_t new_sync_read_xrp(struct file *filp, unsigned int fd, char __user *data_buf, size_t len, loff_t *ppos,
unsigned int bpf_fd, char __user *scratch_buf)
{
struct iovec iov = { .iov_base = data_buf, .iov_len = len };
struct kiocb kiocb;
Expand All @@ -431,6 +432,8 @@ static ssize_t new_sync_read_xrp(struct file *filp, char __user *data_buf, size_
kiocb.xrp_enabled = true;
kiocb.xrp_scratch_buf = scratch_buf;
kiocb.xrp_bpf_fd = bpf_fd;
kiocb.xrp_cur_fd = fd;
kiocb.xrp_file_offset = *ppos;
iov_iter_init(&iter, READ, &iov, 1, len);

ret = call_read_iter(filp, &kiocb, &iter);
Expand Down Expand Up @@ -525,7 +528,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
return ret;
}

ssize_t vfs_read_xrp(struct file *file, char __user *data_buf, size_t count, loff_t *pos, unsigned int bpf_fd, char __user *scratch_buf)
ssize_t vfs_read_xrp(struct file *file, unsigned int fd, char __user *data_buf, size_t count, loff_t *pos, unsigned int bpf_fd, char __user *scratch_buf)
{
ssize_t ret;

Expand All @@ -547,7 +550,7 @@ ssize_t vfs_read_xrp(struct file *file, char __user *data_buf, size_t count, lof
if (file->f_op->read)
ret = file->f_op->read(file, data_buf, count, pos);
else if (file->f_op->read_iter)
ret = new_sync_read_xrp(file, data_buf, count, pos, bpf_fd, scratch_buf);
ret = new_sync_read_xrp(file, fd, data_buf, count, pos, bpf_fd, scratch_buf);
else
ret = -EINVAL;
if (ret > 0) {
Expand Down Expand Up @@ -757,7 +760,7 @@ ssize_t ksys_read_xrp(unsigned int fd, char __user *data_buf,
if (f.file) {
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PREAD)
ret = vfs_read_xrp(f.file, data_buf, count, &pos, bpf_fd, scratch_buf);
ret = vfs_read_xrp(f.file, fd, data_buf, count, &pos, bpf_fd, scratch_buf);
fdput(f);
}

Expand Down
3 changes: 2 additions & 1 deletion include/linux/blk_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -277,13 +277,14 @@ struct bio {
struct bio_set *bi_pool;

bool xrp_enabled;
struct inode *xrp_inode;
u64 xrp_partition_start_sector;
int xrp_count;
struct page *xrp_scratch_page;
struct bpf_prog *xrp_bpf_prog;
u64 xrp_extent_version;
loff_t xrp_file_offset;
struct files_struct *xrp_fdtable;
s32 xrp_cur_fd;

/*
* We can inline a number of vecs at the end of the bio, to avoid
Expand Down
3 changes: 3 additions & 0 deletions include/linux/filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -1478,6 +1478,9 @@ struct bpf_xrp_kern {
uint64_t size[16];
char *data;
char *scratch;
__s32 fd_arr[16];
__u64 cur_addr;
__s32 cur_fd;
};

struct xrp_stats {
Expand Down
2 changes: 2 additions & 0 deletions include/linux/fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,8 @@ struct kiocb {
bool xrp_enabled;
char __user *xrp_scratch_buf;
unsigned int xrp_bpf_fd;
unsigned int xrp_cur_fd;
unsigned long xrp_file_offset;

/* The 'ki_filp' pointer is shared in a union for aio */
randomized_struct_fields_start
Expand Down
9 changes: 4 additions & 5 deletions include/linux/syscalls.h
Original file line number Diff line number Diff line change
Expand Up @@ -507,8 +507,6 @@ asmlinkage long sys_writev(unsigned long fd,
unsigned long vlen);
asmlinkage long sys_pread64(unsigned int fd, char __user *buf,
size_t count, loff_t pos);
asmlinkage long sys_read_xrp(unsigned int fd, char __user *buf,
size_t count, loff_t pos, unsigned int bpf_fd, char __user *scratch_buf);
asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
size_t count, loff_t pos);
asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec,
Expand Down Expand Up @@ -1258,6 +1256,10 @@ asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len,
unsigned long fd, unsigned long pgoff);
asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg);

asmlinkage long sys_print_xrp_stats(struct xrp_stats __user *buf);
asmlinkage long sys_read_xrp(unsigned int fd, char __user *buf,
size_t count, loff_t pos, unsigned int bpf_fd, char __user *scratch_buf);
asmlinkage long sys_test_xrp(char __user *data_buf, char __user *scratch_buf, unsigned int bpf_fd);

/*
* Not a real system call, but a placeholder for syscalls which are
Expand Down Expand Up @@ -1367,9 +1369,6 @@ long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems,
unsigned int nsops,
const struct old_timespec32 __user *timeout);

asmlinkage long sys_print_xrp_stats(struct xrp_stats __user *buf);
asmlinkage long sys_test_xrp(char __user *data_buf, char __user *scratch_buf, unsigned int bpf_fd);

int __sys_getsockopt(int fd, int level, int optname, char __user *optval,
int __user *optlen);
int __sys_setsockopt(int fd, int level, int optname, char __user *optval,
Expand Down
9 changes: 8 additions & 1 deletion include/uapi/asm-generic/unistd.h
Original file line number Diff line number Diff line change
Expand Up @@ -864,8 +864,15 @@ __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
#define __NR_mount_setattr 442
__SYSCALL(__NR_mount_setattr, sys_mount_setattr)

#define __NR_print_xrp_stats 444
__SYSCALL(__NR_print_xrp_stats, sys_print_xrp_stats)
#define __NR_read_xrp 445
__SYSCALL(__NR_read_xrp, sys_read_xrp)
#define __NR_test_xrp 446
__SYSCALL(__NR_test_xrp, sys_test_xrp)

#undef __NR_syscalls
#define __NR_syscalls 443
#define __NR_syscalls 447

/*
* 32 bit systems traditionally used different
Expand Down
3 changes: 3 additions & 0 deletions include/uapi/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -5261,6 +5261,9 @@ struct bpf_xrp {
__u64 size[16];
char *data;
char *scratch;
__s32 fd_arr[16];
__u64 cur_addr;
__s32 cur_fd;
};

#endif /* _UAPI__LINUX_BPF_H__ */