diff --git a/core/sys/linux/bits.odin b/core/sys/linux/bits.odin index 64cdd2208f9..12ae949ef63 100644 --- a/core/sys/linux/bits.odin +++ b/core/sys/linux/bits.odin @@ -1964,3 +1964,265 @@ RISCV_HWProbe_Misaligned_Scalar_Perf :: enum { UNSUPPORTED, } +IO_Uring_Enter_Flags_Bits :: enum { + GETEVENTS, + SQ_WAKEUP, + SQ_WAIT, + EXT_ARG, // Available since Linux 5.11 + REGISTERED_RING, +} + +IO_Uring_Register_Opcode :: enum uint { + REGISTER_BUFFERS = 0, + UNREGISTER_BUFFERS = 1, + REGISTER_FILES = 2, + UNREGISTER_FILES = 3, + REGISTER_EVENTFD = 4, + UNREGISTER_EVENTFD = 5, + REGISTER_FILES_UPDATE = 6, + REGISTER_EVENTFD_ASYNC = 7, + REGISTER_PROBE = 8, + REGISTER_PERSONALITY = 9, + UNREGISTER_PERSONALITY = 10, + REGISTER_RESTRICTIONS = 11, + REGISTER_ENABLE_RINGS = 12, + /* extended with tagging */ + REGISTER_FILES2 = 13, + REGISTER_FILES_UPDATE2 = 14, + REGISTER_BUFFERS2 = 15, + REGISTER_BUFFERS_UPDATE = 16, + /* set/clear io-wq thread affinities */ + REGISTER_IOWQ_AFF = 17, + UNREGISTER_IOWQ_AFF = 18, + /* set/get max number of io-wq workers */ + REGISTER_IOWQ_MAX_WORKERS = 19, + /* register/unregister io_uring fd with the ring */ + REGISTER_RING_FDS = 20, + UNREGISTER_RING_FDS = 21, + /* register ring based provide buffer group */ + REGISTER_PBUF_RING = 22, + UNREGISTER_PBUF_RING = 23, + /* sync cancelation API */ + REGISTER_SYNC_CANCEL = 24, + /* register a range of fixed file slots for automatic slot allocation */ + REGISTER_FILE_ALLOC_RANGE = 25, + /* this goes last */ + REGISTER_LAST, + /* flag added to the opcode to use a registered ring fd */ + REGISTER_USE_REGISTERED_RING = 1 << 31, +} + +IO_Uring_Setup_Flags_Bits :: enum { + // io_context is polled. + IOPOLL, + // SQ poll thread. + SQPOLL, + // sq_thread_cpu is valid. + SQ_AFF, + // app defines CQ size. + CQSIZE, + // clamp SQ/CQ ring sizes. + CLAMP, + // attach to existing wq. + ATTACH_WQ, + // start with ring disabled. + R_DISABLED, + // continue submit on error. 
+ SUBMIT_ALL, + // Cooperative task running. When requests complete, they often require + // forcing the submitter to transition to the kernel to complete. If this + // flag is set, work will be done when the task transitions anyway, rather + // than force an inter-processor interrupt reschedule. This avoids interrupting + // a task running in userspace, and saves an IPI. + COOP_TASKRUN, + // If COOP_TASKRUN is set, get notified if task work is available for + // running and a kernel transition would be needed to run it. This sets + // IORING_SQ_TASKRUN in the sq ring flags. Must be combined with COOP_TASKRUN (or DEFER_TASKRUN). + TASKRUN_FLAG, + // SQEs are 128 bytes. + SQE128, + // CQEs are 32 bytes. + CQE32, + // Only one task is allowed to submit requests + SINGLE_ISSUER, + // Defer running task work to get events. + // Rather than running bits of task work whenever the task transitions + // try to do it just before it is needed. + DEFER_TASKRUN, +} + +IO_Uring_Features_Bits :: enum { + SINGLE_MMAP, + NODROP, + SUBMIT_STABLE, + RW_CUR_POS, + CUR_PERSONALITY, + FAST_POLL, + POLL_32BITS, + SQPOLL_NONFIXED, + EXT_ARG, + NATIVE_WORKERS, + RSRC_TAGS, +} + +IO_Uring_CQE_Flags_Bits :: enum { + // If set, the upper 16 bits are the buffer ID. + BUFFER, + // If set, parent SQE will generate more CQE entries. + MORE, + // If set, more data to read after socket recv. + SOCK_NONEMPTY, + // Set for notification CQEs. Can be used to distinguish them from sends. 
+ NOTIF, +} + +IO_Uring_OP :: enum u8 { + NOP, + READV, + WRITEV, + FSYNC, + READ_FIXED, + WRITE_FIXED, + POLL_ADD, + POLL_REMOVE, + SYNC_FILE_RANGE, + SENDMSG, + RECVMSG, + TIMEOUT, + TIMEOUT_REMOVE, + ACCEPT, + ASYNC_CANCEL, + LINK_TIMEOUT, + CONNECT, + FALLOCATE, + OPENAT, + CLOSE, + FILES_UPDATE, + STATX, + READ, + WRITE, + FADVISE, + MADVISE, + SEND, + RECV, + OPENAT2, + EPOLL_CTL, + SPLICE, + PROVIDE_BUFFERS, + REMOVE_BUFFERS, + TEE, + SHUTDOWN, + RENAMEAT, + UNLINKAT, + MKDIRAT, + SYMLINKAT, + LINKAT, + MSG_RING, + FSETXATTR, + SETXATTR, + FGETXATTR, + GETXATTR, + SOCKET, + URING_CMD, + SEND_ZC, + SENDMSG_ZC, + READ_MULTISHOT, + WAITID, + FUTEX_WAIT, + FUTEX_WAKE, + FUTEX_WAITV, + FIXED_FD_INSTALL, + FTRUNCATE, + BIND, + LISTEN, +} + +IO_Uring_SQE_Flags_Bits :: enum { + // Use fixed fileset. + FIXED_FILE, + // Issue after inflight IO. + IO_DRAIN, + // Links next sqe. + IO_LINK, + // Like LINK, but stronger. + IO_HARDLINK, + // Always go async. + ASYNC, + // Select buffer from sqe.buf_group. + BUFFER_SELECT, + // Don't post CQE if request succeeded. + CQE_SKIP_SUCCESS, +} + +IO_Uring_Poll_Add_Flags_Bits :: enum { + ADD_MULTI, + UPDATE_EVENTS, + UPDATE_USER_DATA, + ADD_LEVEL, +} + +IO_Uring_Fsync_Flags_Bits :: enum { + DATASYNC, +} + +IO_Uring_Timeout_Flags_Bits :: enum { + ABS, + UPDATE, + BOOTTIME, + REALTIME, + LINK_TIMEOUT_UPDATE, + ETIME_SUCCESS, +} + +IO_Uring_Cmd_Flags_Bits :: enum { + // use registered buffer; pass this flag along with setting sqe.buf_index. + FIXED, +} + +IO_Uring_Splice_Flags_Bits :: enum { + MOVE, + NONBLOCK, + MORE, + GIFT, + F_FD_IN_FIXED = 31, +} + +IO_Uring_Accept_Flags_Bits :: enum { + MULTISHOT, +} + +IO_Uring_Send_Recv_Flags_Bits :: enum { + /* + If set, instead of first attempting to send + or receive and arm poll if that yields an + -EAGAIN result, arm poll upfront and skip + the initial transfer attempt. + */ + RECVSEND_POLL_FIRST, + /* + Multishot recv. 
Sets IORING_CQE_F_MORE if + the handler will continue to report + CQEs on behalf of the same SQE. + */ + RECV_MULTISHOT, + /* + Use registered buffers, the index is stored in + the buf_index field. + */ + RECVSEND_FIXED_BUF, + /* + If set, SEND[MSG]_ZC should report + the zerocopy usage in cqe.res + for the IORING_CQE_F_NOTIF cqe. + 0 is reported if zerocopy was actually possible. + IORING_NOTIF_USAGE_ZC_COPIED if data was copied + (at least partially). + */ + SEND_ZC_REPORT_USAGE, +} + +IO_Uring_Submission_Queue_Flags_Bits :: enum { + NEED_WAKEUP, + CQ_OVERFLOW, + TASKRUN, +} diff --git a/core/sys/linux/constants.odin b/core/sys/linux/constants.odin index ceab17f6d27..c80777025ae 100644 --- a/core/sys/linux/constants.odin +++ b/core/sys/linux/constants.odin @@ -395,3 +395,13 @@ MAP_HUGE_16GB :: transmute(Map_Flags)(u32(34) << MAP_HUGE_SHIFT) /* Get window size */ TIOCGWINSZ :: 0x5413 + +IORING_TIMEOUT_CLOCK_MASK :: IO_Uring_Timeout_Flags{.BOOTTIME, .REALTIME} +IORING_TIMEOUT_UPDATE_MASK :: IO_Uring_Timeout_Flags{.UPDATE, .LINK_TIMEOUT_UPDATE} + +IORING_OFF_SQ_RING :: 0 +IORING_OFF_CQ_RING :: 0x8000000 +IORING_OFF_SQES :: 0x10000000 +IORING_OFF_PBUF_RING :: 0x80000000 +IORING_OFF_PBUF_SHIFT :: 16 +IORING_OFF_MMAP_MASK :: 0xf8000000 diff --git a/core/sys/linux/sys.odin b/core/sys/linux/sys.odin index deb22726fc4..cfd586a6690 100644 --- a/core/sys/linux/sys.odin +++ b/core/sys/linux/sys.odin @@ -510,7 +510,7 @@ sendfile :: proc "contextless" (out_fd: Fd, in_fd: Fd, offset: ^i64, count: uint Available since Linux 2.0. 
*/ socket :: proc "contextless" (domain: Address_Family, socktype: Socket_Type, sockflags: Socket_FD_Flags, protocol: Protocol) -> (Fd, Errno) { - sock_type_flags: int = cast(int) socktype | transmute(int) sockflags + sock_type_flags: int = cast(int) socktype | cast(int) transmute(i32) sockflags ret := syscall(SYS_socket, domain, sock_type_flags, protocol) return errno_unwrap(ret, Fd) } @@ -543,7 +543,7 @@ where T == Sock_Addr_Any { addr_len: i32 = size_of(T) - ret := syscall(SYS_accept4, sock, addr, &addr_len, transmute(int) sockflags) + ret := syscall(SYS_accept4, sock, addr, &addr_len, transmute(i32) sockflags) return errno_unwrap(ret, Fd) } @@ -2927,11 +2927,46 @@ statx :: proc "contextless" (dir: Fd, pathname: cstring, flags: FD_Flags, mask: // TODO(flysand): pidfd_send_signal -// TODO(flysand): io_uring_setup +/* + Setup a context for performing asynchronous I/O. + + Available since Linux 5.1 +*/ +io_uring_setup :: proc "contextless" (entries: u32, params: ^IO_Uring_Params) -> (Fd, Errno) { + ret := syscall(SYS_io_uring_setup, entries, params) + return errno_unwrap(ret, Fd) +} -// TODO(flysand): io_uring_enter +/* + Initiate and/or complete I/O using the shared submission and completion queues. -// TODO(flysand): io_uring_register + Available since Linux 5.1 +*/ +io_uring_enter :: proc "contextless" (fd: Fd, to_submit: u32, min_complete: u32, flags: IO_Uring_Enter_Flags, sig: ^Sig_Set) -> (int, Errno) { + ret := syscall(SYS_io_uring_enter, fd, to_submit, min_complete, transmute(u32)flags, sig, size_of(Sig_Set) if sig != nil else 0) + return errno_unwrap(ret, int) +} + +/* + Initiate and/or complete I/O using the shared submission and completion queues. 
+ + Available since Linux 5.11 +*/ +io_uring_enter2 :: proc "contextless" (fd: Fd, to_submit: u32, min_complete: u32, flags: IO_Uring_Enter_Flags, arg: ^IO_Uring_Getevents_Arg) -> (int, Errno) { + assert_contextless(.EXT_ARG in flags) + ret := syscall(SYS_io_uring_enter, fd, to_submit, min_complete, transmute(u32)flags, arg, size_of(IO_Uring_Getevents_Arg)) + return errno_unwrap(ret, int) +} + +/* + Register files or user buffers for asynchronous I/O. NOTE(review): io_uring_register(2) documents a positive success value for some opcodes, which `Errno(-ret)` would surface as a negative Errno - confirm whether the result should be returned separately. + + Available since Linux 5.1 +*/ +io_uring_register :: proc "contextless" (fd: Fd, opcode: IO_Uring_Register_Opcode, arg: rawptr, nr_args: u32) -> Errno { + ret := syscall(SYS_io_uring_register, fd, opcode, arg, nr_args) + return Errno(-ret) +} // TODO(flysand): open_tree diff --git a/core/sys/linux/types.odin b/core/sys/linux/types.odin index 38b413cddb8..a2819803caf 100644 --- a/core/sys/linux/types.odin +++ b/core/sys/linux/types.odin @@ -763,7 +763,7 @@ Sig_Action :: struct($T: typeid) { Note, on linux these are technically passed by OR'ing together with Socket_Type, our wrapper does this under the hood. */ -Socket_FD_Flags :: bit_set[Socket_FD_Flags_Bits; int] +Socket_FD_Flags :: bit_set[Socket_FD_Flags_Bits; i32] /* Address family for the socket. 
@@ -1488,3 +1488,206 @@ RISCV_HWProbe :: struct { raw: u64, }, } + +IO_Uring_Params :: struct { + sq_entries: u32, + cq_entries: u32, + flags: IO_Uring_Setup_Flags, + sq_thread_cpu: u32, + sq_thread_idle: u32, + features: IO_Uring_Features, + wq_fd: u32, + resv: [3]u32, + sq_off: IO_SQ_Ring_Offsets, + cq_off: IO_CQ_Ring_Offsets, +} + +IO_Uring_Setup_Flags :: bit_set[IO_Uring_Setup_Flags_Bits; u32] + +IO_Uring_Features :: bit_set[IO_Uring_Features_Bits; u32] + +IO_SQ_Ring_Offsets :: struct { + head: u32, + tail: u32, + ring_mask: u32, + ring_entries: u32, + flags: u32, + dropped: u32, + array: u32, + resv1: u32, + user_addr: u64, +} + +IO_CQ_Ring_Offsets :: struct { + head: u32, + tail: u32, + ring_mask: u32, + ring_entries: u32, + overflow: u32, + cqes: u32, + flags: u32, + resv1: u32, + user_addr: u64, +} + +IO_Uring_Enter_Flags :: bit_set[IO_Uring_Enter_Flags_Bits; u32] + +IO_Uring_Getevents_Arg :: struct #min_field_align(8) { + sigmask: ^Sig_Set, + sigmask_sz: u32, + // pad: u32, + ts: ^Time_Spec, +} +#assert(align_of(IO_Uring_Getevents_Arg) == 8) + +IO_Uring_Rsrc_Register :: struct($T: typeid) { + nr: u32, + resv: u32, + resv2: u64, + using _: struct #min_field_align(8) { + data: [^]T, + tags: [^]u64, + }, +} + +IO_Uring_Rsrc_Update2 :: struct($T: typeid) { + offset: u32, + resv: u32, + using _: struct #min_field_align(8) { + data: [^]T, + tags: [^]u64, + }, + nr: u32, + resv2: u32, +} + +// The completion queue entry when the .CQE32 flag is not set on setup. +IO_Uring_CQE :: struct { + // user_data of the originating SQE, passed back. + user_data: u64, + // result code for this event. + res: i32, + flags: IO_Uring_CQE_Flags, +} +#assert(size_of(IO_Uring_CQE) == 16) + +// The completion queue entry when the .CQE32 flag is set on setup. 
+IO_Uring_CQE32 :: struct { + using _: IO_Uring_CQE, + pad: u64, + pad2: u64, +} +#assert(size_of(IO_Uring_CQE32) == 32) + +IO_Uring_CQE_Flags :: bit_set[IO_Uring_CQE_Flags_Bits; u32] +IO_Uring_SQE_Flags :: bit_set[IO_Uring_SQE_Flags_Bits; u8] + +// The submission queue entry when the .SQE128 flag is not set on setup. +IO_Uring_SQE :: struct { + opcode: IO_Uring_OP, + flags: IO_Uring_SQE_Flags, + using __ioprio: struct #raw_union { + ioprio: u16, + sq_accept_flags: IO_Uring_Accept_Flags, + sq_send_recv_flags: IO_Uring_Send_Recv_Flags, + }, + fd: Fd, + using __offset: struct #raw_union { + // Offset into file. + off: u64, + addr2: u64, + using _: struct { + cmd_op: u32, + __pad1: u32, + }, + statx: ^Statx, + }, + using __iovecs: struct #raw_union { + // Pointer to buffer or iovecs. + addr: u64, + splice_off_in: u64, + using _: struct { + level: u32, + optname: u32, + }, + }, + using __len: struct #raw_union { + // Buffer size or number of iovecs. + len: u32, + poll_flags: IO_Uring_Poll_Add_Flags, + statx_mask: Statx_Mask, + epoll_ctl_op: EPoll_Ctl_Opcode, + shutdown_how: Shutdown_How, + }, + using __contents: struct #raw_union { + rw_flags: i32, + fsync_flags: IO_Uring_Fsync_Flags, + // compatibility. + poll_events: Fd_Poll_Events, + // word-reversed for BE. + poll32_events: u32, + sync_range_flags: u32, + msg_flags: Socket_Msg, + timeout_flags: IO_Uring_Timeout_Flags, + accept_flags: Socket_FD_Flags, + cancel_flags: u32, + open_flags: Open_Flags, + statx_flags: FD_Flags, + fadvise_advice: u32, + splice_flags: IO_Uring_Splice_Flags, + rename_flags: u32, + unlink_flags: u32, + hardlink_flags: u32, + xattr_flags: u32, + msg_ring_flags: u32, + uring_cmd_flags: IO_Uring_Cmd_Flags, + }, + // Data to be passed back at completion time. + user_data: u64, + using __buffer: struct #raw_union { + // Index into fixed buffers, if used. + buf_index: u16, + // For grouped buffer selection. + buf_group: u16, + }, + // Personality to use, if used. 
+ personality: u16, + using _: struct #raw_union { + splice_fd_in: Fd, + file_index: u32, + using _: struct { + addr_len: u16, + __pad3: [1]u16, + }, + }, + using __: struct #raw_union { + using _: struct { + addr3: u64, + __pad2: [1]u64, + }, + }, +} +#assert(size_of(IO_Uring_SQE) == 64) + +// The submission queue entry when the .SQE128 flag is set on setup. +IO_Uring_SQE128 :: struct { + using _: IO_Uring_SQE, + cmd: [64]byte, +} +#assert(size_of(IO_Uring_SQE128) == 128) + +IO_Uring_Poll_Add_Flags :: bit_set[IO_Uring_Poll_Add_Flags_Bits; u32] + +IO_Uring_Fsync_Flags :: bit_set[IO_Uring_Fsync_Flags_Bits; u32] + +IO_Uring_Timeout_Flags :: bit_set[IO_Uring_Timeout_Flags_Bits; u32] + +IO_Uring_Cmd_Flags :: bit_set[IO_Uring_Cmd_Flags_Bits; u32] + +IO_Uring_Splice_Flags :: bit_set[IO_Uring_Splice_Flags_Bits; u32] + +IO_Uring_Accept_Flags :: bit_set[IO_Uring_Accept_Flags_Bits; u16] + +IO_Uring_Send_Recv_Flags :: bit_set[IO_Uring_Send_Recv_Flags_Bits; u16] + +IO_Uring_Submission_Queue_Flags :: bit_set[IO_Uring_Submission_Queue_Flags_Bits; u32]