diff --git a/pkg/abi/nvgpu/frontend.go b/pkg/abi/nvgpu/frontend.go index 0d8dcb96c5..0f80110446 100644 --- a/pkg/abi/nvgpu/frontend.go +++ b/pkg/abi/nvgpu/frontend.go @@ -198,7 +198,7 @@ type NVOS02_PARAMETERS struct { Pad1 [4]byte } -// Bitfields in NVOS02Parameters.Flags: +// Bitfields in NVOS02_PARAMETERS.Flags: const ( NVOS02_FLAGS_ALLOC_SHIFT = 16 NVOS02_FLAGS_ALLOC_MASK = 0x3 @@ -470,6 +470,18 @@ type NVOS33_PARAMETERS struct { Flags uint32 } +// Bitfields in NVOS33_PARAMETERS.Flags: +const ( + NVOS33_FLAGS_CACHING_TYPE_SHIFT = 23 + NVOS33_FLAGS_CACHING_TYPE_MASK = 0x7 + NVOS33_FLAGS_CACHING_TYPE_CACHED = 0 + NVOS33_FLAGS_CACHING_TYPE_UNCACHED = 1 + NVOS33_FLAGS_CACHING_TYPE_WRITECOMBINED = 2 + NVOS33_FLAGS_CACHING_TYPE_WRITEBACK = 5 + NVOS33_FLAGS_CACHING_TYPE_DEFAULT = 6 + NVOS33_FLAGS_CACHING_TYPE_UNCACHED_WEAK = 7 +) + // NVOS34_PARAMETERS is the parameter type for NV_ESC_RM_UNMAP_MEMORY. // // +marshal diff --git a/pkg/hostarch/BUILD b/pkg/hostarch/BUILD index 3508c443fb..cde0764559 100644 --- a/pkg/hostarch/BUILD +++ b/pkg/hostarch/BUILD @@ -38,6 +38,7 @@ go_library( "hostarch.go", "hostarch_arm64.go", "hostarch_x86.go", + "memory_type.go", "sizes_util.go", ], visibility = ["//:sandbox"], diff --git a/pkg/hostarch/memory_type.go b/pkg/hostarch/memory_type.go new file mode 100644 index 0000000000..82d530b9b9 --- /dev/null +++ b/pkg/hostarch/memory_type.go @@ -0,0 +1,84 @@ +// Copyright 2025 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostarch + +import "fmt" + +// MemoryType specifies CPU memory access behavior. +type MemoryType uint8 + +const ( + // MemoryTypeWriteBack is equivalent to Linux's default pgprot, or the + // following architectural memory types: + // + // - x86: Write-back (WB) + // + // - ARM64: Normal write-back cacheable + // + // This memory type is appropriate for typical application memory and must + // be the zero value for MemoryType. + MemoryTypeWriteBack MemoryType = iota + + // MemoryTypeWriteCombine is equivalent to Linux's pgprot_writecombine(), + // or the following architectural memory types: + // + // - x86: Write-combining (WC) + // + // - ARM64: Normal non-cacheable + MemoryTypeWriteCombine + + // MemoryTypeUncached is equivalent to Linux's pgprot_noncached(), or the + // following architectural memory types: + // + // - x86: Strong Uncacheable (UC) or Uncacheable (UC-); these differ in + // that UC- may be "downgraded" to WC by a setting of WC or (Intel only) WP + // in MTRR or EPT/NPT, but gVisor does not use MTRRs and KVM never sets WC + // or WP in EPT/NPT. + // + // - ARM64: Device-nGnRnE + MemoryTypeUncached + + // NumMemoryTypes is the number of memory types. + NumMemoryTypes +) + +// String implements fmt.Stringer.String. 
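// Hedged illustration, not part of the change above: because
// MemoryTypeWriteBack is defined as the zero value, any MemoryType field that
// is never explicitly assigned (for example, most existing pagetables.MapOpts
// call sites) implicitly requests ordinary cacheable memory. The function name
// below is hypothetical.
func exampleZeroValueIsWriteBack() bool {
	var mt MemoryType                // zero value, never assigned
	return mt == MemoryTypeWriteBack // always true
}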
+func (mt MemoryType) String() string { + switch mt { + case MemoryTypeWriteBack: + return "WriteBack" + case MemoryTypeWriteCombine: + return "WriteCombine" + case MemoryTypeUncached: + return "Uncached" + default: + return fmt.Sprintf("%d", mt) + } +} + +// ShortString returns a two-character string compactly representing the +// MemoryType. +func (mt MemoryType) ShortString() string { + switch mt { + case MemoryTypeWriteBack: + return "WB" + case MemoryTypeWriteCombine: + return "WC" + case MemoryTypeUncached: + return "UC" + default: + return fmt.Sprintf("%02d", mt) + } +} diff --git a/pkg/ring0/pagetables/pagetables_aarch64.go b/pkg/ring0/pagetables/pagetables_aarch64.go index 6c2fe2a700..97ce934e08 100644 --- a/pkg/ring0/pagetables/pagetables_aarch64.go +++ b/pkg/ring0/pagetables/pagetables_aarch64.go @@ -52,29 +52,26 @@ func (p *PageTables) TTBR1_EL1(noFlush bool, asid uint16) uint64 { // Bits in page table entries. const ( - typeTable = 0x3 << 0 - typeSect = 0x1 << 0 - typePage = 0x3 << 0 - pteValid = 0x1 << 0 - pteTableBit = 0x1 << 1 - pteTypeMask = 0x3 << 0 - present = pteValid | pteTableBit - user = 0x1 << 6 /* AP[1] */ - readOnly = 0x1 << 7 /* AP[2] */ - accessed = 0x1 << 10 - dbm = 0x1 << 51 - writable = dbm - cont = 0x1 << 52 - pxn = 0x1 << 53 - xn = 0x1 << 54 - dirty = 0x1 << 55 - nG = 0x1 << 11 - shared = 0x3 << 8 -) - -const ( - mtDevicenGnRE = 0x1 << 2 - mtNormal = 0x4 << 2 + typeTable = 0x3 << 0 + typeSect = 0x1 << 0 + typePage = 0x3 << 0 + pteValid = 0x1 << 0 + pteTableBit = 0x1 << 1 + pteTypeMask = 0x3 << 0 + present = pteValid | pteTableBit + attrIndxShift = 2 + attrIndxMask = 0x7 + user = 0x1 << 6 /* AP[1] */ + readOnly = 0x1 << 7 /* AP[2] */ + accessed = 0x1 << 10 + dbm = 0x1 << 51 + writable = dbm + cont = 0x1 << 52 + pxn = 0x1 << 53 + xn = 0x1 << 54 + dirty = 0x1 << 55 + nG = 0x1 << 11 + shared = 0x3 << 8 ) const ( @@ -93,6 +90,9 @@ type MapOpts struct { // User indicates the page is a user page. User bool + + // MemoryType is the memory type. + MemoryType hostarch.MemoryType } // PTE is a page table entry. @@ -119,15 +119,15 @@ func (p *PTE) Valid() bool { //go:nosplit func (p *PTE) Opts() MapOpts { v := atomic.LoadUintptr((*uintptr)(p)) - return MapOpts{ AccessType: hostarch.AccessType{ Read: true, Write: v&readOnly == 0, Execute: v&xn == 0, }, - Global: v&nG == 0, - User: v&user != 0, + Global: v&nG == 0, + User: v&user != 0, + MemoryType: hostarch.MemoryType((v >> attrIndxShift) & attrIndxMask), } } @@ -191,11 +191,12 @@ func (p *PTE) Set(addr uintptr, opts MapOpts) { if opts.User { v |= user - v |= mtNormal } else { v = v &^ user - v |= mtNormal } + + v |= uintptr(opts.MemoryType&attrIndxMask) << attrIndxShift + atomic.StoreUintptr((*uintptr)(p), v) } @@ -209,7 +210,7 @@ func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) { // This should never happen. 
panic("unaligned physical address!") } - v := addr | typeTable | protDefault | mtNormal + v := addr | typeTable | protDefault | (uintptr(hostarch.MemoryTypeWriteBack) << attrIndxShift) atomic.StoreUintptr((*uintptr)(p), v) } diff --git a/pkg/ring0/pagetables/pagetables_amd64_test.go b/pkg/ring0/pagetables/pagetables_amd64_test.go index c27b3b10a9..2c08cf8f4b 100644 --- a/pkg/ring0/pagetables/pagetables_amd64_test.go +++ b/pkg/ring0/pagetables/pagetables_amd64_test.go @@ -74,3 +74,13 @@ func TestSplit2MPage(t *testing.T) { {0x00007f0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: hostarch.Read}}, }) } + +func TestNumMemoryTypes(t *testing.T) { + // The PAT accommodates up to 8 entries. However, PTE.Set() currently + // assumes that NumMemoryTypes <= 4, since the location of the most + // significant bit of the PAT index in page table entries varies depending + // on page size (and is never bit 5 == writeThroughShift + 2). + if hostarch.NumMemoryTypes > 4 { + t.Errorf("PTE.Set() and PTE.Opts() must be altered to handle %d MemoryTypes", hostarch.NumMemoryTypes) + } +} diff --git a/pkg/ring0/pagetables/pagetables_arm64_test.go b/pkg/ring0/pagetables/pagetables_arm64_test.go index 1c919ec7d8..0c73e0f728 100644 --- a/pkg/ring0/pagetables/pagetables_arm64_test.go +++ b/pkg/ring0/pagetables/pagetables_arm64_test.go @@ -79,3 +79,10 @@ func TestSplit2MPage(t *testing.T) { {0x0000ff0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: hostarch.Read, User: true}}, }) } + +func TestNumMemoryTypes(t *testing.T) { + // MAIR accommodates up to 8 entries. + if hostarch.NumMemoryTypes > 8 { + t.Errorf("PTE.Set() and PTE.Opts() must be altered to map %d MemoryTypes to a smaller set of MAIR entries", hostarch.NumMemoryTypes) + } +} diff --git a/pkg/ring0/pagetables/pagetables_x86.go b/pkg/ring0/pagetables/pagetables_x86.go index dc98d8452c..2109ccdf33 100644 --- a/pkg/ring0/pagetables/pagetables_x86.go +++ b/pkg/ring0/pagetables/pagetables_x86.go @@ -49,16 +49,17 @@ func (p *PageTables) CR3(noFlush bool, pcid uint16) uint64 { // Bits in page table entries. const ( - present = 0x001 - writable = 0x002 - user = 0x004 - writeThrough = 0x008 - cacheDisable = 0x010 - accessed = 0x020 - dirty = 0x040 - super = 0x080 - global = 0x100 - optionMask = executeDisable | 0xfff + present = 0x001 + writable = 0x002 + user = 0x004 + accessed = 0x020 + dirty = 0x040 + super = 0x080 + global = 0x100 + optionMask = executeDisable | 0xfff + + writeThroughShift = 3 + patIndexMask = 0x3 ) // MapOpts are x86 options. @@ -71,6 +72,9 @@ type MapOpts struct { // User indicates the page is a user page. User bool + + // MemoryType is the memory type. + MemoryType hostarch.MemoryType } // PTE is a page table entry. @@ -103,8 +107,9 @@ func (p *PTE) Opts() MapOpts { Write: v&writable != 0, Execute: v&executeDisable == 0, }, - Global: v&global != 0, - User: v&user != 0, + Global: v&global != 0, + User: v&user != 0, + MemoryType: hostarch.MemoryType((v >> writeThroughShift) & patIndexMask), } } @@ -154,6 +159,7 @@ func (p *PTE) Set(addr uintptr, opts MapOpts) { if opts.AccessType.Write { v |= writable | dirty } + v |= uintptr(opts.MemoryType&patIndexMask) << writeThroughShift if p.IsSuper() { // Note that this is inherited from the previous instance. Set // does not change the value of Super. See above. @@ -172,7 +178,7 @@ func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) { // This should never happen. 
panic("unaligned physical address!") } - v := addr | present | user | writable | accessed | dirty + v := addr | present | user | writable | accessed | dirty | (uintptr(hostarch.MemoryTypeWriteBack) << writeThroughShift) atomic.StoreUintptr((*uintptr)(p), v) } diff --git a/pkg/sentry/devices/nvproxy/frontend.go b/pkg/sentry/devices/nvproxy/frontend.go index 9e6a2d1804..171b320bdc 100644 --- a/pkg/sentry/devices/nvproxy/frontend.go +++ b/pkg/sentry/devices/nvproxy/frontend.go @@ -46,8 +46,12 @@ type frontendDevice struct { minor uint32 } +func (dev *frontendDevice) isCtlDevice() bool { + return dev.minor == nvgpu.NV_CONTROL_DEVICE_MINOR +} + func (dev *frontendDevice) basename() string { - if dev.minor == nvgpu.NV_CONTROL_DEVICE_MINOR { + if dev.isCtlDevice() { return "nvidiactl" } return fmt.Sprintf("nvidia%d", dev.minor) @@ -134,8 +138,9 @@ type frontendFD struct { // These fields are marked nosave since we do not automatically reinvoke // NV_ESC_RM_MAP_MEMORY after restore, so restored FDs have no // mmap_context. - mmapLength uint64 `state:"nosave"` - mmapInternal uintptr `state:"nosave"` + mmapLength uint64 `state:"nosave"` + mmapInternal uintptr `state:"nosave"` + mmapMemType hostarch.MemoryType `state:"nosave"` // clients are handles of clients owned by this frontendFD. clients is // protected by dev.nvp.objsMu. @@ -493,6 +498,7 @@ func rmAllocMemorySystem(fi *frontendIoctlState, ioctlParams *nvgpu.IoctlNVOS02P fi.fd.dev.nvp.objAdd(fi.ctx, ioctlParams.Params.HRoot, ioctlParams.Params.HObjectNew, ioctlParams.Params.HClass, &miscObject{}, ioctlParams.Params.HObjectParent) if createMmapCtx { mapFile.mmapLength = ioctlParams.Params.Limit + 1 + mapFile.mmapMemType = getMemoryType(fi.ctx, mapFile.dev, nvgpu.NVOS33_FLAGS_CACHING_TYPE_DEFAULT) } } fi.fd.dev.nvp.objsUnlock() @@ -1343,6 +1349,15 @@ func rmMapMemory(fi *frontendIoctlState) (uintptr, error) { } if ioctlParams.Params.Status == nvgpu.NV_OK { mapFile.mmapLength = ioctlParams.Params.Length + // src/nvidia/arch/nvalloc/unix/src/escape.c:RmIoctl() forces + // NVOS33_FLAGS_CACHING_TYPE_DEFAULT, but resMap implementations may + // override the "caching type", so in general the memory type depends + // on the mapped object. Conveniently, when this occurs, the caching + // type in pParms->flags must be updated for the call to + // rm_create_mmap_context(), and pParms is subsequently copied back out + // by kernel-open/nvidia/nv.c:nvidia_ioctl(), so we can get the final + // caching type from the updated ioctl params. + mapFile.mmapMemType = getMemoryType(fi.ctx, mapFile.dev, (ioctlParams.Params.Flags>>nvgpu.NVOS33_FLAGS_CACHING_TYPE_SHIFT)&nvgpu.NVOS33_FLAGS_CACHING_TYPE_MASK) } ioctlParams.FD = origFD diff --git a/pkg/sentry/devices/nvproxy/frontend_mmap.go b/pkg/sentry/devices/nvproxy/frontend_mmap.go index 8f15a2c490..7a99a43002 100644 --- a/pkg/sentry/devices/nvproxy/frontend_mmap.go +++ b/pkg/sentry/devices/nvproxy/frontend_mmap.go @@ -15,8 +15,10 @@ package nvproxy import ( + "gvisor.dev/gvisor/pkg/abi/nvgpu" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -75,6 +77,13 @@ func (mf *frontendFDMemmapFile) IncRef(fr memmap.FileRange, memCgID uint32) { func (mf *frontendFDMemmapFile) DecRef(fr memmap.FileRange) { } +// MemoryType implements memmap.File.MemoryType. 
+func (mf *frontendFDMemmapFile) MemoryType() hostarch.MemoryType { + mf.fd.mmapMu.Lock() + defer mf.fd.mmapMu.Unlock() + return mf.fd.mmapMemType +} + // DataFD implements memmap.File.DataFD. func (mf *frontendFDMemmapFile) DataFD(fr memmap.FileRange) (int, error) { return mf.FD(), nil @@ -84,3 +93,60 @@ func (mf *frontendFDMemmapFile) DataFD(fr memmap.FileRange) (int, error) { func (mf *frontendFDMemmapFile) FD() int { return int(mf.fd.hostFD) } + +func getMemoryType(ctx context.Context, mapDev *frontendDevice, cachingType uint32) hostarch.MemoryType { + // Compare kernel-open/nvidia/nv-mmap.c:nvidia_mmap_helper() => + // nv_encode_caching(). Each NVOS33_FLAGS_CACHING_TYPE_* corresponds + // directly to a NV_MEMORY_*; this is checked by asserts in + // src/nvidia/src/kernel/rmapi/mapping_cpu.c. + if !mapDev.isCtlDevice() { + // In the !NV_IS_CTL_DEVICE() branch of nvidia_mmap_helper(), + // mmap_context->caching is only honored if IS_FB_OFFSET() and + // !IS_UD_OFFSET(). We can get the information we need for + // IS_FB_OFFSET() from NV_ESC_CARD_INFO, but there doesn't seem to be + // any way for us to replicate IS_UD_OFFSET(). So we must + // conservatively specify uncacheable, which applies in all other + // cases. (This is unfortunate since it prevents us from using + // write-combining on framebuffer memory.) + if log.IsLogging(log.Debug) { + ctx.Debugf("nvproxy: inferred memory type %v for mapping of %s", hostarch.MemoryTypeUncached, mapDev.basename()) + } + return hostarch.MemoryTypeUncached + } + var memType hostarch.MemoryType + switch cachingType { + case nvgpu.NVOS33_FLAGS_CACHING_TYPE_CACHED, nvgpu.NVOS33_FLAGS_CACHING_TYPE_WRITEBACK: + // Note that nv_encode_caching() doesn't actually handle + // NV_MEMORY_WRITEBACK, so this case should fail during host mmap. + memType = hostarch.MemoryTypeWriteBack + case nvgpu.NVOS33_FLAGS_CACHING_TYPE_WRITECOMBINED, nvgpu.NVOS33_FLAGS_CACHING_TYPE_DEFAULT: + // NOTE(gvisor.dev/issue/11436): In the NV_IS_CTL_DEVICE() branch of + // nvidia_mmap_helper(), memory_type is never + // NV_MEMORY_TYPE_FRAMEBUFFER, so this corresponds to + // kernel-open/common/inc/nv-pgprot.h:NV_PGPROT_WRITE_COMBINED(). On + // ARM64, NV_PGPROT_WRITE_COMBINED() => NV_PGPROT_UNCACHED() implicitly + // uses MT_NORMAL (equivalent to our MemoryTypeWriteBack) rather than + // MT_NORMAL_NC when nvos_is_chipset_io_coherent() => + // PDB_PROP_CL_IS_CHIPSET_IO_COHERENT is true, which seems to be the + // case on most systems. We should clarify whether this is an + // optimization or required for correctness (cf. Armv8-M Architecture + // Reference Manual Sec. B7.16 "Mismatched memory attributes"), and + // subsequently whether to replicate it. + memType = hostarch.MemoryTypeWriteCombine + case nvgpu.NVOS33_FLAGS_CACHING_TYPE_UNCACHED, nvgpu.NVOS33_FLAGS_CACHING_TYPE_UNCACHED_WEAK: + // NOTE(gvisor.dev/issue/11436): On ARM64, nv_encode_caching() + // distinguishes between NV_PGPROT_UNCACHED() => MT_NORMAL/MT_NORMAL_NC + // and NV_PGPROT_UNCACHED_DEVICE() => MT_DEVICE_nGnRnE; in context, the + // former is used in the !peer_io (NV_MEMORY_TYPE_SYSTEM) case and the + // latter is used in the peer_io (NV_MEMORY_TYPE_DEVICE_MMIO) case. As + // above, we should clarify whether we need to replicate this behavior. 
+ memType = hostarch.MemoryTypeUncached + default: + ctx.Warningf("nvproxy: unknown caching type %d", cachingType) + memType = hostarch.MemoryTypeUncached + } + if log.IsLogging(log.Debug) { + ctx.Debugf("nvproxy: inferred memory type %v for caching type %d", memType, cachingType) + } + return memType +} diff --git a/pkg/sentry/devices/nvproxy/uvm_mmap.go b/pkg/sentry/devices/nvproxy/uvm_mmap.go index f063b6c251..2241d4f37f 100644 --- a/pkg/sentry/devices/nvproxy/uvm_mmap.go +++ b/pkg/sentry/devices/nvproxy/uvm_mmap.go @@ -63,6 +63,8 @@ func (fd *uvmFD) InvalidateUnsavable(ctx context.Context) error { // +stateify savable type uvmFDMemmapFile struct { + memmap.DefaultMemoryType + fd *uvmFD } diff --git a/pkg/sentry/devices/tpuproxy/accel/accel_fd_mmap.go b/pkg/sentry/devices/tpuproxy/accel/accel_fd_mmap.go index ee06484856..b27eab9fbc 100644 --- a/pkg/sentry/devices/tpuproxy/accel/accel_fd_mmap.go +++ b/pkg/sentry/devices/tpuproxy/accel/accel_fd_mmap.go @@ -16,10 +16,7 @@ package accel import ( "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -61,7 +58,7 @@ func (fd *accelFD) InvalidateUnsavable(ctx context.Context) error { } type accelFDMemmapFile struct { - memmap.NoBufferedIOFallback + memmap.NoMapInternal fd *accelFD } @@ -74,12 +71,6 @@ func (mf *accelFDMemmapFile) IncRef(memmap.FileRange, uint32) { func (mf *accelFDMemmapFile) DecRef(fr memmap.FileRange) { } -// MapInternal implements memmap.File.MapInternal. -func (mf *accelFDMemmapFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { - log.Traceback("accel: rejecting accelFDMemmapFile.MapInternal") - return safemem.BlockSeq{}, linuxerr.EINVAL -} - // DataFD implements memmap.File.DataFD. func (mf *accelFDMemmapFile) DataFD(fr memmap.FileRange) (int, error) { return mf.FD(), nil diff --git a/pkg/sentry/devices/tpuproxy/vfio/BUILD b/pkg/sentry/devices/tpuproxy/vfio/BUILD index d79ef488dd..d63539078b 100644 --- a/pkg/sentry/devices/tpuproxy/vfio/BUILD +++ b/pkg/sentry/devices/tpuproxy/vfio/BUILD @@ -13,7 +13,6 @@ go_library( "pci_device_fd.go", "pci_device_fd_mmap.go", "tpu_fd.go", - "tpu_fd_mmap.go", "vfio.go", "vfio_fd.go", "vfio_fd_mmap.go", diff --git a/pkg/sentry/devices/tpuproxy/vfio/pci_device_fd_mmap.go b/pkg/sentry/devices/tpuproxy/vfio/pci_device_fd_mmap.go index 426804806f..8750bd7de0 100644 --- a/pkg/sentry/devices/tpuproxy/vfio/pci_device_fd_mmap.go +++ b/pkg/sentry/devices/tpuproxy/vfio/pci_device_fd_mmap.go @@ -91,6 +91,13 @@ func (mf *pciDeviceFdMemmapFile) MapInternal(fr memmap.FileRange, at hostarch.Ac return mf.pfm.MapInternal(fr, int(mf.fd.hostFD), at.Write) } +// MemoryType implements memmap.File.MemoryType. +func (mf *pciDeviceFdMemmapFile) MemoryType() hostarch.MemoryType { + // drivers/vfio/pci/vfio_pci_core.c:vfio_pci_core_mmap() uses + // pgprot_noncached(). + return hostarch.MemoryTypeUncached +} + // DataFD implements memmap.File.DataFD. 
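// Hedged usage sketch, not part of the patch, exercising nvproxy's
// getMemoryType() above. ctlDev and gpuDev are hypothetical *frontendDevice
// values where only ctlDev.isCtlDevice() is true.
func exampleInferredMemoryTypes(ctx context.Context, ctlDev, gpuDev *frontendDevice) []hostarch.MemoryType {
	return []hostarch.MemoryType{
		getMemoryType(ctx, ctlDev, nvgpu.NVOS33_FLAGS_CACHING_TYPE_WRITECOMBINED), // MemoryTypeWriteCombine
		getMemoryType(ctx, ctlDev, nvgpu.NVOS33_FLAGS_CACHING_TYPE_CACHED),        // MemoryTypeWriteBack
		getMemoryType(ctx, gpuDev, nvgpu.NVOS33_FLAGS_CACHING_TYPE_WRITECOMBINED), // MemoryTypeUncached (non-ctl devices are conservatively uncached)
	}
}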
func (mf *pciDeviceFdMemmapFile) DataFD(fr memmap.FileRange) (int, error) { return mf.FD(), nil diff --git a/pkg/sentry/devices/tpuproxy/vfio/tpu_fd.go b/pkg/sentry/devices/tpuproxy/vfio/tpu_fd.go index 94730f737d..cf78257fcc 100644 --- a/pkg/sentry/devices/tpuproxy/vfio/tpu_fd.go +++ b/pkg/sentry/devices/tpuproxy/vfio/tpu_fd.go @@ -59,10 +59,9 @@ type tpuFD struct { vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD - hostFD int32 - device *tpuDevice - queue waiter.Queue - memmapFile tpuFDMemmapFile + hostFD int32 + device *tpuDevice + queue waiter.Queue } // Release implements vfs.FileDescriptionImpl.Release. diff --git a/pkg/sentry/devices/tpuproxy/vfio/tpu_fd_mmap.go b/pkg/sentry/devices/tpuproxy/vfio/tpu_fd_mmap.go deleted file mode 100644 index 7e98dfa3bf..0000000000 --- a/pkg/sentry/devices/tpuproxy/vfio/tpu_fd_mmap.go +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2024 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfio - -import ( - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/errors/linuxerr" - "gvisor.dev/gvisor/pkg/hostarch" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/vfs" -) - -// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. -func (fd *tpuFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - return vfs.GenericProxyDeviceConfigureMMap(&fd.vfsfd, fd, opts) -} - -// AddMapping implements memmap.Mappable.AddMapping. -func (fd *tpuFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { - return nil -} - -// RemoveMapping implements memmap.Mappable.RemoveMapping. -func (fd *tpuFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { -} - -// CopyMapping implements memmap.Mappable.CopyMapping. -func (fd *tpuFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { - return nil -} - -// Translate implements memmap.Mappable.Translate. -func (fd *tpuFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { - return []memmap.Translation{ - { - Source: optional, - File: &fd.memmapFile, - Offset: optional.Start, - Perms: hostarch.AnyAccess, - }, - }, nil -} - -// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. -func (fd *tpuFD) InvalidateUnsavable(ctx context.Context) error { - return nil -} - -type tpuFDMemmapFile struct { - memmap.NoBufferedIOFallback - - fd *tpuFD -} - -// IncRef implements memmap.File.IncRef. -func (mf *tpuFDMemmapFile) IncRef(memmap.FileRange, uint32) { -} - -// DecRef implements memmap.File.DecRef. -func (mf *tpuFDMemmapFile) DecRef(fr memmap.FileRange) { -} - -// MapInternal implements memmap.File.MapInternal. 
-func (mf *tpuFDMemmapFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { - log.Traceback("tpuproxy: rejecting tpuFdMemmapFile.MapInternal") - return safemem.BlockSeq{}, linuxerr.EINVAL -} - -// DataFD implements memmap.File.DataFD. -func (mf *tpuFDMemmapFile) DataFD(fr memmap.FileRange) (int, error) { - return mf.FD(), nil -} - -// FD implements memmap.File.FD. -func (mf *tpuFDMemmapFile) FD() int { - return int(mf.fd.hostFD) -} diff --git a/pkg/sentry/devices/tpuproxy/vfio/vfio.go b/pkg/sentry/devices/tpuproxy/vfio/vfio.go index 4485063451..097a39209c 100644 --- a/pkg/sentry/devices/tpuproxy/vfio/vfio.go +++ b/pkg/sentry/devices/tpuproxy/vfio/vfio.go @@ -105,7 +105,6 @@ func (dev *tpuDevice) Open(ctx context.Context, mnt *vfs.Mount, d *vfs.Dentry, o unix.Close(hostFD) return nil, err } - fd.memmapFile.fd = fd return &fd.vfsfd, nil } diff --git a/pkg/sentry/devices/tpuproxy/vfio/vfio_fd_mmap.go b/pkg/sentry/devices/tpuproxy/vfio/vfio_fd_mmap.go index 361a0cc613..0e14b4c598 100644 --- a/pkg/sentry/devices/tpuproxy/vfio/vfio_fd_mmap.go +++ b/pkg/sentry/devices/tpuproxy/vfio/vfio_fd_mmap.go @@ -16,10 +16,7 @@ package vfio import ( "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -61,7 +58,7 @@ func (fd *vfioFD) InvalidateUnsavable(ctx context.Context) error { } type vfioFDMemmapFile struct { - memmap.NoBufferedIOFallback + memmap.NoMapInternal fd *vfioFD } @@ -74,12 +71,6 @@ func (mf *vfioFDMemmapFile) IncRef(memmap.FileRange, uint32) { func (mf *vfioFDMemmapFile) DecRef(fr memmap.FileRange) { } -// MapInternal implements memmap.File.MapInternal. -func (mf *vfioFDMemmapFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { - log.Traceback("tpuproxy: rejecting vfioFdMemmapFile.MapInternal") - return safemem.BlockSeq{}, linuxerr.EINVAL -} - // DataFD implements memmap.File.DataFD. func (mf *vfioFDMemmapFile) DataFD(fr memmap.FileRange) (int, error) { return mf.FD(), nil diff --git a/pkg/sentry/fsimpl/erofs/regular_file.go b/pkg/sentry/fsimpl/erofs/regular_file.go index 6d5617153f..0dd37a095a 100644 --- a/pkg/sentry/fsimpl/erofs/regular_file.go +++ b/pkg/sentry/fsimpl/erofs/regular_file.go @@ -200,6 +200,7 @@ func (i *inode) InvalidateUnsavable(ctx context.Context) error { // +stateify savable type imageMemmapFile struct { + memmap.DefaultMemoryType memmap.NoBufferedIOFallback image *erofs.Image diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 42836a3761..f6b83f69a0 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -920,6 +920,7 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { // // +stateify savable type dentryPlatformFile struct { + memmap.DefaultMemoryType memmap.NoBufferedIOFallback *dentry diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 1a423f9c2e..d974e05f9b 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -43,6 +43,7 @@ import ( type specialFileFD struct { fileDescription specialFDEntry + memmap.DefaultMemoryType memmap.NoBufferedIOFallback // releaseMu synchronizes the closing of fd.handle with fd.sync(). 
It's safe diff --git a/pkg/sentry/fsimpl/kernfs/mmap_util.go b/pkg/sentry/fsimpl/kernfs/mmap_util.go index 85ca66bf09..cb01d194bb 100644 --- a/pkg/sentry/fsimpl/kernfs/mmap_util.go +++ b/pkg/sentry/fsimpl/kernfs/mmap_util.go @@ -28,6 +28,7 @@ import ( // // +stateify savable type inodePlatformFile struct { + memmap.DefaultMemoryType memmap.NoBufferedIOFallback // hostFD contains the host fd that this file was originally created from, diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index 66c9a4731f..d120e52e48 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -54,6 +54,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/hostarch", "//pkg/log", "//pkg/safemem", diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index f4f2226b1a..eb5b18bb4c 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -19,7 +19,9 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" ) @@ -470,15 +472,14 @@ type File interface { // reference is held on the mapped pages. MapInternal(fr FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) - // DataFD blocks until offsets fr in the file contain valid data, then - // returns the file descriptor represented by the File. - // - // Note that fr.Start and fr.End need not be page-aligned. + // MemoryType returns the memory type that must be used by page table + // entries mapping memory returned by MapInternal. Most implementations of + // File can embed DefaultMemoryType to obtain an appropriate implementation + // of MemoryType. // // Preconditions: - // * fr.Length() > 0. - // * At least one reference must be held on all pages in fr. - DataFD(fr FileRange) (int, error) + // * MapInternal() returned a non-empty BlockSeq. + MemoryType() hostarch.MemoryType // BufferReadAt reads len(dst) bytes from the file into dst, starting at // file offset off. It returns the number of bytes read. Like @@ -506,6 +507,16 @@ type File interface { // * At least one reference must be held on all written pages. BufferWriteAt(off uint64, src []byte) (uint64, error) + // DataFD blocks until offsets fr in the file contain valid data, then + // returns the file descriptor represented by the File. + // + // Note that fr.Start and fr.End need not be page-aligned. + // + // Preconditions: + // * fr.Length() > 0. + // * At least one reference must be held on all pages in fr. + DataFD(fr FileRange) (int, error) + // FD returns the file descriptor represented by the File. The returned // file descriptor should not be used to implement // platform.AddressSpace.MapFile, since the contents of the File may not be @@ -513,6 +524,15 @@ type File interface { FD() int } +// DefaultMemoryType implements File.MemoryType() for implementations of File +// backed by ordinary system memory. +type DefaultMemoryType struct{} + +// MemoryType implements File.MemoryType. +func (DefaultMemoryType) MemoryType() hostarch.MemoryType { + return hostarch.MemoryTypeWriteBack +} + // BufferedIOFallbackErr is returned (by value) by implementations of // File.MapInternal() that cannot succeed, but can still support memory-mapped // I/O by falling back to buffered reads and writes. 
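// Hedged sketch, not part of the patch: File implementations backed by
// ordinary system RAM satisfy the new MemoryType requirement by embedding
// DefaultMemoryType (as the erofs, gofer, kernfs, and pgalloc changes in this
// patch do), while files mapping device registers implement it directly;
// files that cannot support MapInternal at all can instead embed
// NoMapInternal, introduced just below. The type here is hypothetical, with
// the remaining File methods omitted.
type exampleMMIOFile struct {
	NoBufferedIOFallback
}

// MemoryType implements File.MemoryType.
func (*exampleMMIOFile) MemoryType() hostarch.MemoryType {
	// Device MMIO must not be mapped write-back cacheable; compare
	// pciDeviceFdMemmapFile.MemoryType in the tpuproxy changes above.
	return hostarch.MemoryTypeUncached
}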
@@ -538,6 +558,30 @@ func (NoBufferedIOFallback) BufferWriteAt(off uint64, src []byte) (uint64, error panic("unimplemented: memmap.File.MapInternal() should not have returned BufferedIOFallbackErr") } +// NoMapInternal implements File.MapInternal(), File.MemoryType(), +// File.BufferReadAt(), and File.BufferWriteAt() for implementations of File +// that do not support MapInternal. +type NoMapInternal struct { + NoBufferedIOFallback +} + +// MapInternal implements File.MapInternal. +func (NoMapInternal) MapInternal(fr FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { + // There is no equivalent to this situation in Linux, and hence no clear + // errno to return. We choose ENODEV since mmap() returns this in a + // somewhat similar case (mmap() called on a non-mmappable file), and + // ENODEV is relatively uncommon (compared to e.g. EINVAL) so it should be + // somewhat more distinctive if it results in an application-reported + // error. + log.Traceback("no memmap.File.MapInternal implementation available, returning ENODEV") + return safemem.BlockSeq{}, linuxerr.ENODEV +} + +// MemoryType implements File.MemoryType. +func (NoMapInternal) MemoryType() hostarch.MemoryType { + panic("memmap.File.MemoryType called without MapInternal support") +} + // FileRange represents a range of uint64 offsets into a File. // // type FileRange diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go index d927b17026..0e7fa82a9f 100644 --- a/pkg/sentry/mm/debug.go +++ b/pkg/sentry/mm/debug.go @@ -91,6 +91,6 @@ func (pseg pmaIterator) debugStringEntryLocked() []byte { b.WriteByte('s') } - fmt.Fprintf(&b, " %08x %T\n", pma.off, pma.file) + fmt.Fprintf(&b, " %s %08x %T\n", pma.file.MemoryType().ShortString(), pma.off, pma.file) return b.Bytes() } diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index e7d8f8dc6f..281389c97e 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -43,6 +43,7 @@ const pagesPerHugePage = hostarch.HugePageSize / hostarch.PageSize // MemoryFile is a memmap.File whose pages may be allocated to arbitrary // users. type MemoryFile struct { + memmap.DefaultMemoryType memmap.NoBufferedIOFallback // MemoryFile owns a single backing file. Each page in the backing file is diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 1b16dcb3c5..41dc9e6a2f 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -98,8 +98,9 @@ func (as *addressSpace) Touch(c *vCPU) bool { } type hostMapEntry struct { - addr uintptr - length uintptr + addr uintptr + length uintptr + memType hostarch.MemoryType } // mapLocked maps the given host entry. @@ -130,6 +131,7 @@ func (as *addressSpace) mapLocked(addr hostarch.Addr, m hostMapEntry, at hostarc inv = as.pageTables.Map(addr, length, pagetables.MapOpts{ AccessType: at, User: true, + MemoryType: m.memType, }, physical) || inv m.addr += length m.length -= length @@ -161,6 +163,7 @@ func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.Fil if err != nil { return err } + mt := f.MemoryType() // See block in mapLocked. as.pageTables.Allocator.(*allocator).cpu = as.machine.Get() @@ -186,8 +189,9 @@ func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.Fil // Perform the mapping. 
prev := as.mapLocked(addr, hostMapEntry{ - addr: b.Addr(), - length: uintptr(b.Len()), + addr: b.Addr(), + length: uintptr(b.Len()), + memType: mt, }, at) inv = inv || prev addr += hostarch.Addr(b.Len()) diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go index fa51e9180b..de40b68de4 100644 --- a/pkg/sentry/platform/kvm/kvm_const_arm64.go +++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go @@ -119,12 +119,6 @@ const ( // Arm64: Memory Attribute Indirection Register EL1. const ( - _MT_DEVICE_nGnRnE = 0 - _MT_DEVICE_nGnRE = 1 - _MT_DEVICE_GRE = 2 - _MT_NORMAL_NC = 3 - _MT_NORMAL = 4 - _MT_NORMAL_WT = 5 _MT_ATTR_DEVICE_nGnRnE = 0x00 _MT_ATTR_DEVICE_nGnRE = 0x04 _MT_ATTR_DEVICE_GRE = 0x0c @@ -132,7 +126,6 @@ const ( _MT_ATTR_NORMAL_WT = 0xbb _MT_ATTR_NORMAL = 0xff _MT_ATTR_MASK = 0xff - _MT_EL1_INIT = (_MT_ATTR_DEVICE_nGnRnE << (_MT_DEVICE_nGnRnE * 8)) | (_MT_ATTR_DEVICE_nGnRE << (_MT_DEVICE_nGnRE * 8)) | (_MT_ATTR_DEVICE_GRE << (_MT_DEVICE_GRE * 8)) | (_MT_ATTR_NORMAL_NC << (_MT_NORMAL_NC * 8)) | (_MT_ATTR_NORMAL << (_MT_NORMAL * 8)) | (_MT_ATTR_NORMAL_WT << (_MT_NORMAL_WT * 8)) ) const ( diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index cb75ddf3c2..76c1179871 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -141,6 +141,11 @@ func (c *vCPU) initArchState() error { return err } + // Set up the PAT as required by ring0/pagetables. + if err := c.setPAT(); err != nil { + return err + } + // Set the entrypoint for the kernel. kernelUserRegs.RIP = uint64(ring0.AddrOfStart()) kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer()) diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 0d28780022..8452cc7241 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -23,6 +23,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/hostsyscall" ) @@ -72,6 +73,36 @@ func (c *vCPU) setCPUID() error { return nil } +func (c *vCPU) setPAT() error { + // See Intel SDM Vol. 3, Sec. 13.12.2 "IA32_PAT MSR", or AMD64 APM Vol. 2, + // Sec. 7.8.1 "PAT Register". + const ( + _MSR_IA32_PAT = 0x277 + + _PA_UC = 0x00 + _PA_WC = 0x01 + _PA_WB = 0x06 + ) + registers := modelControlRegisters{ + nmsrs: 1, + } + registers.entries[0].index = _MSR_IA32_PAT + if hostarch.NumMemoryTypes != 3 { + panic("additional memory types must be configured in PAT") + } + registers.entries[0].data |= _PA_WB << (hostarch.MemoryTypeWriteBack * 8) + registers.entries[0].data |= _PA_WC << (hostarch.MemoryTypeWriteCombine * 8) + registers.entries[0].data |= _PA_UC << (hostarch.MemoryTypeUncached * 8) + if errno := hostsyscall.RawSyscallErrno( + unix.SYS_IOCTL, + uintptr(c.fd), + KVM_SET_MSRS, + uintptr(unsafe.Pointer(®isters))); errno != 0 { + return fmt.Errorf("error setting PAT: %v", errno) + } + return nil +} + // getTSCFreq gets the TSC frequency. // // If mustSucceed is true, then this function panics on error. 
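// Hedged worked example, not part of the patch: with the layout used by
// setPAT() above (PAT entry i occupies bits [8i+7:8i] of IA32_PAT), the MSR
// value written works out to 0x106, as the hypothetical helper below shows.
// Combined with the PTE encoding in pagetables_x86.go, a mapping with
// hostarch.MemoryTypeWriteCombine sets only PWT, selecting PAT entry 1 (WC).
func examplePATValue() uint64 {
	const (
		_PA_UC = 0x00
		_PA_WC = 0x01
		_PA_WB = 0x06
	)
	// WB in entry 0, WC in entry 1, UC in entry 2; entries 3-7 stay 0 (UC).
	return _PA_WB<<(0*8) | _PA_WC<<(1*8) | _PA_UC<<(2*8) // == 0x0000000000000106
}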
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index 4420e6fb80..53f2924e27 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -91,7 +91,13 @@ func (c *vCPU) initArchState() error { } // mair_el1 - data = _MT_EL1_INIT + if hostarch.NumMemoryTypes != 3 { + panic("additional memory types must be configured in MAIR") + } + data = 0 + data |= _MT_ATTR_NORMAL << (hostarch.MemoryTypeWriteBack * 8) + data |= _MT_ATTR_NORMAL_NC << (hostarch.MemoryTypeWriteCombine * 8) + data |= _MT_ATTR_DEVICE_nGnRnE << (hostarch.MemoryTypeUncached * 8) reg.id = _KVM_ARM64_REGS_MAIR_EL1 if err := c.setOneRegister(®); err != nil { return err
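// Hedged sketch, not part of the patch: the three assignments above give
// MAIR_EL1 the byte layout
//
//	byte 0: _MT_ATTR_NORMAL        (hostarch.MemoryTypeWriteBack == 0)
//	byte 1: _MT_ATTR_NORMAL_NC     (hostarch.MemoryTypeWriteCombine == 1)
//	byte 2: _MT_ATTR_DEVICE_nGnRnE (hostarch.MemoryTypeUncached == 2)
//
// while pagetables_aarch64.go stores the same MemoryType value in each PTE's
// AttrIndx field (bits [4:2], via attrIndxShift/attrIndxMask), so a PTE's
// AttrIndx directly indexes the matching MAIR_EL1 byte; pteVal below is a
// hypothetical raw PTE value:
//
//	attrIndx := (pteVal >> attrIndxShift) & attrIndxMask
//	// MAIR_EL1 byte attrIndx holds the memory attributes for this mapping,
//	// e.g. hostarch.MemoryTypeUncached (2) -> _MT_ATTR_DEVICE_nGnRnE.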