diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index d0a51c663c803f..a09d8264c02216 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -1270,6 +1270,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #endif // !TARGET_XARCH
 
     void genLclHeap(GenTree* tree);
+    void genCodeForMemmove(GenTreeBlk* tree);
 
     bool genIsRegCandidateLocal(GenTree* tree)
     {
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 976ee0eddcb387..e3a55a3a417dc0 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -2554,6 +2554,152 @@ void CodeGen::genStackPointerDynamicAdjustmentWithProbe(regNumber regSpDelta)
     inst_Mov(TYP_I_IMPL, REG_SPBASE, regSpDelta, /* canSkip */ false);
 }
 
+//------------------------------------------------------------------------
+// genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
+//    ignore the fact that dst and src might overlap if we save the whole
+//    src to temp regs in advance, e.g. for memmove(dst: rcx, src: rax, len: 120):
+//
+//       vmovdqu  ymm0, ymmword ptr[rax + 0]
+//       vmovdqu  ymm1, ymmword ptr[rax + 32]
+//       vmovdqu  ymm2, ymmword ptr[rax + 64]
+//       vmovdqu  ymm3, ymmword ptr[rax + 88]
+//       vmovdqu  ymmword ptr[rcx + 0],  ymm0
+//       vmovdqu  ymmword ptr[rcx + 32], ymm1
+//       vmovdqu  ymmword ptr[rcx + 64], ymm2
+//       vmovdqu  ymmword ptr[rcx + 88], ymm3
+//
+// Arguments:
+//    tree - GenTreeBlk node
+//
+void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
+{
+    // Not yet finished for x86
+    assert(TARGET_POINTER_SIZE == 8);
+
+    // TODO-CQ: Support addressing modes, for now we don't use them
+    GenTreeIndir* srcIndir = tree->Data()->AsIndir();
+    assert(srcIndir->isContained() && !srcIndir->Addr()->isContained());
+
+    regNumber dst  = genConsumeReg(tree->Addr());
+    regNumber src  = genConsumeReg(srcIndir->Addr());
+    unsigned  size = tree->Size();
+
+    // TODO-XARCH-AVX512: Consider enabling it here
+    unsigned simdSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
+                            ? YMM_REGSIZE_BYTES
+                            : XMM_REGSIZE_BYTES;
+
+    if (size >= simdSize)
+    {
+        // Number of SIMD regs needed to save the whole src to regs.
+        unsigned numberOfSimdRegs = tree->AvailableTempRegCount(RBM_ALLFLOAT);
+
+        // Lowering takes care to only introduce this node such that we will always have enough
+        // temporary SIMD registers to fully load the source and avoid any potential issues with overlap.
+        assert(numberOfSimdRegs * simdSize >= size);
+
+        // Pop all temp regs to a local array; currently, this impl is limited by LSRA's MaxInternalCount
+        regNumber tempRegs[LinearScan::MaxInternalCount] = {};
+        for (unsigned i = 0; i < numberOfSimdRegs; i++)
+        {
+            tempRegs[i] = tree->ExtractTempReg(RBM_ALLFLOAT);
+        }
+
+        auto emitSimdLoadStore = [&](bool load) {
+            unsigned    offset   = 0;
+            int         regIndex = 0;
+            instruction simdMov  = simdUnalignedMovIns();
+            do
+            {
+                if (load)
+                {
+                    // vmovdqu  ymm, ymmword ptr[src + offset]
+                    GetEmitter()->emitIns_R_AR(simdMov, EA_ATTR(simdSize), tempRegs[regIndex++], src, offset);
+                }
+                else
+                {
+                    // vmovdqu  ymmword ptr[dst + offset], ymm
+                    GetEmitter()->emitIns_AR_R(simdMov, EA_ATTR(simdSize), tempRegs[regIndex++], dst, offset);
+                }
+                offset += simdSize;
+                if (size == offset)
+                {
+                    break;
+                }
+
+                assert(size > offset);
+                if ((size - offset) < simdSize)
+                {
+                    // Overlap with the previously processed data. We'll always use SIMD for that for simplicity.
+                    // TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
+                    offset = size - simdSize;
+                }
+            } while (true);
+        };
+
+        // load everything from SRC to temp regs
+        emitSimdLoadStore(/* load */ true);
+        // store them to DST
+        emitSimdLoadStore(/* load */ false);
+    }
+    else
+    {
+        // Here we work with size 1..15 (x64)
+        assert((size > 0) && (size < XMM_REGSIZE_BYTES));
+
+        auto emitScalarLoadStore = [&](bool load, int size, regNumber tempReg, int offset) {
+            var_types memType;
+            switch (size)
+            {
+                case 1:
+                    memType = TYP_UBYTE;
+                    break;
+                case 2:
+                    memType = TYP_USHORT;
+                    break;
+                case 4:
+                    memType = TYP_INT;
+                    break;
+                case 8:
+                    memType = TYP_LONG;
+                    break;
+                default:
+                    unreached();
+            }
+
+            if (load)
+            {
+                // mov  reg, qword ptr [src + offset]
+                GetEmitter()->emitIns_R_AR(ins_Load(memType), emitTypeSize(memType), tempReg, src, offset);
+            }
+            else
+            {
+                // mov  qword ptr [dst + offset], reg
+                GetEmitter()->emitIns_AR_R(ins_Store(memType), emitTypeSize(memType), tempReg, dst, offset);
+            }
+        };
+
+        // Use overlapping loads/stores, e.g. for size == 9: "mov [dst], tmpReg1; mov [dst+1], tmpReg2".
+        unsigned loadStoreSize = 1 << BitOperations::Log2(size);
+        if (loadStoreSize == size)
+        {
+            regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT);
+            emitScalarLoadStore(/* load */ true, loadStoreSize, tmpReg, 0);
+            emitScalarLoadStore(/* load */ false, loadStoreSize, tmpReg, 0);
+        }
+        else
+        {
+            assert(tree->AvailableTempRegCount() == 2);
+            regNumber tmpReg1 = tree->ExtractTempReg(RBM_ALLINT);
+            regNumber tmpReg2 = tree->ExtractTempReg(RBM_ALLINT);
+            emitScalarLoadStore(/* load */ true, loadStoreSize, tmpReg1, 0);
+            emitScalarLoadStore(/* load */ true, loadStoreSize, tmpReg2, size - loadStoreSize);
+            emitScalarLoadStore(/* load */ false, loadStoreSize, tmpReg1, 0);
+            emitScalarLoadStore(/* load */ false, loadStoreSize, tmpReg2, size - loadStoreSize);
+        }
+    }
+}
+
 //------------------------------------------------------------------------
 // genLclHeap: Generate code for localloc.
 //
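For readers outside the JIT, the same overlap-safe strategy can be sketched in plain C#. This is an illustration only (the class and method names and the fixed sizes are made up for this note, not part of the change): the SIMD path reads the whole source into temporaries before any store, and the scalar path covers a non-power-of-two size with two overlapping accesses.

    using System.Runtime.CompilerServices;
    using System.Runtime.Intrinsics;

    static unsafe class MemmoveSketch
    {
        // 64 bytes: load both 32-byte halves first, then store them; because nothing is
        // written before everything has been read, overlapping dst/src cannot lose data.
        public static void Copy64(byte* dst, byte* src)
        {
            Vector256<byte> t0 = Unsafe.ReadUnaligned<Vector256<byte>>(src);
            Vector256<byte> t1 = Unsafe.ReadUnaligned<Vector256<byte>>(src + 32);
            Unsafe.WriteUnaligned(dst, t0);
            Unsafe.WriteUnaligned(dst + 32, t1);
        }

        // 9 bytes: two 8-byte accesses that overlap by 7 bytes, mirroring
        // "mov [dst], tmpReg1; mov [dst+1], tmpReg2" from the scalar path above.
        public static void Copy9(byte* dst, byte* src)
        {
            ulong lo = Unsafe.ReadUnaligned<ulong>(src);
            ulong hi = Unsafe.ReadUnaligned<ulong>(src + 1);
            Unsafe.WriteUnaligned(dst, lo);
            Unsafe.WriteUnaligned(dst + 1, hi);
        }
    }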
@@ -2921,6 +3067,7 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
                 genCodeForInitBlkRepStos(storeBlkNode);
             }
             break;
+        case GenTreeBlk::BlkOpKindUnrollMemmove:
        case GenTreeBlk::BlkOpKindUnroll:
            if (isCopyBlk)
            {
@@ -2930,7 +3077,15 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
                 GetEmitter()->emitDisableGC();
             }
 #endif
-            genCodeForCpBlkUnroll(storeBlkNode);
+            if (storeBlkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
+            {
+                genCodeForCpBlkUnroll(storeBlkNode);
+            }
+            else
+            {
+                assert(storeBlkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnrollMemmove);
+                genCodeForMemmove(storeBlkNode);
+            }
 #ifndef JIT32_GCENCODER
             if (storeBlkNode->gtBlkOpGcUnsafe)
             {
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 8a6f8e775ae144..29a727d0ee75bd 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8925,8 +8925,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 public:
     enum UnrollKind
     {
-        Memset, // Initializing memory with some value
-        Memcpy  // Copying memory from src to dst
+        Memset,
+        Memcpy,
+        Memmove
     };
 
     //------------------------------------------------------------------------
@@ -8956,7 +8957,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
             threshold *= 2;
 #elif defined(TARGET_XARCH)
             // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
-            threshold = max(threshold, YMM_REGSIZE_BYTES);
+            threshold = min(threshold, YMM_REGSIZE_BYTES);
 #endif
         }
 #if defined(TARGET_XARCH)
@@ -8989,7 +8990,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     //
     // We might want to use a different multiplier for trully hot/cold blocks based on PGO data
     //
-    return threshold * 4;
+    threshold *= 4;
+
+    // NOTE: Memmove's unrolling is currently limited by LSRA to at most
+    // LinearScan::MaxInternalCount temp regs, e.g. 5*32=160 bytes for an AVX CPU.
+    return threshold;
 }
 
 //------------------------------------------------------------------------
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index f811a72061f9a8..1f291dec92f772 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -11977,6 +11977,10 @@ void Compiler::gtDispTree(GenTree* tree,
                 case GenTreeBlk::BlkOpKindUnroll:
                     printf(" (Unroll)");
                     break;
+
+                case GenTreeBlk::BlkOpKindUnrollMemmove:
+                    printf(" (Memmove)");
+                    break;
 #ifndef TARGET_X86
                 case GenTreeBlk::BlkOpKindHelper:
                     printf(" (Helper)");
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 7931644f0478cf..263e21fe556fe9 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -7322,6 +7322,7 @@ struct GenTreeBlk : public GenTreeIndir
         BlkOpKindRepInstr,
 #endif
         BlkOpKindUnroll,
+        BlkOpKindUnrollMemmove,
     } gtBlkOpKind;
 
 #ifndef JIT32_GCENCODER
diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp
index 9b18d14a12f333..cb75aefc5a6e52 100644
--- a/src/coreclr/jit/importercalls.cpp
+++ b/src/coreclr/jit/importercalls.cpp
@@ -3810,6 +3810,13 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis,
                 break;
             }
 
+            case NI_System_Buffer_Memmove:
+            {
+                // We'll try to unroll this in Lowering for constant input.
+                isSpecial = true;
+                break;
+            }
+
             case NI_System_BitConverter_DoubleToInt64Bits:
             {
                 GenTree* op1 = impStackTop().val;
@@ -7903,6 +7910,13 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method)
                     result = NI_System_BitConverter_Int64BitsToDouble;
                 }
             }
+            else if (strcmp(className, "Buffer") == 0)
+            {
+                if (strcmp(methodName, "Memmove") == 0)
+                {
+                    result = NI_System_Buffer_Memmove;
+                }
+            }
             break;
         }
 
diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp
index 6894843d8f702a..7d01790ddf2b47 100644
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@@ -473,8 +473,14 @@ GenTree* Lowering::LowerNode(GenTree* node)
             return LowerSwitch(node);
 
         case GT_CALL:
-            LowerCall(node);
-            break;
+        {
+            GenTree* newNode = LowerCall(node);
+            if (newNode != nullptr)
+            {
+                return newNode;
+            }
+        }
+        break;
 
         case GT_LT:
         case GT_LE:
@@ -1775,6 +1781,56 @@ GenTree* Lowering::AddrGen(void* addr)
     return AddrGen((ssize_t)addr);
 }
 
+//------------------------------------------------------------------------
+// LowerCallMemmove: Replace Buffer.Memmove(DST, SRC, CNS_SIZE) with a GT_STORE_BLK:
+//
+//    *  STORE_BLK struct (copy) (Memmove)
+//    +--*  LCL_VAR   byref  dst
+//    \--*  IND struct
+//       \--*  LCL_VAR   byref  src
+//
+// Arguments:
+//    call - GenTreeCall node to replace with STORE_BLK
+//
+GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
+{
+    assert(comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove);
+    assert(call->gtArgs.CountArgs() == 3);
+
+    GenTree* lengthArg = call->gtArgs.GetArgByIndex(2)->GetNode();
+    if (lengthArg->IsIntegralConst())
+    {
+        ssize_t cnsSize = lengthArg->AsIntCon()->IconValue();
+        // TODO-CQ: drop the whole thing in case of 0
+        if ((cnsSize > 0) && (cnsSize <= (ssize_t)comp->getUnrollThreshold(Compiler::UnrollKind::Memmove)))
+        {
+            GenTree* dstAddr = call->gtArgs.GetArgByIndex(0)->GetNode();
+            GenTree* srcAddr = call->gtArgs.GetArgByIndex(1)->GetNode();
+
+            // TODO-CQ: Try to create an addressing mode
+            GenTreeIndir* srcBlk = comp->gtNewIndir(TYP_STRUCT, srcAddr);
+            srcBlk->gtFlags |= GTF_GLOB_REF;
+            srcBlk->SetContained();
+
+            GenTreeBlk* storeBlk = new (comp, GT_STORE_BLK)
+                GenTreeBlk(GT_STORE_BLK, TYP_STRUCT, dstAddr, srcBlk, comp->typGetBlkLayout((unsigned)cnsSize));
+            storeBlk->gtFlags |= (GTF_BLK_UNALIGNED | GTF_ASG | GTF_EXCEPT | GTF_GLOB_REF);
+
+            // TODO-CQ: Use GenTreeObj::BlkOpKindUnroll here if srcAddr and dstAddr don't overlap; then we can
+            // unroll this memmove as memcpy - it doesn't require lots of temp registers
+            storeBlk->gtBlkOpKind = GenTreeObj::BlkOpKindUnrollMemmove;
+
+            BlockRange().InsertBefore(call, srcBlk);
+            BlockRange().InsertBefore(call, storeBlk);
+            BlockRange().Remove(lengthArg);
+            BlockRange().Remove(call);
+
+            return storeBlk;
+        }
+    }
+    return nullptr;
+}
+
 // do lowering steps for a call
 // this includes:
 //   - adding the placement nodes (either stack or register variety) for arguments
@@ -1782,7 +1838,7 @@ GenTree* Lowering::AddrGen(void* addr)
 //   - adding nodes for other operations that occur after the call sequence starts and before
 //        control transfer occurs (profiling and tail call helpers, pinvoke incantations)
 //
-void Lowering::LowerCall(GenTree* node)
+GenTree* Lowering::LowerCall(GenTree* node)
 {
     GenTreeCall* call = node->AsCall();
 
@@ -1793,6 +1849,20 @@ GenTree* Lowering::LowerCall(GenTree* node)
     // All runtime lookups are expected to be expanded in fgExpandRuntimeLookups
     assert(!call->IsExpRuntimeLookup());
 
+    if (call->gtCallMoreFlags & GTF_CALL_M_SPECIAL_INTRINSIC)
+    {
+#ifdef TARGET_AMD64
+        if (comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove)
+        {
+            GenTree* newNode = LowerCallMemmove(call);
+            if (newNode != nullptr)
+            {
+                return newNode->gtNext;
+            }
+        }
+#endif
+    }
+
     call->ClearOtherRegs();
     LowerArgsForCall(call);
 
@@ -1911,6 +1981,7 @@ void Lowering::LowerCall(GenTree* node)
     JITDUMP("lowering call (after):\n");
     DISPTREERANGE(BlockRange(), call);
     JITDUMP("\n");
+    return nullptr;
 }
 
 // Inserts profiler hook, GT_PROF_HOOK for a tail call node.
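As a usage-level illustration of the shape of managed code LowerCallMemmove targets (a hypothetical helper, not code from this change): Span.CopyTo bottoms out in Buffer.Memmove, and once the length is a JIT-time constant no larger than getUnrollThreshold(Memmove), the call is replaced with the STORE_BLK described above. This is exactly the pattern the new BufferMemmove.cs test exercises.

    static void CopyFixedHeader(byte[] dst, byte[] src)
    {
        // The 120-byte constant length matches the memmove(dst: rcx, src: rax, len: 120)
        // example in the codegen comment: with AVX it fits into four YMM temporaries
        // and is unrolled instead of calling the Buffer.Memmove helper.
        src.AsSpan(0, 120).CopyTo(dst);
    }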
diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h
index 2a7eb25ca9c7d9..4b67851bde8e59 100644
--- a/src/coreclr/jit/lower.h
+++ b/src/coreclr/jit/lower.h
@@ -126,7 +126,8 @@ class Lowering final : public Phase
     // ------------------------------
     // Call Lowering
     // ------------------------------
-    void LowerCall(GenTree* call);
+    GenTree* LowerCall(GenTree* call);
+    GenTree* LowerCallMemmove(GenTreeCall* call);
     void LowerCFGCall(GenTreeCall* call);
     void MoveCFGCallArg(GenTreeCall* call, GenTree* node);
 #ifndef TARGET_64BIT
diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h
index 6c8b961a44c7d5..9170ec742c9aa8 100644
--- a/src/coreclr/jit/lsra.h
+++ b/src/coreclr/jit/lsra.h
@@ -1831,12 +1831,15 @@ class LinearScan : public LinearScanInterface
     RefPosition* tgtPrefUse  = nullptr;
     RefPosition* tgtPrefUse2 = nullptr;
 
+public:
     // The following keep track of information about internal (temporary register) intervals
     // during the building of a single node.
     static const int MaxInternalCount = 5;
-    RefPosition*     internalDefs[MaxInternalCount];
-    int              internalCount = 0;
-    bool             setInternalRegsDelayFree;
+
+private:
+    RefPosition* internalDefs[MaxInternalCount];
+    int          internalCount = 0;
+    bool         setInternalRegsDelayFree;
 
     // When a RefTypeUse is marked as 'delayRegFree', we also want to mark the RefTypeDef
     // in the next Location as 'hasInterferingUses'. This is accomplished by setting this
diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp
index a6fbce9b40cb4a..dc05ffab88c6db 100644
--- a/src/coreclr/jit/lsrabuild.cpp
+++ b/src/coreclr/jit/lsrabuild.cpp
@@ -964,6 +964,7 @@ regMaskTP LinearScan::getKillSetForBlockStore(GenTreeBlk* blkNode)
             }
             break;
 #endif
+        case GenTreeBlk::BlkOpKindUnrollMemmove:
         case GenTreeBlk::BlkOpKindUnroll:
         case GenTreeBlk::BlkOpKindInvalid:
             // for these 'gtBlkOpKind' kinds, we leave 'killMask' = RBM_NONE
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 78bab951c3f99f..053dd1f1850fa5 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1509,6 +1509,68 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
             }
             break;
 
+        case GenTreeBlk::BlkOpKindUnrollMemmove:
+        {
+            // Prepare SIMD/GPR registers needed to perform an unrolled memmove. The idea is that
+            // we can ignore the fact that dst and src might overlap if we save the whole src
+            // to temp regs in advance, e.g. for memmove(dst: rcx, src: rax, len: 120):
+            //
+            //       vmovdqu  ymm0, ymmword ptr[rax + 0]
+            //       vmovdqu  ymm1, ymmword ptr[rax + 32]
+            //       vmovdqu  ymm2, ymmword ptr[rax + 64]
+            //       vmovdqu  ymm3, ymmword ptr[rax + 88]
+            //       vmovdqu  ymmword ptr[rcx + 0],  ymm0
+            //       vmovdqu  ymmword ptr[rcx + 32], ymm1
+            //       vmovdqu  ymmword ptr[rcx + 64], ymm2
+            //       vmovdqu  ymmword ptr[rcx + 88], ymm3
+            //
+
+            // Not yet finished for x86
+            assert(TARGET_POINTER_SIZE == 8);
+
+            // Lowering was expected to get rid of memmove in case of zero
+            assert(size > 0);
+
+            // TODO-XARCH-AVX512: Consider enabling it here
+            unsigned simdSize =
+                (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
+                    ? YMM_REGSIZE_BYTES
+                    : XMM_REGSIZE_BYTES;
+
+            if (size >= simdSize)
+            {
+                unsigned simdRegs = size / simdSize;
+                if ((size % simdSize) != 0)
+                {
+                    // TODO-CQ: Consider using GPR load/store here if the remainder is 1, 2, 4 or 8,
+                    // especially if we enable AVX-512
+                    simdRegs++;
+                }
+                for (unsigned i = 0; i < simdRegs; i++)
+                {
+                    // It's too late to revert the unrolling, so we expect to have enough SIMD regs -
+                    // no more than MaxInternalCount. Currently, this is ensured by getUnrollThreshold(Memmove).
+                    buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
+                }
+                SetContainsAVXFlags();
+            }
+            else
+            {
+                if (isPow2(size))
+                {
+                    // Single GPR for 1,2,4,8
+                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
+                }
+                else
+                {
+                    // Any size from 3 to 15 can be handled via two GPRs
+                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
+                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
+                }
+            }
+        }
+        break;
+
         case GenTreeBlk::BlkOpKindRepInstr:
             dstAddrRegMask = RBM_RDI;
             srcRegMask     = RBM_RSI;
diff --git a/src/coreclr/jit/namedintrinsiclist.h b/src/coreclr/jit/namedintrinsiclist.h
index b80a1b3254aef1..ff17b3d6c770cd 100644
--- a/src/coreclr/jit/namedintrinsiclist.h
+++ b/src/coreclr/jit/namedintrinsiclist.h
@@ -20,6 +20,8 @@ enum NamedIntrinsic : unsigned short
     NI_System_BitConverter_Int64BitsToDouble,
     NI_System_BitConverter_SingleToInt32Bits,
 
+    NI_System_Buffer_Memmove,
+
     NI_SYSTEM_MATH_START,
     NI_System_Math_Abs,
     NI_System_Math_Acos,
diff --git a/src/libraries/System.Private.CoreLib/src/System/Buffer.cs b/src/libraries/System.Private.CoreLib/src/System/Buffer.cs
index af8ea7b4629e89..8328c186103785 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Buffer.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Buffer.cs
@@ -127,6 +127,7 @@ public static unsafe void MemoryCopy(void* source, void* destination, ulong dest
             Memmove(ref *(byte*)destination, ref *(byte*)source, checked((nuint)sourceBytesToCopy));
         }
 
+        [Intrinsic] // Unrolled for small constant lengths
         internal static void Memmove(ref byte dest, ref byte src, nuint len)
         {
             // P/Invoke into the native version when the buffers are overlapping.
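The unrolled expansion has to keep memmove semantics, which the TestMemmoveOverlap cases in the new test below verify. As a small illustration (a hypothetical helper, not part of this change), overlapping spans of the same array must behave like memmove rather than memcpy:

    static void ShiftLeftByOne(byte[] buffer)
    {
        // dst = buffer[0..16] overlaps src = buffer[1..17] by 15 bytes; the constant
        // 16-byte length makes this a candidate for the unrolled memmove, and the
        // load-everything-first scheme keeps the copy correct despite the overlap.
        buffer.AsSpan(1, 16).CopyTo(buffer.AsSpan(0, 16));
    }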
diff --git a/src/tests/JIT/opt/Vectorization/BufferMemmove.cs b/src/tests/JIT/opt/Vectorization/BufferMemmove.cs
new file mode 100644
index 00000000000000..f6c71fa00f8f6e
--- /dev/null
+++ b/src/tests/JIT/opt/Vectorization/BufferMemmove.cs
@@ -0,0 +1,156 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Linq;
+using System.Runtime.CompilerServices;
+
+unsafe class BufferMemmoveUnrolling
+{
+    static int Main()
+    {
+        // Carefully test 0..32
+        TestMemmove((dst, src) => src.AsSpan(0, 0).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(0)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 1).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(1)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 2).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(2)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 3).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(3)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 4).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(4)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 5).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(5)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 6).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(6)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 7).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(7)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 8).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(8)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 9).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(9)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 10).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(10)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 11).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(11)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 12).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(12)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 13).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(13)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 14).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(14)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 15).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(15)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 16).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(16)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 17).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(17)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 18).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(18)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 19).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(19)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 20).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(20)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 21).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(21)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 22).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(22)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 23).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(23)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 24).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(24)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 25).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(25)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 26).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(26)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 27).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(27)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 28).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(28)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 29).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(29)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 30).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(30)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 31).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(31)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 32).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(32)).CopyTo(dst));
+
+        // Some large simds
+        TestMemmove((dst, src) => src.AsSpan(0, 33).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(33)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 63).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(63)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 64).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(64)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 65).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(65)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 127).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(127)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 128).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(128)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 129).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(129)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 159).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(159)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 160).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(160)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 161).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(161)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 255).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(255)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 256).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(256)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 257).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(257)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 511).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(511)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 512).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(512)).CopyTo(dst));
+
+        // A couple of tests for overlapped pointers
+        TestMemmoveOverlap(
+            (dst, src) => new Span<byte>((void*)src, 1).CopyTo(new Span<byte>((void*)dst, 1)),
+            (dst, src) => new Span<byte>((void*)src, ToVar(1)).CopyTo(new Span<byte>((void*)dst, ToVar(1))));
+        TestMemmoveOverlap(
+            (dst, src) => new Span<byte>((void*)src, 8).CopyTo(new Span<byte>((void*)dst, 8)),
+            (dst, src) => new Span<byte>((void*)src, ToVar(8)).CopyTo(new Span<byte>((void*)dst, ToVar(8))));
+        TestMemmoveOverlap(
+            (dst, src) => new Span<byte>((void*)src, 10).CopyTo(new Span<byte>((void*)dst, 10)),
+            (dst, src) => new Span<byte>((void*)src, ToVar(10)).CopyTo(new Span<byte>((void*)dst, ToVar(10))));
+        TestMemmoveOverlap(
+            (dst, src) => new Span<byte>((void*)src, 17).CopyTo(new Span<byte>((void*)dst, 17)),
+            (dst, src) => new Span<byte>((void*)src, ToVar(17)).CopyTo(new Span<byte>((void*)dst, ToVar(17))));
+        TestMemmoveOverlap(
+            (dst, src) => new Span<byte>((void*)src, 64).CopyTo(new Span<byte>((void*)dst, 64)),
+            (dst, src) => new Span<byte>((void*)src, ToVar(64)).CopyTo(new Span<byte>((void*)dst, ToVar(64))));
+        TestMemmoveOverlap(
+            (dst, src) => new Span<byte>((void*)src, 120).CopyTo(new Span<byte>((void*)dst, 120)),
+            (dst, src) => new Span<byte>((void*)src, ToVar(120)).CopyTo(new Span<byte>((void*)dst, ToVar(120))));
+        TestMemmoveOverlap(
+            (dst, src) => new Span<byte>((void*)src, 256).CopyTo(new Span<byte>((void*)dst, 256)),
+            (dst, src) => new Span<byte>((void*)src, ToVar(256)).CopyTo(new Span<byte>((void*)dst, ToVar(256))));
+
+        return 100;
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static T ToVar<T>(T t) => t;
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void TestMemmove(Action<byte[], byte[]> testAction, Action<byte[], byte[]> refAction)
+    {
+        // Managed arrays here also test GC info in the tests (under GCStress)
+        byte[] dst1 = new byte[512];
+        byte[] src1 = new byte[512];
+        dst1.AsSpan().Fill(0xB0);
+        src1.AsSpan().Fill(0x0C);
+
+        // Clone them for the "reference" action
+        byte[] dst2 = (byte[])dst1.Clone();
+        byte[] src2 = (byte[])src1.Clone();
+
+        testAction(dst1, src1);
+        refAction(dst2, src2);
+
+        // Make sure testAction and refAction modified the same elements
+        // and src wasn't changed
+        if (!src1.SequenceEqual(src2))
+            throw new InvalidOperationException("TestMemmove: src and src2 don't match");
+
+        if (!dst1.SequenceEqual(dst2))
+            throw new InvalidOperationException("TestMemmove: dst and dst2 don't match");
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void TestMemmoveOverlap(Action<IntPtr, IntPtr> testAction, Action<IntPtr, IntPtr> refAction)
+    {
+        Action<int, int> testAtOffset = (srcOffset, dstOffset) =>
+        {
+            byte[] src1 = Enumerable.Range(0, 1024).Select(i => (byte)i).ToArray();
+            byte[] src2 = (byte[])src1.Clone();
+            fixed (byte* p1 = src1)
+            {
+                fixed (byte* p2 = src2)
+                {
+                    byte* pSrc1 = p1 + srcOffset;
+                    byte* pSrc2 = p2 + srcOffset;
+                    byte* pDst1 = p1 + dstOffset;
+                    byte* pDst2 = p2 + dstOffset;
+
+                    testAction((IntPtr)pDst1, (IntPtr)pSrc1);
+                    refAction((IntPtr)pDst2, (IntPtr)pSrc2);
+                }
+            }
+            if (!src1.SequenceEqual(src2))
+                throw new InvalidOperationException("TestMemmoveOverlap: src1 and src2 don't match");
+        };
+
+        for (int i = 0; i < 32; i++)
+        {
+            testAtOffset(i, 32);
+            testAtOffset(32, i);
+        }
+        testAtOffset(0, 63);
+        testAtOffset(0, 64);
+        testAtOffset(0, 127);
+        testAtOffset(0, 128);
+        testAtOffset(128, 63);
+        testAtOffset(128, 64);
+        testAtOffset(256, 127);
+        testAtOffset(256, 128);
+    }
+}
diff --git a/src/tests/JIT/opt/Vectorization/BufferMemmove.csproj b/src/tests/JIT/opt/Vectorization/BufferMemmove.csproj
new file mode 100644
index 00000000000000..8c7132fd350c9d
--- /dev/null
+++ b/src/tests/JIT/opt/Vectorization/BufferMemmove.csproj
@@ -0,0 +1,10 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <AllowUnsafeBlocks>True</AllowUnsafeBlocks>
+    <Optimize>True</Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+</Project>