@@ -2554,6 +2554,160 @@ void CodeGen::genStackPointerDynamicAdjustmentWithProbe(regNumber regSpDelta)
     inst_Mov(TYP_I_IMPL, REG_SPBASE, regSpDelta, /* canSkip */ false);
 }
 
+//------------------------------------------------------------------------
+// genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
+//    ignore the fact that src and dst might overlap if we save the whole
+//    src to temp regs in advance, e.g. for memmove(dst: rcx, src: rax, len: 120):
+//
+//       vmovdqu  ymm0, ymmword ptr[rax +  0]
+//       vmovdqu  ymm1, ymmword ptr[rax + 32]
+//       vmovdqu  ymm2, ymmword ptr[rax + 64]
+//       vmovdqu  ymm3, ymmword ptr[rax + 88]
+//       vmovdqu  ymmword ptr[rcx +  0], ymm0
+//       vmovdqu  ymmword ptr[rcx + 32], ymm1
+//       vmovdqu  ymmword ptr[rcx + 64], ymm2
+//       vmovdqu  ymmword ptr[rcx + 88], ymm3
+//
+// Arguments:
+//    tree - GenTreeBlk node
+//
+void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
+{
+    // Not yet finished for x86
+    assert(TARGET_POINTER_SIZE == 8);
+
+    // TODO-CQ: Support addressing modes, for now we don't use them
+    GenTreeIndir* srcIndir = tree->Data()->AsIndir();
+    assert(srcIndir->isContained() && !srcIndir->Addr()->isContained());
+
+    regNumber dst  = genConsumeReg(tree->Addr());
+    regNumber src  = genConsumeReg(srcIndir->Addr());
+    unsigned  size = tree->Size();
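+    // Note: the block size is a JIT-time constant, which is what makes full unrolling possible.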
+
+    // TODO-XARCH-AVX512: Consider enabling it here
+    unsigned simdSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
+                            ? YMM_REGSIZE_BYTES
+                            : XMM_REGSIZE_BYTES;
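+    // i.e. use 32-byte YMM moves when the copy is at least one YMM register wide and AVX
+    // is available; otherwise fall back to 16-byte XMM moves.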
+
+    if (size >= simdSize)
+    {
+        // Number of SIMD regs needed to save the whole src to regs.
+        unsigned numberOfSimdRegs = tree->AvailableTempRegCount(RBM_ALLFLOAT);
+
+        // Lowering takes care to only introduce this node such that we will always have enough
+        // temporary SIMD registers to fully load the source and avoid any potential issues with overlap.
+        assert(numberOfSimdRegs * simdSize >= size);
+
+        // Pop all temp regs into a local array; for now, this implementation is limited by LSRA's MaxInternalCount.
+        regNumber tempRegs[LinearScan::MaxInternalCount] = {};
+        for (unsigned i = 0; i < numberOfSimdRegs; i++)
+        {
+            tempRegs[i] = tree->ExtractTempReg(RBM_ALLFLOAT);
+        }
+
+        auto emitSimdLoadStore = [&](bool load) {
+            unsigned    offset   = 0;
+            int         regIndex = 0;
+            instruction simdMov  = simdUnalignedMovIns();
+            do
+            {
+                if (load)
+                {
+                    // vmovdqu  ymm, ymmword ptr[src + offset]
+                    GetEmitter()->emitIns_R_AR(simdMov, EA_ATTR(simdSize), tempRegs[regIndex++], src, offset);
+                }
+                else
+                {
+                    // vmovdqu  ymmword ptr[dst + offset], ymm
+                    GetEmitter()->emitIns_AR_R(simdMov, EA_ATTR(simdSize), tempRegs[regIndex++], dst, offset);
+                }
+                offset += simdSize;
+                if (size == offset)
+                {
+                    break;
+                }
+
+                assert(size > offset);
+                if ((size - offset) < simdSize)
+                {
+                    // Overlap with the previously processed data. We'll always use SIMD for that for simplicity.
+                    // TODO-CQ: Consider using a smaller SIMD reg or a GPR for the remainder.
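+                    // e.g. size = 120, simdSize = 32: moves at offsets 0, 32, 64, then 88 (= 120 - 32),
+                    // so the last move re-covers bytes 88..95 already handled by the previous one.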
+                    offset = size - simdSize;
+                }
+            } while (true);
+        };
+
+        // load everything from SRC to temp regs
+        emitSimdLoadStore(/* load */ true);
+        // store them to DST
+        emitSimdLoadStore(/* load */ false);
+    }
+    else
+    {
+        // Here we work with size 1..15 (x64)
+        assert((size > 0) && (size < XMM_REGSIZE_BYTES));
+
+        auto emitScalarLoadStore = [&](bool load, int size, regNumber tempReg, int offset) {
+            var_types memType;
+            switch (size)
+            {
+                case 1:
+                    memType = TYP_UBYTE;
+                    break;
+                case 2:
+                    memType = TYP_USHORT;
+                    break;
+                case 4:
+                    memType = TYP_INT;
+                    break;
+                case 8:
+                    memType = TYP_LONG;
+                    break;
+                default:
+                    unreached();
+            }
+
+            if (load)
+            {
+                // mov  reg, qword ptr [src + offset]
+                GetEmitter()->emitIns_R_AR(ins_Load(memType), emitTypeSize(memType), tempReg, src, offset);
+            }
+            else
+            {
+                // mov  qword ptr [dst + offset], reg
+                GetEmitter()->emitIns_AR_R(ins_Store(memType), emitTypeSize(memType), tempReg, dst, offset);
+            }
+        };
+
+        // Use overlapping loads/stores, e.g. for size == 9: "mov [dst], tmpReg1; mov [dst+1], tmpReg2".
+        unsigned loadStoreSize = 1 << BitOperations::Log2(size);
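+        // loadStoreSize is the largest power of two not exceeding size (1, 2, 4 or 8 here).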
+        if (loadStoreSize == size)
+        {
+            regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT);
+            emitScalarLoadStore(/* load */ true, loadStoreSize, tmpReg, 0);
+            emitScalarLoadStore(/* load */ false, loadStoreSize, tmpReg, 0);
+        }
+        else
+        {
+            assert(tree->AvailableTempRegCount() == 2);
+            regNumber tmpReg1 = tree->ExtractTempReg(RBM_ALLINT);
+            regNumber tmpReg2 = tree->ExtractTempReg(RBM_ALLINT);
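+            // Both loads are emitted before either store so that an overlapping dst
+            // cannot clobber src bytes that are still needed.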
+            emitScalarLoadStore(/* load */ true, loadStoreSize, tmpReg1, 0);
+            emitScalarLoadStore(/* load */ true, loadStoreSize, tmpReg2, size - loadStoreSize);
+            emitScalarLoadStore(/* load */ false, loadStoreSize, tmpReg1, 0);
+            emitScalarLoadStore(/* load */ false, loadStoreSize, tmpReg2, size - loadStoreSize);
+        }
+    }
+}
+
 //------------------------------------------------------------------------
 // genLclHeap: Generate code for localloc.
 //
@@ -2921,6 +3067,7 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
                 genCodeForInitBlkRepStos(storeBlkNode);
             }
             break;
+        case GenTreeBlk::BlkOpKindUnrollMemmove:
         case GenTreeBlk::BlkOpKindUnroll:
             if (isCopyBlk)
             {
@@ -2930,7 +3077,15 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
                     GetEmitter()->emitDisableGC();
                 }
 #endif
-                genCodeForCpBlkUnroll(storeBlkNode);
+                if (storeBlkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
+                {
+                    genCodeForCpBlkUnroll(storeBlkNode);
+                }
+                else
+                {
+                    assert(storeBlkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnrollMemmove);
+                    genCodeForMemmove(storeBlkNode);
+                }
 #ifndef JIT32_GCENCODER
                 if (storeBlkNode->gtBlkOpGcUnsafe)
                 {
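
For reference, the trick the new codepath relies on can be sketched outside the JIT: save the
whole source before writing anything back, and overlap stops mattering. A minimal standalone
C++ sketch (illustrative only; memmove_by_full_save is a made-up name, and the JIT uses SIMD
registers rather than a stack buffer):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Copy N bytes from src to dst, tolerating overlap, by first saving the
    // entire source into a temporary that overlaps neither region.
    template <std::size_t N>
    void memmove_by_full_save(std::uint8_t* dst, const std::uint8_t* src)
    {
        std::uint8_t tmp[N];
        std::memcpy(tmp, src, N); // load everything from src first
        std::memcpy(dst, tmp, N); // then store to dst; overlap can no longer interfere
    }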