diff options
author | Sirish Pande <sirishrp@gmail.com> | 2018-05-16 15:36:52 +0000 |
---|---|---|
committer | Sirish Pande <sirishrp@gmail.com> | 2018-05-16 15:36:52 +0000 |
commit | f9deb984807b7d5f76974aa40a35c7377faf56f0 (patch) | |
tree | 7a70251b8f5691f555f18874210ed6f816565265 | |
parent | 46a30b98c29af57d5543dd9c3bdbc2c1ab6beaad (diff) |
[AArch64] Gangup loads and stores for pairing.
Keep loads and stores together (the target defines how many loads
and stores to gang up), so that pairing and vectorization of the
resulting memory operations becomes easier.
Differential Revision: https://reviews.llvm.org/D46477
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@332482 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | include/llvm/CodeGen/TargetLowering.h | 17 | ||||
-rw-r--r-- | lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 89 | ||||
-rw-r--r-- | lib/CodeGen/TargetLoweringBase.cpp | 1 | ||||
-rw-r--r-- | lib/Target/AArch64/AArch64ISelLowering.cpp | 2 | ||||
-rw-r--r-- | test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll | 2 | ||||
-rw-r--r-- | test/CodeGen/AArch64/arm64-memcpy-inline.ll | 8 | ||||
-rw-r--r-- | test/CodeGen/AArch64/arm64-variadic-aapcs.ll | 7 | ||||
-rw-r--r-- | test/CodeGen/AArch64/arm64-virtual_base.ll | 2 | ||||
-rw-r--r-- | test/CodeGen/AArch64/mergestores_noimplicitfloat.ll | 14 | ||||
-rw-r--r-- | test/CodeGen/ARM/memcpy-inline.ll | 7 |
10 files changed, 122 insertions, 27 deletions
diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h index cdb256fed4b..4a00d97a032 100644 --- a/include/llvm/CodeGen/TargetLowering.h +++ b/include/llvm/CodeGen/TargetLowering.h @@ -1199,6 +1199,15 @@ public: return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy; } + /// \brief Get maximum # of store operations to be glued together + /// + /// This function returns the maximum number of store operations permitted + /// to glue together during lowering of llvm.memcpy. The value is set by + // the target at the performance threshold for such a replacement. + virtual unsigned getMaxGluedStoresPerMemcpy() const { + return MaxGluedStoresPerMemcpy; + } + /// Get maximum # of load operations permitted for memcmp /// /// This function returns the maximum number of load operations permitted @@ -2509,6 +2518,14 @@ protected: /// constant size. unsigned MaxStoresPerMemcpy; + + /// \brief Specify max number of store instructions to glue in inlined memcpy. + /// + /// When memcpy is inlined based on MaxStoresPerMemcpy, specify maximum number + /// of store instructions to keep together. This helps in pairing and + // vectorization later on. + unsigned MaxGluedStoresPerMemcpy = 0; + /// Maximum number of store operations that may be substituted for a call to /// memcpy, used for functions with OptSize attribute. 
unsigned MaxStoresPerMemcpyOptSize; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index f486a90380e..8f74d6a2a71 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -89,6 +89,14 @@ void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {} #define DEBUG_TYPE "selectiondag" +static cl::opt<bool> EnableMemCpyDAGOpt("enable-memcpy-dag-opt", + cl::Hidden, cl::init(true), + cl::desc("Gang up loads and stores generated by inlining of memcpy")); + +static cl::opt<int> MaxLdStGlue("ldstmemcpy-glue-max", + cl::desc("Number limit for gluing ld/st of memcpy."), + cl::Hidden, cl::init(0)); + static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) { LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G);); } @@ -5218,6 +5226,31 @@ static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { return MF.getFunction().optForSize(); } +static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl, + SmallVector<SDValue, 32> &OutChains, unsigned From, + unsigned To, SmallVector<SDValue, 16> &OutLoadChains, + SmallVector<SDValue, 16> &OutStoreChains) { + assert(OutLoadChains.size() && "Missing loads in memcpy inlining"); + assert(OutStoreChains.size() && "Missing stores in memcpy inlining"); + SmallVector<SDValue, 16> GluedLoadChains; + for (unsigned i = From; i < To; ++i) { + OutChains.push_back(OutLoadChains[i]); + GluedLoadChains.push_back(OutLoadChains[i]); + } + + // Chain for all loads. 
+ SDValue LoadToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + GluedLoadChains); + + for (unsigned i = From; i < To; ++i) { + StoreSDNode *ST = dyn_cast<StoreSDNode>(OutStoreChains[i]); + SDValue NewStore = DAG.getTruncStore(LoadToken, dl, ST->getValue(), + ST->getBasePtr(), ST->getMemoryVT(), + ST->getMemOperand()); + OutChains.push_back(NewStore); + } +} + static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, unsigned Align, @@ -5282,7 +5315,9 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, MachineMemOperand::Flags MMOFlags = isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; - SmallVector<SDValue, 8> OutChains; + SmallVector<SDValue, 16> OutLoadChains; + SmallVector<SDValue, 16> OutStoreChains; + SmallVector<SDValue, 32> OutChains; unsigned NumMemOps = MemOps.size(); uint64_t SrcOff = 0, DstOff = 0; for (unsigned i = 0; i != NumMemOps; ++i) { @@ -5316,11 +5351,13 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, SubSlice.Length = VTSize; } Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice); - if (Value.getNode()) + if (Value.getNode()) { Store = DAG.getStore(Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl), DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags); + OutChains.push_back(Store); + } } if (!Store.getNode()) { @@ -5342,17 +5379,61 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, DAG.getMemBasePlusOffset(Src, SrcOff, dl), SrcPtrInfo.getWithOffset(SrcOff), VT, MinAlign(SrcAlign, SrcOff), SrcMMOFlags); - OutChains.push_back(Value.getValue(1)); + OutLoadChains.push_back(Value.getValue(1)); + Store = DAG.getTruncStore( Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl), DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags); + OutStoreChains.push_back(Store); } - OutChains.push_back(Store); SrcOff += VTSize; DstOff += VTSize; Size -= VTSize; } + 
unsigned GluedLdStLimit = MaxLdStGlue == 0 ? + TLI.getMaxGluedStoresPerMemcpy() : MaxLdStGlue; + unsigned NumLdStInMemcpy = OutStoreChains.size(); + + if (NumLdStInMemcpy) { + // It may be that memcpy might be converted to memset if it's memcpy + // of constants. In such a case, we won't have loads and stores, but + // just stores. In the absence of loads, there is nothing to gang up. + if ((GluedLdStLimit <= 1) || !EnableMemCpyDAGOpt) { + // If target does not care, just leave as it. + for (unsigned i = 0; i < NumLdStInMemcpy; ++i) { + OutChains.push_back(OutLoadChains[i]); + OutChains.push_back(OutStoreChains[i]); + } + } else { + // Ld/St less than/equal limit set by target. + if (NumLdStInMemcpy <= GluedLdStLimit) { + chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0, + NumLdStInMemcpy, OutLoadChains, + OutStoreChains); + } else { + unsigned NumberLdChain = NumLdStInMemcpy / GluedLdStLimit; + unsigned RemainingLdStInMemcpy = NumLdStInMemcpy % GluedLdStLimit; + unsigned GlueIter = 0; + + for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) { + unsigned IndexFrom = NumLdStInMemcpy - GlueIter - GluedLdStLimit; + unsigned IndexTo = NumLdStInMemcpy - GlueIter; + + chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo, + OutLoadChains, OutStoreChains); + GlueIter += GluedLdStLimit; + } + + // Residual ld/st. + if (RemainingLdStInMemcpy) { + chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0, + RemainingLdStInMemcpy, OutLoadChains, + OutStoreChains); + } + } + } + } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index e18a6ab185d..3a2036b15a0 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -535,6 +535,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { // Perform these initializations only once. 
MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = MaxLoadsPerMemcmp = 8; + MaxGluedStoresPerMemcpy = 0; MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize = MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4; UseUnderscoreSetJmp = false; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index b5b65e0b817..5bf5231f5c7 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -580,6 +580,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::GlobalAddress); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; + MaxGluedStoresPerMemcpy = 4; + MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; diff --git a/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll b/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll index 2b6cd7c2d28..262944498c0 100644 --- a/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll +++ b/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll @@ -8,9 +8,9 @@ ; CHECK: adrp x[[PAGE:[0-9]+]], {{l_b@PAGE|.Lb}} ; CHECK: add x[[ADDR:[0-9]+]], x[[PAGE]], {{l_b@PAGEOFF|:lo12:.Lb}} +; CHECK-NEXT: ldr [[VAL2:x[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: ldr [[VAL:w[0-9]+]], [x[[ADDR]], #8] ; CHECK-NEXT: str [[VAL]], [x0, #8] -; CHECK-NEXT: ldr [[VAL2:x[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: str [[VAL2]], [x0] define void @foo(i8* %a) { diff --git a/test/CodeGen/AArch64/arm64-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-memcpy-inline.ll index 4f8f3a227bb..951076c10b8 100644 --- a/test/CodeGen/AArch64/arm64-memcpy-inline.ll +++ b/test/CodeGen/AArch64/arm64-memcpy-inline.ll @@ -29,10 +29,10 @@ entry: define void @t1(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t1: -; CHECK: ldur [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]], #15] -; CHECK: stur [[DEST]], [x0, #15] ; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]] -; CHECK: str [[DEST]], [x0] +; 
CHECK: ldur [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]], #15] +; CHECK: stur [[DEST:q[0-9]+]], [x0, #15] +; CHECK: str [[DEST:q[0-9]+]], [x0] tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str1, i64 0, i64 0), i64 31, i1 false) ret void } @@ -52,9 +52,9 @@ entry: define void @t3(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t3: +; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]] ; CHECK: ldr [[REG4:x[0-9]+]], [x[[BASEREG:[0-9]+]], #16] ; CHECK: str [[REG4]], [x0, #16] -; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]] ; CHECK: str [[DEST]], [x0] tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str3, i64 0, i64 0), i64 24, i1 false) ret void diff --git a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll index 0f8f4c5d4a4..7592596be4d 100644 --- a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll +++ b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll @@ -130,12 +130,9 @@ define void @test_va_copy() { ; CHECK: add x[[SRC:[0-9]+]], {{x[0-9]+}}, :lo12:var -; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]]] +; CHECK: ldp [[BLOCK:q[0-9]+]], [[BLOCK:q[0-9]+]], [x[[SRC]]] ; CHECK: add x[[DST:[0-9]+]], {{x[0-9]+}}, :lo12:second_list -; CHECK: str [[BLOCK]], [x[[DST]]] - -; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]], #16] -; CHECK: str [[BLOCK]], [x[[DST]], #16] +; CHECK: stp [[BLOCK:q[0-9]+]], [[BLOCK:q[0-9]+]], [x[[DST]]] ret void ; CHECK: ret } diff --git a/test/CodeGen/AArch64/arm64-virtual_base.ll b/test/CodeGen/AArch64/arm64-virtual_base.ll index 4ce0d2f0007..cac105ee4c0 100644 --- a/test/CodeGen/AArch64/arm64-virtual_base.ll +++ b/test/CodeGen/AArch64/arm64-virtual_base.ll @@ -34,8 +34,8 @@ define void @Precompute_Patch_Values(%struct.Bicubic_Patch_Struct* %Shape) { ; CHECK: Precompute_Patch_Values ; CHECK: ldr [[VAL:x[0-9]+]], [x0, #288] -; CHECK-NEXT: str [[VAL]], [sp, #232] ; CHECK-NEXT: ldr [[VAL2:q[0-9]+]], [x0, #272] +; CHECK-NEXT: str [[VAL]], 
[sp, #232] ; CHECK-NEXT: stur [[VAL2]], {{\[}}sp, #216] entry: %Control_Points = alloca [16 x [3 x double]], align 8 diff --git a/test/CodeGen/AArch64/mergestores_noimplicitfloat.ll b/test/CodeGen/AArch64/mergestores_noimplicitfloat.ll index cd64ae11550..fbaef9cc075 100644 --- a/test/CodeGen/AArch64/mergestores_noimplicitfloat.ll +++ b/test/CodeGen/AArch64/mergestores_noimplicitfloat.ll @@ -4,16 +4,14 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios10.0.0" ; PR33475 - Expect 64-bit operations as 128-operations are not legal +; However, we can generate a paired 64-bit loads and stores, without using +; floating point registers. ; CHECK-LABEL: pr33475 -; CHECK-DAG: ldr [[R0:x[0-9]+]], [x1] -; CHECK-DAG: str [[R0]], [x0] -; CHECK-DAG: ldr [[R1:x[0-9]+]], [x1, #8] -; CHECK-DAG: str [[R1]], [x0, #8] -; CHECK-DAG: ldr [[R2:x[0-9]+]], [x1, #16] -; CHECK-DAG: str [[R2]], [x0, #16] -; CHECK-DAG: ldr [[R3:x[0-9]+]], [x1, #24] -; CHECK-DAG: str [[R3]], [x0, #24] +; CHECK-DAG: ldp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x1, #16] +; CHECK-DAG: ldp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x1] +; CHECK-DAG: stp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x0, #16] +; CHECK-DAG: stp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x0] define void @pr33475(i8* %p0, i8* %p1) noimplicitfloat { call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %p0, i8* align 4 %p1, i64 32, i1 false) diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll index 1dccf0b9905..c410403a9f1 100644 --- a/test/CodeGen/ARM/memcpy-inline.ll +++ b/test/CodeGen/ARM/memcpy-inline.ll @@ -44,15 +44,14 @@ entry: define void @t2(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t2: -; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]! +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]! 
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] ; CHECK: movs [[INC:r[0-9]+]], #32 -; CHECK: add.w r3, r0, #16 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]] ; CHECK: movw [[REG2:r[0-9]+]], #16716 ; CHECK: movt [[REG2:r[0-9]+]], #72 ; CHECK: str [[REG2]], [r0] -; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] -; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r3] ; CHECK-T1-LABEL: t2: ; CHECK-T1: bl _memcpy tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str2, i64 0, i64 0), i64 36, i1 false) |