5 files changed, 285 insertions, 167 deletions
diff --git a/llvm/lib/Target/BPF/BPF.h b/llvm/lib/Target/BPF/BPF.h
index ba21503d868..6e4f35f4c5d 100644
--- a/llvm/lib/Target/BPF/BPF.h
+++ b/llvm/lib/Target/BPF/BPF.h
@@ -20,12 +20,14 @@ ModulePass *createBPFAbstractMemberAccess(BPFTargetMachine *TM);
 FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
 FunctionPass *createBPFMISimplifyPatchablePass();
 FunctionPass *createBPFMIPeepholePass();
+FunctionPass *createBPFMIPeepholeTruncElimPass();
 FunctionPass *createBPFMIPreEmitPeepholePass();
 FunctionPass *createBPFMIPreEmitCheckingPass();
 
 void initializeBPFAbstractMemberAccessPass(PassRegistry&);
 void initializeBPFMISimplifyPatchablePass(PassRegistry&);
 void initializeBPFMIPeepholePass(PassRegistry&);
+void initializeBPFMIPeepholeTruncElimPass(PassRegistry&);
 void initializeBPFMIPreEmitPeepholePass(PassRegistry&);
 void initializeBPFMIPreEmitCheckingPass(PassRegistry&);
 }
diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 85fa1f2a6be..f2be0ff070d 100644
--- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -45,9 +45,7 @@ class BPFDAGToDAGISel : public SelectionDAGISel {
 
 public:
   explicit BPFDAGToDAGISel(BPFTargetMachine &TM)
-      : SelectionDAGISel(TM), Subtarget(nullptr) {
-    curr_func_ = nullptr;
-  }
+      : SelectionDAGISel(TM), Subtarget(nullptr) {}
 
   StringRef getPassName() const override {
     return "BPF DAG->DAG Pattern Instruction Selection";
@@ -92,14 +90,8 @@ private:
                           val_vec_type &Vals, int Offset);
   bool getConstantFieldValue(const GlobalAddressSDNode *Node, uint64_t Offset,
                              uint64_t Size, unsigned char *ByteSeq);
-  bool checkLoadDef(unsigned DefReg, unsigned match_load_op);
-
   // Mapping from ConstantStruct global value to corresponding byte-list values
   std::map<const void *, val_vec_type> cs_vals_;
-  // Mapping from vreg to load memory opcode
-  std::map<unsigned, unsigned> load_to_vreg_;
-  // Current function
-  const Function *curr_func_;
 };
 } // namespace
 
@@ -325,32 +317,13 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
 }
 
 void BPFDAGToDAGISel::PreprocessISelDAG() {
-  // Iterate through all nodes, interested in the following cases:
+  // Iterate through all nodes, interested in the following case:
   //
   //  . loads from ConstantStruct or ConstantArray of constructs
   //    which can be turns into constant itself, with this we can
   //    avoid reading from read-only section at runtime.
   //
-  //  . reg truncating is often the result of 8/16/32bit->64bit or
-  //    8/16bit->32bit conversion. If the reg value is loaded with
-  //    masked byte width, the AND operation can be removed since
-  //    BPF LOAD already has zero extension.
-  //
-  //    This also solved a correctness issue.
-  //    In BPF socket-related program, e.g., __sk_buff->{data, data_end}
-  //    are 32-bit registers, but later on, kernel verifier will rewrite
-  //    it with 64-bit value. Therefore, truncating the value after the
-  //    load will result in incorrect code.
-
-  // clear the load_to_vreg_ map so that we have a clean start
-  // for this function.
-  if (!curr_func_) {
-    curr_func_ = FuncInfo->Fn;
-  } else if (curr_func_ != FuncInfo->Fn) {
-    load_to_vreg_.clear();
-    curr_func_ = FuncInfo->Fn;
-  }
-
+  //  . Removing redundant AND for intrinsic narrow loads.
   for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
                                        E = CurDAG->allnodes_end();
        I != E;) {
@@ -358,8 +331,6 @@ void BPFDAGToDAGISel::PreprocessISelDAG() {
     unsigned Opcode = Node->getOpcode();
     if (Opcode == ISD::LOAD)
       PreprocessLoad(Node, I);
-    else if (Opcode == ISD::CopyToReg)
-      PreprocessCopyToReg(Node);
     else if (Opcode == ISD::AND)
       PreprocessTrunc(Node, I);
   }
@@ -491,36 +462,6 @@ bool BPFDAGToDAGISel::fillConstantStruct(const DataLayout &DL,
   return true;
 }
 
-void BPFDAGToDAGISel::PreprocessCopyToReg(SDNode *Node) {
-  const RegisterSDNode *RegN = dyn_cast<RegisterSDNode>(Node->getOperand(1));
-  if (!RegN || !Register::isVirtualRegister(RegN->getReg()))
-    return;
-
-  const LoadSDNode *LD = dyn_cast<LoadSDNode>(Node->getOperand(2));
-  if (!LD)
-    return;
-
-  // Assign a load value to a virtual register. record its load width
-  unsigned mem_load_op = 0;
-  switch (LD->getMemOperand()->getSize()) {
-  default:
-    return;
-  case 4:
-    mem_load_op = BPF::LDW;
-    break;
-  case 2:
-    mem_load_op = BPF::LDH;
-    break;
-  case 1:
-    mem_load_op = BPF::LDB;
-    break;
-  }
-
-  LLVM_DEBUG(dbgs() << "Find Load Value to VReg "
-                    << Register::virtReg2Index(RegN->getReg()) << '\n');
-  load_to_vreg_[RegN->getReg()] = mem_load_op;
-}
-
 void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
                                       SelectionDAG::allnodes_iterator &I) {
   ConstantSDNode *MaskN = dyn_cast<ConstantSDNode>(Node->getOperand(1));
@@ -534,112 +475,26 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
   // which the generic optimizer doesn't understand their results are
   // zero extended.
   SDValue BaseV = Node->getOperand(0);
-  if (BaseV.getOpcode() == ISD::INTRINSIC_W_CHAIN) {
-    unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue();
-    uint64_t MaskV = MaskN->getZExtValue();
-
-    if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) ||
-          (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) ||
-          (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF)))
-      return;
-
-    LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: ";
-               Node->dump(); dbgs() << '\n');
-
-    I--;
-    CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
-    I++;
-    CurDAG->DeleteNode(Node);
-
-    return;
-  }
-
-  // Multiple basic blocks case.
-  if (BaseV.getOpcode() != ISD::CopyFromReg)
+  if (BaseV.getOpcode() != ISD::INTRINSIC_W_CHAIN)
     return;
 
-  unsigned match_load_op = 0;
-  switch (MaskN->getZExtValue()) {
-  default:
-    return;
-  case 0xFFFFFFFF:
-    match_load_op = BPF::LDW;
-    break;
-  case 0xFFFF:
-    match_load_op = BPF::LDH;
-    break;
-  case 0xFF:
-    match_load_op = BPF::LDB;
-    break;
-  }
+  unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue();
+  uint64_t MaskV = MaskN->getZExtValue();
 
-  const RegisterSDNode *RegN =
-      dyn_cast<RegisterSDNode>(BaseV.getNode()->getOperand(1));
-  if (!RegN || !Register::isVirtualRegister(RegN->getReg()))
+  if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) ||
+        (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) ||
+        (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF)))
     return;
-  unsigned AndOpReg = RegN->getReg();
-  LLVM_DEBUG(dbgs() << "Examine " << printReg(AndOpReg) << '\n');
-
-  // Examine the PHI insns in the MachineBasicBlock to found out the
-  // definitions of this virtual register. At this stage (DAG2DAG
-  // transformation), only PHI machine insns are available in the machine basic
-  // block.
-  MachineBasicBlock *MBB = FuncInfo->MBB;
-  MachineInstr *MII = nullptr;
-  for (auto &MI : *MBB) {
-    for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
-      const MachineOperand &MOP = MI.getOperand(i);
-      if (!MOP.isReg() || !MOP.isDef())
-        continue;
-      Register Reg = MOP.getReg();
-      if (Register::isVirtualRegister(Reg) && Reg == AndOpReg) {
-        MII = &MI;
-        break;
-      }
-    }
-  }
-
-  if (MII == nullptr) {
-    // No phi definition in this block.
-    if (!checkLoadDef(AndOpReg, match_load_op))
-      return;
-  } else {
-    // The PHI node looks like:
-    //   %2 = PHI %0, <%bb.1>, %1, <%bb.3>
-    // Trace each incoming definition, e.g., (%0, %bb.1) and (%1, %bb.3)
-    // The AND operation can be removed if both %0 in %bb.1 and %1 in
-    // %bb.3 are defined with a load matching the MaskN.
-    LLVM_DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n');
-    unsigned PrevReg = -1;
-    for (unsigned i = 0; i < MII->getNumOperands(); ++i) {
-      const MachineOperand &MOP = MII->getOperand(i);
-      if (MOP.isReg()) {
-        if (MOP.isDef())
-          continue;
-        PrevReg = MOP.getReg();
-        if (!Register::isVirtualRegister(PrevReg))
-          return;
-        if (!checkLoadDef(PrevReg, match_load_op))
-          return;
-      }
-    }
-  }
 
-  LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump();
-             dbgs() << '\n');
+  LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: ";
+             Node->dump(); dbgs() << '\n');
 
   I--;
   CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
   I++;
   CurDAG->DeleteNode(Node);
-}
-
-bool BPFDAGToDAGISel::checkLoadDef(unsigned DefReg, unsigned match_load_op) {
-  auto it = load_to_vreg_.find(DefReg);
-  if (it == load_to_vreg_.end())
-    return false; // The definition of register is not exported yet.
 
-  return it->second == match_load_op;
+  return;
 }
 
 FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) {
diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
index fafd2f703ad..72fd131c459 100644
--- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -71,7 +71,7 @@ void BPFMIPeephole::initialize(MachineFunction &MFParm) {
   MF = &MFParm;
   MRI = &MF->getRegInfo();
   TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
-  LLVM_DEBUG(dbgs() << "*** BPF MachineSSA peephole pass ***\n\n");
+  LLVM_DEBUG(dbgs() << "*** BPF MachineSSA ZEXT Elim peephole pass ***\n\n");
 }
 
 bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI)
@@ -186,7 +186,8 @@ bool BPFMIPeephole::eliminateZExtSeq(void) {
 } // end default namespace
 
 INITIALIZE_PASS(BPFMIPeephole, DEBUG_TYPE,
-                "BPF MachineSSA Peephole Optimization", false, false)
+                "BPF MachineSSA Peephole Optimization For ZEXT Eliminate",
+                false, false)
 
 char BPFMIPeephole::ID = 0;
 FunctionPass* llvm::createBPFMIPeepholePass() { return new BPFMIPeephole(); }
@@ -253,12 +254,16 @@ bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) {
       // enabled. The special type cast insn MOV_32_64 involves different
       // register class on src (i32) and dst (i64), RA could generate useless
       // instruction due to this.
-      if (MI.getOpcode() == BPF::MOV_32_64) {
+      unsigned Opcode = MI.getOpcode();
+      if (Opcode == BPF::MOV_32_64 ||
+          Opcode == BPF::MOV_rr || Opcode == BPF::MOV_rr_32) {
         Register dst = MI.getOperand(0).getReg();
-        Register dst_sub = TRI->getSubReg(dst, BPF::sub_32);
         Register src = MI.getOperand(1).getReg();
 
-        if (dst_sub != src)
+        if (Opcode == BPF::MOV_32_64)
+          dst = TRI->getSubReg(dst, BPF::sub_32);
+
+        if (dst != src)
           continue;
 
         ToErase = &MI;
@@ -281,3 +286,177 @@ FunctionPass* llvm::createBPFMIPreEmitPeepholePass()
 {
   return new BPFMIPreEmitPeephole();
 }
+
+STATISTIC(TruncElemNum, "Number of truncation eliminated");
+
+namespace {
+
+struct BPFMIPeepholeTruncElim : public MachineFunctionPass {
+
+  static char ID;
+  const BPFInstrInfo *TII;
+  MachineFunction *MF;
+  MachineRegisterInfo *MRI;
+
+  BPFMIPeepholeTruncElim() : MachineFunctionPass(ID) {
+    initializeBPFMIPeepholeTruncElimPass(*PassRegistry::getPassRegistry());
+  }
+
+private:
+  // Initialize class variables.
+  void initialize(MachineFunction &MFParm);
+
+  bool eliminateTruncSeq(void);
+
+public:
+
+  // Main entry point for this pass.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
+
+    initialize(MF);
+
+    return eliminateTruncSeq();
+  }
+};
+
+static bool TruncSizeCompatible(int TruncSize, unsigned opcode)
+{
+  if (TruncSize == 1)
+    return opcode == BPF::LDB || opcode == BPF::LDB32;
+
+  if (TruncSize == 2)
+    return opcode == BPF::LDH || opcode == BPF::LDH32;
+
+  if (TruncSize == 4)
+    return opcode == BPF::LDW || opcode == BPF::LDW32;
+
+  return false;
+}
+
+// Initialize class variables.
+void BPFMIPeepholeTruncElim::initialize(MachineFunction &MFParm) {
+  MF = &MFParm;
+  MRI = &MF->getRegInfo();
+  TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
+  LLVM_DEBUG(dbgs() << "*** BPF MachineSSA TRUNC Elim peephole pass ***\n\n");
+}
+
+// Reg truncating is often the result of 8/16/32bit->64bit or
+// 8/16bit->32bit conversion. If the reg value is loaded with
+// masked byte width, the AND operation can be removed since
+// BPF LOAD already has zero extension.
+//
+// This also solved a correctness issue.
+// In BPF socket-related program, e.g., __sk_buff->{data, data_end}
+// are 32-bit registers, but later on, kernel verifier will rewrite
+// it with 64-bit value. Therefore, truncating the value after the
+// load will result in incorrect code.
+bool BPFMIPeepholeTruncElim::eliminateTruncSeq(void) {
+  MachineInstr* ToErase = nullptr;
+  bool Eliminated = false;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB) {
+      // The second insn to remove if the eliminate candidate is a pair.
+      MachineInstr *MI2 = nullptr;
+      Register DstReg, SrcReg;
+      MachineInstr *DefMI;
+      int TruncSize = -1;
+
+      // If the previous instruction was marked for elimination, remove it now.
+      if (ToErase) {
+        ToErase->eraseFromParent();
+        ToErase = nullptr;
+      }
+
+      // AND A, 0xFFFFFFFF will be turned into SLL/SRL pair due to immediate
+      // for BPF ANDI is i32, and this case only happens on ALU64.
+      if (MI.getOpcode() == BPF::SRL_ri &&
+          MI.getOperand(2).getImm() == 32) {
+        SrcReg = MI.getOperand(1).getReg();
+        MI2 = MRI->getVRegDef(SrcReg);
+        DstReg = MI.getOperand(0).getReg();
+
+        if (!MI2 ||
+            MI2->getOpcode() != BPF::SLL_ri ||
+            MI2->getOperand(2).getImm() != 32)
+          continue;
+
+        // Update SrcReg.
+        SrcReg = MI2->getOperand(1).getReg();
+        DefMI = MRI->getVRegDef(SrcReg);
+        if (DefMI)
+          TruncSize = 4;
+      } else if (MI.getOpcode() == BPF::AND_ri ||
+                 MI.getOpcode() == BPF::AND_ri_32) {
+        SrcReg = MI.getOperand(1).getReg();
+        DstReg = MI.getOperand(0).getReg();
+        DefMI = MRI->getVRegDef(SrcReg);
+
+        if (!DefMI)
+          continue;
+
+        int64_t imm = MI.getOperand(2).getImm();
+        if (imm == 0xff)
+          TruncSize = 1;
+        else if (imm == 0xffff)
+          TruncSize = 2;
+      }
+
+      if (TruncSize == -1)
+        continue;
+
+      // The definition is PHI node, check all inputs.
+      if (DefMI->isPHI()) {
+        bool CheckFail = false;
+
+        for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) {
+          MachineOperand &opnd = DefMI->getOperand(i);
+          if (!opnd.isReg()) {
+            CheckFail = true;
+            break;
+          }
+
+          MachineInstr *PhiDef = MRI->getVRegDef(opnd.getReg());
+          if (!PhiDef || PhiDef->isPHI() ||
+              !TruncSizeCompatible(TruncSize, PhiDef->getOpcode())) {
+            CheckFail = true;
+            break;
+          }
+        }
+
+        if (CheckFail)
+          continue;
+      } else if (!TruncSizeCompatible(TruncSize, DefMI->getOpcode())) {
+        continue;
+      }
+
+      BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::MOV_rr), DstReg)
+              .addReg(SrcReg);
+
+      if (MI2)
+        MI2->eraseFromParent();
+
+      // Mark it to ToErase, and erase in the next iteration.
+      ToErase = &MI;
+      TruncElemNum++;
+      Eliminated = true;
+    }
+  }
+
+  return Eliminated;
+}
+
+} // end default namespace
+
+INITIALIZE_PASS(BPFMIPeepholeTruncElim, "bpf-mi-trunc-elim",
+                "BPF MachineSSA Peephole Optimization For TRUNC Eliminate",
+                false, false)
+
+char BPFMIPeepholeTruncElim::ID = 0;
+FunctionPass* llvm::createBPFMIPeepholeTruncElimPass()
+{
+  return new BPFMIPeepholeTruncElim();
+}
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index d940ac965c0..0c4f2c74e7a 100644
--- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -36,6 +36,7 @@ extern "C" void LLVMInitializeBPFTarget() {
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializeBPFAbstractMemberAccessPass(PR);
   initializeBPFMIPeepholePass(PR);
+  initializeBPFMIPeepholeTruncElimPass(PR);
 }
 
 // DataLayout: little or big endian
@@ -115,15 +116,16 @@ void BPFPassConfig::addMachineSSAOptimization() {
   TargetPassConfig::addMachineSSAOptimization();
 
   const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl();
-  if (Subtarget->getHasAlu32() && !DisableMIPeephole)
-    addPass(createBPFMIPeepholePass());
+  if (!DisableMIPeephole) {
+    if (Subtarget->getHasAlu32())
+      addPass(createBPFMIPeepholePass());
+    addPass(createBPFMIPeepholeTruncElimPass());
+  }
 }
 
 void BPFPassConfig::addPreEmitPass() {
-  const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl();
-
   addPass(createBPFMIPreEmitCheckingPass());
   if (getOptLevel() != CodeGenOpt::None)
-    if (Subtarget->getHasAlu32() && !DisableMIPeephole)
+    if (!DisableMIPeephole)
       addPass(createBPFMIPreEmitPeepholePass());
 }
diff --git a/llvm/test/CodeGen/BPF/remove_truncate_6.ll b/llvm/test/CodeGen/BPF/remove_truncate_6.ll
new file mode 100644
index 00000000000..6577afb13ba
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/remove_truncate_6.ll
@@ -0,0 +1,80 @@
+; RUN: llc < %s -march=bpf -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=bpf -mattr=+alu32 -verify-machineinstrs | FileCheck --check-prefix=CHECK-32 %s
+;
+; void cal1(unsigned short *a, unsigned long *b, unsigned int k)
+; {
+;   unsigned short e;
+;
+;   e = *a;
+;   for (unsigned int i = 0; i < k; i++) {
+;     b[i] = e;
+;     e = ~e;
+;   }
+; }
+;
+; void cal2(unsigned short *a, unsigned int *b, unsigned int k)
+; {
+;   unsigned short e;
+;
+;   e = *a;
+;   for (unsigned int i = 0; i < k; i++) {
+;     b[i] = e;
+;     e = ~e;
+;   }
+; }
+
+; Function Attrs: nofree norecurse nounwind optsize
+define dso_local void @cal1(i16* nocapture readonly %a, i64* nocapture %b, i32 %k) local_unnamed_addr #0 {
+entry:
+  %cmp8 = icmp eq i32 %k, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %0 = load i16, i16* %a, align 2
+  %wide.trip.count = zext i32 %k to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %e.09 = phi i16 [ %0, %for.body.preheader ], [ %neg, %for.body ]
+  %conv = zext i16 %e.09 to i64
+  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %indvars.iv
+; CHECK: r{{[0-9]+}} &= 65535
+; CHECK-32: r{{[0-9]+}} &= 65535
+  store i64 %conv, i64* %arrayidx, align 8
+  %neg = xor i16 %e.09, -1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: nofree norecurse nounwind optsize
+define dso_local void @cal2(i16* nocapture readonly %a, i32* nocapture %b, i32 %k) local_unnamed_addr #0 {
+entry:
+  %cmp8 = icmp eq i32 %k, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %0 = load i16, i16* %a, align 2
+  %wide.trip.count = zext i32 %k to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %e.09 = phi i16 [ %0, %for.body.preheader ], [ %neg, %for.body ]
+  %conv = zext i16 %e.09 to i32
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+; CHECK: r{{[0-9]+}} &= 65535
+; CHECK-32: w{{[0-9]+}} &= 65535
+  store i32 %conv, i32* %arrayidx, align 4
+  %neg = xor i16 %e.09, -1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}