-rw-r--r--  llvm/lib/Target/X86/X86FastISel.cpp                |   2
-rw-r--r--  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp            |  57
-rw-r--r--  llvm/lib/Target/X86/X86Instr64bit.td               |  69
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.cpp               |  63
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.td                | 110
-rw-r--r--  llvm/lib/Target/X86/X86RegisterInfo.h              |   2
-rw-r--r--  llvm/lib/Target/X86/X86RegisterInfo.td             | 154
-rw-r--r--  llvm/test/CodeGen/X86/h-register-addressing-32.ll  |  53
-rw-r--r--  llvm/test/CodeGen/X86/h-register-addressing-64.ll  |  53
-rw-r--r--  llvm/test/CodeGen/X86/h-register-store.ll          |  27
-rw-r--r--  llvm/test/CodeGen/X86/h-registers.ll               |  48
-rw-r--r--  llvm/test/CodeGen/X86/inline-asm-out-regs.ll       |   2
12 files changed, 540 insertions, 100 deletions
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 2cfa719d73b..42844562020 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -997,7 +997,7 @@ bool X86FastISel::X86SelectTrunc(Instruction *I) {
return false;
// First issue a copy to GR16_ or GR32_.
- unsigned CopyOpc = (SrcVT == MVT::i16) ? X86::MOV16to16_ : X86::MOV32to32_;
+ unsigned CopyOpc = (SrcVT == MVT::i16) ? X86::MOV16rr : X86::MOV32rr;
const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16)
? X86::GR16_RegisterClass : X86::GR32_RegisterClass;
unsigned CopyReg = createResultReg(CopyRC);
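For context, the hunk above is in X86FastISel::X86SelectTrunc, which lowers a truncate at -O0 by first copying the source into the restricted GR16_/GR32_ class and then extracting the 8-bit subregister; the copy is now an ordinary MOV16rr/MOV32rr instead of the removed MOV16to16_/MOV32to32_ pseudos. A minimal sketch of the kind of IR that takes this path (the function name is illustrative, not part of the patch):

; Compiled with fast isel (-O0): copy %x into a GR32_ register, then
; take its 8-bit subregister.
define i8 @trunc32to8(i32 %x) nounwind {
  %t = trunc i32 %x to i8
  ret i8 %t
}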
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 6fd9d00e661..41a3c416f85 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1019,21 +1019,69 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
break;
case ISD::AND: {
- // Handle "(x << C1) & C2" as "(X & (C2>>C1)) << C1" if safe and if this
- // allows us to fold the shift into this addressing mode.
+ // Perform some heroic transforms on an and of a constant-count shift
+ // with a constant to enable use of the scaled offset field.
+
SDValue Shift = N.getOperand(0);
- if (Shift.getOpcode() != ISD::SHL) break;
+ if (Shift.getNumOperands() != 2) break;
// Scale must not be used already.
if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
// Not when RIP is used as the base.
if (AM.isRIPRel) break;
-
+
+ SDValue X = Shift.getOperand(0);
ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N.getOperand(1));
ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
if (!C1 || !C2) break;
+ // Handle "(X >> (8-C1)) & C2" as "(X >> 8) & 0xff)" if safe. This
+ // allows us to convert the shift and and into an h-register extract and
+ // a scaled index.
+ if (Shift.getOpcode() == ISD::SRL && Shift.hasOneUse()) {
+ unsigned ScaleLog = 8 - C1->getZExtValue();
+ if (ScaleLog > 0 && ScaleLog < 64 &&
+ C2->getZExtValue() == (UINT64_C(0xff) << ScaleLog)) {
+ SDValue Eight = CurDAG->getConstant(8, MVT::i8);
+ SDValue Mask = CurDAG->getConstant(0xff, N.getValueType());
+ SDValue Srl = CurDAG->getNode(ISD::SRL, dl, N.getValueType(),
+ X, Eight);
+ SDValue And = CurDAG->getNode(ISD::AND, dl, N.getValueType(),
+ Srl, Mask);
+
+ // Insert the new nodes into the topological ordering.
+ if (Eight.getNode()->getNodeId() == -1 ||
+ Eight.getNode()->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), Eight.getNode());
+ Eight.getNode()->setNodeId(X.getNode()->getNodeId());
+ }
+ if (Mask.getNode()->getNodeId() == -1 ||
+ Mask.getNode()->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), Mask.getNode());
+ Mask.getNode()->setNodeId(X.getNode()->getNodeId());
+ }
+ if (Srl.getNode()->getNodeId() == -1 ||
+ Srl.getNode()->getNodeId() > Shift.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(Shift.getNode(), Srl.getNode());
+ Srl.getNode()->setNodeId(Shift.getNode()->getNodeId());
+ }
+ if (And.getNode()->getNodeId() == -1 ||
+ And.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), And.getNode());
+ And.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ CurDAG->ReplaceAllUsesWith(N, And);
+ AM.IndexReg = And;
+ AM.Scale = (1 << ScaleLog);
+ return false;
+ }
+ }
+
+ // Handle "(X << C1) & C2" as "(X & (C2>>C1)) << C1" if safe and if this
+ // allows us to fold the shift into this addressing mode.
+ if (Shift.getOpcode() != ISD::SHL) break;
+
// Not likely to be profitable if either the AND or SHIFT node has more
// than one use (unless all uses are for address computation). Besides,
// isel mechanism requires their node ids to be reused.
@@ -1046,7 +1094,6 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
break;
// Get the new AND mask, this folds to a constant.
- SDValue X = Shift.getOperand(0);
SDValue NewANDMask = CurDAG->getNode(ISD::SRL, dl, N.getValueType(),
SDValue(C2, 0), SDValue(C1, 0));
SDValue NewAND = CurDAG->getNode(ISD::AND, dl, N.getValueType(), X,
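The new ISD::AND handling above rewrites an address index of the form (x >> (8-C1)) & (0xff << (8-C1)) into ((x >> 8) & 0xff) scaled by 2^(8-C1), so the shift-and-mask pair becomes an h-register extract feeding the scaled-index field. For instance, a shift count of 5 requires the mask 2040 (0xff << 3) and yields a scale of 8. A sketch of the shape it matches, mirroring bar8 in the new h-register-addressing tests below (the function name is illustrative):

; 2040 == 0xff << 3, so the index folds to ((x >> 8) & 0xff) with scale 8,
; selecting to something like "movzbl %ah, %ecx" plus a (%base,%index,8)
; addressing mode.
define i8 @scaled_h_index(i8* %p, i32 %x) nounwind readonly {
  %t0 = lshr i32 %x, 5
  %t1 = and i32 %t0, 2040
  %t2 = getelementptr i8* %p, i32 %t1
  %t3 = load i8* %t2, align 1
  ret i8 %t3
}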
diff --git a/llvm/lib/Target/X86/X86Instr64bit.td b/llvm/lib/Target/X86/X86Instr64bit.td
index 10e66e88bee..05bccabc304 100644
--- a/llvm/lib/Target/X86/X86Instr64bit.td
+++ b/llvm/lib/Target/X86/X86Instr64bit.td
@@ -1522,7 +1522,7 @@ def : Pat<(store (add (loadi64 addr:$dst), 0x00000000800000000), addr:$dst),
// r & (2^32-1) ==> movz
def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
- (MOVZX64rr32 (i32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit)))>;
+ (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit))>;
// r & (2^16-1) ==> movz
def : Pat<(and GR64:$src, 0xffff),
(MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit)))>;
@@ -1531,7 +1531,7 @@ def : Pat<(and GR64:$src, 0xff),
(MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit)))>;
// r & (2^8-1) ==> movz
def : Pat<(and GR32:$src1, 0xff),
- (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR32:$src1, x86_subreg_8bit)))>,
+ (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, x86_subreg_8bit))>,
Requires<[In64BitMode]>;
// r & (2^8-1) ==> movz
def : Pat<(and GR16:$src1, 0xff),
@@ -1540,13 +1540,13 @@ def : Pat<(and GR16:$src1, 0xff),
// sext_inreg patterns
def : Pat<(sext_inreg GR64:$src, i32),
- (MOVSX64rr32 (i32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit)))>;
+ (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit))>;
def : Pat<(sext_inreg GR64:$src, i16),
- (MOVSX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit)))>;
+ (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit))>;
def : Pat<(sext_inreg GR64:$src, i8),
- (MOVSX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit)))>;
+ (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit))>;
def : Pat<(sext_inreg GR32:$src, i8),
- (MOVSX32rr8 (i8 (EXTRACT_SUBREG GR32:$src, x86_subreg_8bit)))>,
+ (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, x86_subreg_8bit))>,
Requires<[In64BitMode]>;
def : Pat<(sext_inreg GR16:$src, i8),
(MOVSX16rr8 (i8 (EXTRACT_SUBREG GR16:$src, x86_subreg_8bit)))>,
@@ -1554,16 +1554,63 @@ def : Pat<(sext_inreg GR16:$src, i8),
// trunc patterns
def : Pat<(i32 (trunc GR64:$src)),
- (i32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit))>;
+ (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit)>;
def : Pat<(i16 (trunc GR64:$src)),
- (i16 (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit))>;
+ (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit)>;
def : Pat<(i8 (trunc GR64:$src)),
- (i8 (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit))>;
+ (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit)>;
def : Pat<(i8 (trunc GR32:$src)),
- (i8 (EXTRACT_SUBREG GR32:$src, x86_subreg_8bit))>,
+ (EXTRACT_SUBREG GR32:$src, x86_subreg_8bit)>,
Requires<[In64BitMode]>;
def : Pat<(i8 (trunc GR16:$src)),
- (i8 (EXTRACT_SUBREG GR16:$src, x86_subreg_8bit))>,
+ (EXTRACT_SUBREG GR16:$src, x86_subreg_8bit)>,
+ Requires<[In64BitMode]>;
+
+// h-register tricks.
+// For now, be conservative and only use the extract if the value is immediately
+// zero-extended or stored, which are somewhat common cases. This uses a bunch
+// of code to prevent a register requiring a REX prefix from being allocated in
+// the same instruction as the h register, as there's currently no way to
+// describe this requirement to the register allocator.
+
+// h-register extract and zero-extend.
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR64:$src, GR64_),
+ x86_subreg_8bit_hi)),
+ x86_subreg_32bit)>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR32:$src, GR32_),
+ x86_subreg_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(srl_su GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR16:$src, GR16_),
+ x86_subreg_8bit_hi)),
+ x86_subreg_16bit)>,
+ Requires<[In64BitMode]>;
+
+// h-register extract and store.
+def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR64:$src, GR64_),
+ x86_subreg_8bit_hi))>;
+def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR32:$src, GR32_),
+ x86_subreg_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR16:$src, GR16_),
+ x86_subreg_8bit_hi))>,
Requires<[In64BitMode]>;
// (shl x, 1) ==> (add x, x)
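The 64-bit patterns above cover two cases: a shift-right-by-8 whose result is immediately masked with 0xff selects to MOVZX32_NOREXrr8 of the new 8-bit-hi subregister, and a shift-right-by-8 that is immediately truncated and stored selects to MOV8mr_NOREX. A sketch of the zero-extend case, reproduced from qux64 in the new h-registers.ll test below:

; Selects to a single h-register extract, e.g. "movzbl %ah, %eax", with
; SUBREG_TO_REG producing the i64 result.
define i64 @qux64(i64 inreg %x) nounwind {
  %t0 = lshr i64 %x, 8
  %t1 = and i64 %t0, 255
  ret i64 %t1
}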
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 77320587cb4..77955a6a426 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -258,10 +258,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::JMP64r, X86::JMP64m, 1 },
{ X86::MOV16ri, X86::MOV16mi, 0 },
{ X86::MOV16rr, X86::MOV16mr, 0 },
- { X86::MOV16to16_, X86::MOV16_mr, 0 },
{ X86::MOV32ri, X86::MOV32mi, 0 },
{ X86::MOV32rr, X86::MOV32mr, 0 },
- { X86::MOV32to32_, X86::MOV32_mr, 0 },
{ X86::MOV64ri32, X86::MOV64mi32, 0 },
{ X86::MOV64rr, X86::MOV64mr, 0 },
{ X86::MOV8ri, X86::MOV8mi, 0 },
@@ -372,9 +370,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::Int_UCOMISDrr, X86::Int_UCOMISDrm },
{ X86::Int_UCOMISSrr, X86::Int_UCOMISSrm },
{ X86::MOV16rr, X86::MOV16rm },
- { X86::MOV16to16_, X86::MOV16_rm },
{ X86::MOV32rr, X86::MOV32rm },
- { X86::MOV32to32_, X86::MOV32_rm },
{ X86::MOV64rr, X86::MOV64rm },
{ X86::MOV64toPQIrr, X86::MOVQI2PQIrm },
{ X86::MOV64toSDrr, X86::MOV64toSDrm },
@@ -404,6 +400,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm },
{ X86::MOVZX16rr8, X86::MOVZX16rm8 },
{ X86::MOVZX32rr16, X86::MOVZX32rm16 },
+ { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8 },
{ X86::MOVZX32rr8, X86::MOVZX32rm8 },
{ X86::MOVZX64rr16, X86::MOVZX64rm16 },
{ X86::MOVZX64rr32, X86::MOVZX64rm32 },
@@ -672,8 +669,6 @@ bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
case X86::MOV16rr:
case X86::MOV32rr:
case X86::MOV64rr:
- case X86::MOV16to16_:
- case X86::MOV32to32_:
case X86::MOVSSrr:
case X86::MOVSDrr:
@@ -710,9 +705,7 @@ unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
default: break;
case X86::MOV8rm:
case X86::MOV16rm:
- case X86::MOV16_rm:
case X86::MOV32rm:
- case X86::MOV32_rm:
case X86::MOV64rm:
case X86::LD_Fp64m:
case X86::MOVSSrm:
@@ -741,9 +734,7 @@ unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
default: break;
case X86::MOV8mr:
case X86::MOV16mr:
- case X86::MOV16_mr:
case X86::MOV32mr:
- case X86::MOV32_mr:
case X86::MOV64mr:
case X86::ST_FpP64m:
case X86::MOVSSmr:
@@ -795,9 +786,7 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI) const {
default: break;
case X86::MOV8rm:
case X86::MOV16rm:
- case X86::MOV16_rm:
case X86::MOV32rm:
- case X86::MOV32_rm:
case X86::MOV64rm:
case X86::LD_Fp64m:
case X86::MOVSSrm:
@@ -1670,10 +1659,22 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
Opc = X86::MOV16rr;
} else if (DestRC == &X86::GR8RegClass) {
Opc = X86::MOV8rr;
+ } else if (DestRC == &X86::GR64_RegClass) {
+ Opc = X86::MOV64rr;
} else if (DestRC == &X86::GR32_RegClass) {
- Opc = X86::MOV32_rr;
+ Opc = X86::MOV32rr;
} else if (DestRC == &X86::GR16_RegClass) {
- Opc = X86::MOV16_rr;
+ Opc = X86::MOV16rr;
+ } else if (DestRC == &X86::GR8_RegClass) {
+ Opc = X86::MOV8rr;
+ } else if (DestRC == &X86::GR64_NOREXRegClass) {
+ Opc = X86::MOV64rr;
+ } else if (DestRC == &X86::GR32_NOREXRegClass) {
+ Opc = X86::MOV32rr;
+ } else if (DestRC == &X86::GR16_NOREXRegClass) {
+ Opc = X86::MOV16rr;
+ } else if (DestRC == &X86::GR8_NOREXRegClass) {
+ Opc = X86::MOV8rr;
} else if (DestRC == &X86::RFP32RegClass) {
Opc = X86::MOV_Fp3232;
} else if (DestRC == &X86::RFP64RegClass || DestRC == &X86::RSTRegClass) {
@@ -1721,7 +1722,7 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
return true;
}
}
-
+
// Moving from ST(0) turns into FpGET_ST0_32 etc.
if (SrcRC == &X86::RSTRegClass) {
// Copying from ST(0)/ST(1).
@@ -1779,10 +1780,22 @@ static unsigned getStoreRegOpcode(const TargetRegisterClass *RC,
Opc = X86::MOV16mr;
} else if (RC == &X86::GR8RegClass) {
Opc = X86::MOV8mr;
+ } else if (RC == &X86::GR64_RegClass) {
+ Opc = X86::MOV64mr;
} else if (RC == &X86::GR32_RegClass) {
- Opc = X86::MOV32_mr;
+ Opc = X86::MOV32mr;
} else if (RC == &X86::GR16_RegClass) {
- Opc = X86::MOV16_mr;
+ Opc = X86::MOV16mr;
+ } else if (RC == &X86::GR8_RegClass) {
+ Opc = X86::MOV8mr;
+ } else if (RC == &X86::GR64_NOREXRegClass) {
+ Opc = X86::MOV64mr;
+ } else if (RC == &X86::GR32_NOREXRegClass) {
+ Opc = X86::MOV32mr;
+ } else if (RC == &X86::GR16_NOREXRegClass) {
+ Opc = X86::MOV16mr;
+ } else if (RC == &X86::GR8_NOREXRegClass) {
+ Opc = X86::MOV8mr;
} else if (RC == &X86::RFP80RegClass) {
Opc = X86::ST_FpP80m; // pops
} else if (RC == &X86::RFP64RegClass) {
@@ -1847,10 +1860,22 @@ static unsigned getLoadRegOpcode(const TargetRegisterClass *RC,
Opc = X86::MOV16rm;
} else if (RC == &X86::GR8RegClass) {
Opc = X86::MOV8rm;
+ } else if (RC == &X86::GR64_RegClass) {
+ Opc = X86::MOV64rm;
} else if (RC == &X86::GR32_RegClass) {
- Opc = X86::MOV32_rm;
+ Opc = X86::MOV32rm;
} else if (RC == &X86::GR16_RegClass) {
- Opc = X86::MOV16_rm;
+ Opc = X86::MOV16rm;
+ } else if (RC == &X86::GR8_RegClass) {
+ Opc = X86::MOV8rm;
+ } else if (RC == &X86::GR64_NOREXRegClass) {
+ Opc = X86::MOV64rm;
+ } else if (RC == &X86::GR32_NOREXRegClass) {
+ Opc = X86::MOV32rm;
+ } else if (RC == &X86::GR16_NOREXRegClass) {
+ Opc = X86::MOV16rm;
+ } else if (RC == &X86::GR8_NOREXRegClass) {
+ Opc = X86::MOV8rm;
} else if (RC == &X86::RFP80RegClass) {
Opc = X86::LD_Fp80m;
} else if (RC == &X86::RFP64RegClass) {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index bef6b72c801..830796e3a38 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -181,6 +181,13 @@ def f64mem : X86MemOperand<"printf64mem">;
def f80mem : X86MemOperand<"printf80mem">;
def f128mem : X86MemOperand<"printf128mem">;
+// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
+// plain GR64, so that it doesn't potentially require a REX prefix.
+def i8mem_NOREX : Operand<i64> {
+ let PrintMethod = "printi8mem";
+ let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX, i32imm, i8imm);
+}
+
def lea32mem : Operand<i32> {
let PrintMethod = "printlea32mem";
let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
@@ -398,6 +405,14 @@ def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
return N->hasOneUse();
}]>;
+// An 'srl' node with a single use.
+def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+// A 'trunc' node with a single use.
+def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
+ return N->hasOneUse();
+}]>;
// 'shld' and 'shrd' instruction patterns. Note that even though these have
// the srl and shl in their patterns, the C++ code must still check for them,
@@ -767,7 +782,12 @@ def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
[(store GR32:$src, addr:$dst)]>;
-
+
+// A version of MOV8mr that uses i8mem_NOREX so that it can be used for
+// storing h registers, which can't be encoded when a REX prefix is present.
+def MOV8mr_NOREX : I<0x88, MRMDestMem, (outs), (ins i8mem_NOREX:$dst, GR8:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+
//===----------------------------------------------------------------------===//
// Fixed-Register Multiplication and Division Instructions...
//
@@ -2899,6 +2919,18 @@ def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
"movz{wl|x}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (zextloadi32i16 addr:$src))]>, TB;
+// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
+// except that they use GR32_NOREX for the output operand register class
+// instead of GR32. This allows them to operate on h registers on x86-64.
+def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
+ (outs GR32_NOREX:$dst), (ins GR8:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ []>, TB;
+def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
+ (outs GR32_NOREX:$dst), (ins i8mem:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ []>, TB;
+
let neverHasSideEffects = 1 in {
let Defs = [AX], Uses = [AL] in
def CBW : I<0x98, RawFrm, (outs), (ins),
@@ -2935,33 +2967,6 @@ def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins),
[(set GR32:$dst, 0)]>;
}
-// Basic operations on GR16 / GR32 subclasses GR16_ and GR32_ which contains only
-// those registers that have GR8 sub-registers (i.e. AX - DX, EAX - EDX).
-let neverHasSideEffects = 1, isAsCheapAsAMove = 1 in {
-def MOV16to16_ : I<0x89, MRMDestReg, (outs GR16_:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
-def MOV32to32_ : I<0x89, MRMDestReg, (outs GR32_:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
-
-def MOV16_rr : I<0x89, MRMDestReg, (outs GR16_:$dst), (ins GR16_:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
-def MOV32_rr : I<0x89, MRMDestReg, (outs GR32_:$dst), (ins GR32_:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
-} // neverHasSideEffects
-
-let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in {
-def MOV16_rm : I<0x8B, MRMSrcMem, (outs GR16_:$dst), (ins i16mem:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
-def MOV32_rm : I<0x8B, MRMSrcMem, (outs GR32_:$dst), (ins i32mem:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
-}
-let mayStore = 1, neverHasSideEffects = 1 in {
-def MOV16_mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16_:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
-def MOV32_mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32_:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", []>;
-}
-
//===----------------------------------------------------------------------===//
// Thread Local Storage Instructions
//
@@ -3341,38 +3346,61 @@ def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
// r & (2^16-1) ==> movz
def : Pat<(and GR32:$src1, 0xffff),
- (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR32:$src1, x86_subreg_16bit)))>;
+ (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, x86_subreg_16bit))>;
// r & (2^8-1) ==> movz
def : Pat<(and GR32:$src1, 0xff),
- (MOVZX32rr8 (i8 (EXTRACT_SUBREG (MOV32to32_ GR32:$src1),
- x86_subreg_8bit)))>,
+ (MOVZX32rr8 (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR32:$src1, GR32_),
+ x86_subreg_8bit))>,
Requires<[In32BitMode]>;
// r & (2^8-1) ==> movz
def : Pat<(and GR16:$src1, 0xff),
- (MOVZX16rr8 (i8 (EXTRACT_SUBREG (MOV16to16_ GR16:$src1),
- x86_subreg_8bit)))>,
+ (MOVZX16rr8 (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR16:$src1, GR16_),
+ x86_subreg_8bit))>,
Requires<[In32BitMode]>;
// sext_inreg patterns
def : Pat<(sext_inreg GR32:$src, i16),
- (MOVSX32rr16 (i16 (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit)))>;
+ (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit))>;
def : Pat<(sext_inreg GR32:$src, i8),
- (MOVSX32rr8 (i8 (EXTRACT_SUBREG (MOV32to32_ GR32:$src),
- x86_subreg_8bit)))>,
+ (MOVSX32rr8 (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR32:$src, GR32_),
+ x86_subreg_8bit))>,
Requires<[In32BitMode]>;
def : Pat<(sext_inreg GR16:$src, i8),
- (MOVSX16rr8 (i8 (EXTRACT_SUBREG (MOV16to16_ GR16:$src),
- x86_subreg_8bit)))>,
+ (MOVSX16rr8 (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR16:$src, GR16_),
+ x86_subreg_8bit))>,
Requires<[In32BitMode]>;
// trunc patterns
def : Pat<(i16 (trunc GR32:$src)),
- (i16 (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit))>;
+ (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit)>;
def : Pat<(i8 (trunc GR32:$src)),
- (i8 (EXTRACT_SUBREG (MOV32to32_ GR32:$src), x86_subreg_8bit))>,
+ (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR32:$src, GR32_),
+ x86_subreg_8bit)>,
Requires<[In32BitMode]>;
def : Pat<(i8 (trunc GR16:$src)),
- (i8 (EXTRACT_SUBREG (MOV16to16_ GR16:$src), x86_subreg_8bit))>,
+ (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR16:$src, GR16_),
+ x86_subreg_8bit)>,
+ Requires<[In32BitMode]>;
+
+// h-register tricks
+def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
+ (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR16:$src, GR16_),
+ x86_subreg_8bit_hi)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
+ (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR32:$src, GR32_),
+ x86_subreg_8bit_hi)>,
+ Requires<[In32BitMode]>;
+def : Pat<(srl_su GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32rr8
+ (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR16:$src, GR16_),
+ x86_subreg_8bit_hi)),
+ x86_subreg_16bit)>,
+ Requires<[In32BitMode]>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32rr8 (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR32:$src, GR32_),
+ x86_subreg_8bit_hi))>,
Requires<[In32BitMode]>;
// (shl x, 1) ==> (add x, x)
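In 32-bit mode there is no REX complication, so the patterns above only need to constrain the source to GR16_/GR32_, the classes whose registers actually have high-byte subregisters. A shift-right-by-8 that is truncated, or masked with 0xff, then selects to a direct use of %ah/%bh/%ch/%dh. A sketch reproducing bar16 and qux32 from the new h-registers.ll test below:

; With -march=x86 the high byte is used in place, e.g. "incb %ah" here.
define void @bar16(i16 inreg %x, i8* inreg %p) nounwind {
  %t0 = lshr i16 %x, 8
  %t1 = trunc i16 %t0 to i8
  %t2 = add i8 %t1, 1
  store i8 %t2, i8* %p
  ret void
}
; The masked form zero-extends from the high byte, e.g. "movzbl %ah, %eax".
define i32 @qux32(i32 inreg %x) nounwind {
  %t0 = lshr i32 %x, 8
  %t1 = and i32 %t0, 255
  ret i32 %t1
}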
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h
index 4856e2346de..33b9f5edc73 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -35,7 +35,7 @@ namespace X86 {
/// these indices must be kept in sync with the class indices in the
/// X86RegisterInfo.td file.
enum SubregIndex {
- SUBREG_8BIT = 1, SUBREG_16BIT = 2, SUBREG_32BIT = 3
+ SUBREG_8BIT = 1, SUBREG_8BIT_HI = 2, SUBREG_16BIT = 3, SUBREG_32BIT = 4
};
}
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index a7b0f88963b..b323e78cfab 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -49,7 +49,8 @@ let Namespace = "X86" in {
def R14B : Register<"r14b">, DwarfRegNum<[14, -2, -2]>;
def R15B : Register<"r15b">, DwarfRegNum<[15, -2, -2]>;
- // High registers X86-32 only
+ // High registers. On x86-64, these cannot be used in any instruction
+ // with a REX prefix.
def AH : Register<"ah">, DwarfRegNum<[0, 0, 0]>;
def DH : Register<"dh">, DwarfRegNum<[1, 2, 2]>;
def CH : Register<"ch">, DwarfRegNum<[2, 1, 1]>;
@@ -185,41 +186,45 @@ let Namespace = "X86" in {
//
def x86_subreg_8bit : PatLeaf<(i32 1)>;
-def x86_subreg_16bit : PatLeaf<(i32 2)>;
-def x86_subreg_32bit : PatLeaf<(i32 3)>;
+def x86_subreg_8bit_hi : PatLeaf<(i32 2)>;
+def x86_subreg_16bit : PatLeaf<(i32 3)>;
+def x86_subreg_32bit : PatLeaf<(i32 4)>;
def : SubRegSet<1, [AX, CX, DX, BX, SP, BP, SI, DI,
R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W],
[AL, CL, DL, BL, SPL, BPL, SIL, DIL,
R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
-// It's unclear if this subreg set is safe, given that not all registers
-// in the class have an 'H' subreg.
-// def : SubRegSet<2, [AX, CX, DX, BX],
-// [AH, CH, DH, BH]>;
+def : SubRegSet<2, [AX, CX, DX, BX],
+ [AH, CH, DH, BH]>;
def : SubRegSet<1, [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D],
[AL, CL, DL, BL, SPL, BPL, SIL, DIL,
R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
-def : SubRegSet<2, [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
+def : SubRegSet<2, [EAX, ECX, EDX, EBX],
+ [AH, CH, DH, BH]>;
+
+def : SubRegSet<3, [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D],
[AX, CX, DX, BX, SP, BP, SI, DI,
R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]>;
-
def : SubRegSet<1, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
R8, R9, R10, R11, R12, R13, R14, R15],
[AL, CL, DL, BL, SPL, BPL, SIL, DIL,
R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
-def : SubRegSet<2, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
+def : SubRegSet<2, [RAX, RCX, RDX, RBX],
+ [AH, CH, DH, BH]>;
+
+def : SubRegSet<3, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
R8, R9, R10, R11, R12, R13, R14, R15],
[AX, CX, DX, BX, SP, BP, SI, DI,
R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]>;
-
-def : SubRegSet<3, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
+
+def : SubRegSet<4, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
R8, R9, R10, R11, R12, R13, R14, R15],
[EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D]>;
@@ -236,7 +241,11 @@ def : SubRegSet<3, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
// R8B, ... R15B.
// Allocate R12 and R13 last, as these require an extra byte when
// encoded in x86_64 instructions.
-// FIXME: Allow AH, CH, DH, BH in 64-mode for non-REX instructions,
+// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in
+// 64-bit mode. The main complication is that they cannot be encoded in an
+// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc.
+// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
+// cannot be encoded.
def GR8 : RegisterClass<"X86", [i8], 8,
[AL, CL, DL, BL, AH, CH, DH, BH, SIL, DIL, BPL, SPL,
R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B]> {
@@ -295,7 +304,7 @@ def GR8 : RegisterClass<"X86", [i8], 8,
def GR16 : RegisterClass<"X86", [i16], 16,
[AX, CX, DX, SI, DI, BX, BP, SP,
R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W]> {
- let SubRegClassList = [GR8];
+ let SubRegClassList = [GR8, GR8];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
@@ -363,7 +372,7 @@ def GR16 : RegisterClass<"X86", [i16], 16,
def GR32 : RegisterClass<"X86", [i32], 32,
[EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> {
- let SubRegClassList = [GR8, GR16];
+ let SubRegClassList = [GR8, GR8, GR16];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
@@ -431,7 +440,7 @@ def GR32 : RegisterClass<"X86", [i32], 32,
def GR64 : RegisterClass<"X86", [i64], 64,
[RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
RBX, R14, R15, R12, R13, RBP, RSP]> {
- let SubRegClassList = [GR8, GR16, GR32];
+ let SubRegClassList = [GR8, GR8, GR16, GR32];
let MethodProtos = [{
iterator allocation_order_end(const MachineFunction &MF) const;
}];
@@ -452,13 +461,118 @@ def GR64 : RegisterClass<"X86", [i64], 64,
}
-// GR16, GR32 subclasses which contain registers that have GR8 sub-registers.
-// These should only be used for 32-bit mode.
+// GR8_, GR16_, GR32_, GR64_ - Subclasses of GR8, GR16, GR32, and GR64
+// which contain just the "a", "b", "c", and "d" registers. On x86-32,
+// GR16_ and GR32_ are classes for registers that support 8-bit subreg
+// operations. On x86-64, GR16_, GR32_, and GR64_ are classes for registers
+// that support 8-bit h-register operations.
+def GR8_ : RegisterClass<"X86", [i8], 8, [AL, CL, DL, BL]> {
+}
def GR16_ : RegisterClass<"X86", [i16], 16, [AX, CX, DX, BX]> {
- let SubRegClassList = [GR8];
+ let SubRegClassList = [GR8_, GR8_];
}
def GR32_ : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, EBX]> {
- let SubRegClassList = [GR8, GR16];
+ let SubRegClassList = [GR8_, GR8_, GR16_];
+}
+def GR64_ : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RBX]> {
+ let SubRegClassList = [GR8_, GR8_, GR16_, GR32_];
+}
+
+// GR8_NOREX, GR16_NOREX, GR32_NOREX, GR64_NOREX - Subclasses of
+// GR8, GR16, GR32, and GR64 which contain only the first 8 GPRs.
+// On x86-64, GR64_NOREX, GR32_NOREX and GR16_NOREX are the classes
+// of registers which do not by themselves require a REX prefix.
+def GR8_NOREX : RegisterClass<"X86", [i8], 8,
+ [AL, CL, DL, SIL, DIL, BL, BPL, SPL]> {
+}
+def GR16_NOREX : RegisterClass<"X86", [i16], 16,
+ [AX, CX, DX, SI, DI, BX, BP, SP]> {
+ let SubRegClassList = [GR8_NOREX, GR8_NOREX];
+}
+// GR32_NOREX - GR32 registers which do not require a REX prefix.
+def GR32_NOREX : RegisterClass<"X86", [i32], 32,
+ [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP]> {
+ let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate ESP or EBP.
+ static const unsigned X86_GR32_NOREX_AO_fp[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX
+ };
+ // If not, just don't allocate ESP.
+ static const unsigned X86_GR32_NOREX_AO[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP
+ };
+
+ GR32_NOREXClass::iterator
+ GR32_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR32_NOREX_AO_fp;
+ else
+ return X86_GR32_NOREX_AO;
+ }
+
+ GR32_NOREXClass::iterator
+ GR32_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR32_NOREX_AO_fp +
+ (sizeof(X86_GR32_NOREX_AO_fp) / sizeof(unsigned));
+ else
+ return X86_GR32_NOREX_AO +
+ (sizeof(X86_GR32_NOREX_AO) / sizeof(unsigned));
+ }
+ }];
+}
+
+// GR64_NOREX - GR64 registers which do not require a REX prefix.
+def GR64_NOREX : RegisterClass<"X86", [i64], 64,
+ [RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP]> {
+ let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX, GR32_NOREX];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate RSP or RBP.
+ static const unsigned X86_GR64_NOREX_AO_fp[] = {
+ X86::RAX, X86::RCX, X86::RDX, X86::RSI, X86::RDI, X86::RBX
+ };
+ // If not, just don't allocate RSP.
+ static const unsigned X86_GR64_NOREX_AO[] = {
+ X86::RAX, X86::RCX, X86::RDX, X86::RSI, X86::RDI, X86::RBX, X86::RBP
+ };
+
+ GR64_NOREXClass::iterator
+ GR64_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR64_NOREX_AO_fp;
+ else
+ return X86_GR64_NOREX_AO;
+ }
+
+ GR64_NOREXClass::iterator
+ GR64_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR64_NOREX_AO_fp +
+ (sizeof(X86_GR64_NOREX_AO_fp) / sizeof(unsigned));
+ else
+ return X86_GR64_NOREX_AO +
+ (sizeof(X86_GR64_NOREX_AO) / sizeof(unsigned));
+ }
+ }];
}
// A class to support the 'A' assembler constraint: EAX then EDX.
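The NOREX classes above exist because an instruction that references %ah, %bh, %ch, or %dh cannot carry a REX prefix, so every other register operand in that instruction must be encodable without one; that excludes %sil/%dil/%bpl/%spl, r8-r15, and their subregisters. The h-register store pattern depends on this: the address operand of MOV8mr_NOREX comes from GR64_NOREX, so the selected code is, say, "movb %ah, (%rsi)" and never "movb %ah, (%r8)", which has no encoding. The shape that exercises it is foo64 from the new h-register-store.ll test below:

; On x86-64 the high byte of %p is stored through a non-REX base register,
; e.g. "movb %ah, (%rsi)".
define void @foo64(i64 inreg %p, i8* inreg %z) nounwind {
  %q = lshr i64 %p, 8
  %t = trunc i64 %q to i8
  store i8 %t, i8* %z
  ret void
}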
diff --git a/llvm/test/CodeGen/X86/h-register-addressing-32.ll b/llvm/test/CodeGen/X86/h-register-addressing-32.ll
new file mode 100644
index 00000000000..41d91285ddb
--- /dev/null
+++ b/llvm/test/CodeGen/X86/h-register-addressing-32.ll
@@ -0,0 +1,53 @@
+; RUN: llvm-as < %s | llc -march=x86 | grep {movzbl %\[abcd\]h,} | count 7
+
+; Use h-register extract and zero-extend.
+
+define double @foo8(double* nocapture inreg %p, i32 inreg %x) nounwind readonly {
+ %t0 = lshr i32 %x, 8
+ %t1 = and i32 %t0, 255
+ %t2 = getelementptr double* %p, i32 %t1
+ %t3 = load double* %t2, align 8
+ ret double %t3
+}
+define float @foo4(float* nocapture inreg %p, i32 inreg %x) nounwind readonly {
+ %t0 = lshr i32 %x, 8
+ %t1 = and i32 %t0, 255
+ %t2 = getelementptr float* %p, i32 %t1
+ %t3 = load float* %t2, align 8
+ ret float %t3
+}
+define i16 @foo2(i16* nocapture inreg %p, i32 inreg %x) nounwind readonly {
+ %t0 = lshr i32 %x, 8
+ %t1 = and i32 %t0, 255
+ %t2 = getelementptr i16* %p, i32 %t1
+ %t3 = load i16* %t2, align 8
+ ret i16 %t3
+}
+define i8 @foo1(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
+ %t0 = lshr i32 %x, 8
+ %t1 = and i32 %t0, 255
+ %t2 = getelementptr i8* %p, i32 %t1
+ %t3 = load i8* %t2, align 8
+ ret i8 %t3
+}
+define i8 @bar8(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
+ %t0 = lshr i32 %x, 5
+ %t1 = and i32 %t0, 2040
+ %t2 = getelementptr i8* %p, i32 %t1
+ %t3 = load i8* %t2, align 8
+ ret i8 %t3
+}
+define i8 @bar4(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
+ %t0 = lshr i32 %x, 6
+ %t1 = and i32 %t0, 1020
+ %t2 = getelementptr i8* %p, i32 %t1
+ %t3 = load i8* %t2, align 8
+ ret i8 %t3
+}
+define i8 @bar2(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
+ %t0 = lshr i32 %x, 7
+ %t1 = and i32 %t0, 510
+ %t2 = getelementptr i8* %p, i32 %t1
+ %t3 = load i8* %t2, align 8
+ ret i8 %t3
+}
diff --git a/llvm/test/CodeGen/X86/h-register-addressing-64.ll b/llvm/test/CodeGen/X86/h-register-addressing-64.ll
new file mode 100644
index 00000000000..b38e0e478e9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/h-register-addressing-64.ll
@@ -0,0 +1,53 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | grep {movzbl %\[abcd\]h,} | count 7
+
+; Use h-register extract and zero-extend.
+
+define double @foo8(double* nocapture inreg %p, i64 inreg %x) nounwind readonly {
+ %t0 = lshr i64 %x, 8
+ %t1 = and i64 %t0, 255
+ %t2 = getelementptr double* %p, i64 %t1
+ %t3 = load double* %t2, align 8
+ ret double %t3
+}
+define float @foo4(float* nocapture inreg %p, i64 inreg %x) nounwind readonly {
+ %t0 = lshr i64 %x, 8
+ %t1 = and i64 %t0, 255
+ %t2 = getelementptr float* %p, i64 %t1
+ %t3 = load float* %t2, align 8
+ ret float %t3
+}
+define i16 @foo2(i16* nocapture inreg %p, i64 inreg %x) nounwind readonly {
+ %t0 = lshr i64 %x, 8
+ %t1 = and i64 %t0, 255
+ %t2 = getelementptr i16* %p, i64 %t1
+ %t3 = load i16* %t2, align 8
+ ret i16 %t3
+}
+define i8 @foo1(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
+ %t0 = lshr i64 %x, 8
+ %t1 = and i64 %t0, 255
+ %t2 = getelementptr i8* %p, i64 %t1
+ %t3 = load i8* %t2, align 8
+ ret i8 %t3
+}
+define i8 @bar8(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
+ %t0 = lshr i64 %x, 5
+ %t1 = and i64 %t0, 2040
+ %t2 = getelementptr i8* %p, i64 %t1
+ %t3 = load i8* %t2, align 8
+ ret i8 %t3
+}
+define i8 @bar4(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
+ %t0 = lshr i64 %x, 6
+ %t1 = and i64 %t0, 1020
+ %t2 = getelementptr i8* %p, i64 %t1
+ %t3 = load i8* %t2, align 8
+ ret i8 %t3
+}
+define i8 @bar2(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
+ %t0 = lshr i64 %x, 7
+ %t1 = and i64 %t0, 510
+ %t2 = getelementptr i8* %p, i64 %t1
+ %t3 = load i8* %t2, align 8
+ ret i8 %t3
+}
diff --git a/llvm/test/CodeGen/X86/h-register-store.ll b/llvm/test/CodeGen/X86/h-register-store.ll
new file mode 100644
index 00000000000..e8672422a7b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/h-register-store.ll
@@ -0,0 +1,27 @@
+; RUN: llvm-as < %s | llc -march=x86-64 > %t
+; RUN: grep mov %t | count 6
+; RUN: grep {movb %ah, (%rsi)} %t | count 3
+; RUN: llvm-as < %s | llc -march=x86 > %t
+; RUN: grep mov %t | count 3
+; RUN: grep {movb %ah, (%e} %t | count 3
+
+; Use h-register extract and store.
+
+define void @foo16(i16 inreg %p, i8* inreg %z) nounwind {
+ %q = lshr i16 %p, 8
+ %t = trunc i16 %q to i8
+ store i8 %t, i8* %z
+ ret void
+}
+define void @foo32(i32 inreg %p, i8* inreg %z) nounwind {
+ %q = lshr i32 %p, 8
+ %t = trunc i32 %q to i8
+ store i8 %t, i8* %z
+ ret void
+}
+define void @foo64(i64 inreg %p, i8* inreg %z) nounwind {
+ %q = lshr i64 %p, 8
+ %t = trunc i64 %q to i8
+ store i8 %t, i8* %z
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/h-registers.ll b/llvm/test/CodeGen/X86/h-registers.ll
new file mode 100644
index 00000000000..2777be9cc3e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/h-registers.ll
@@ -0,0 +1,48 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | grep {movzbl %\[abcd\]h,} | count 4
+; RUN: llvm-as < %s | llc -march=x86 > %t
+; RUN: grep {incb %ah} %t | count 3
+; RUN: grep {movzbl %ah,} %t | count 3
+
+; Use h registers. On x86-64, codegen doesn't support general allocation
+; of h registers yet, due to x86 encoding complications.
+
+define void @bar64(i64 inreg %x, i8* inreg %p) nounwind {
+ %t0 = lshr i64 %x, 8
+ %t1 = trunc i64 %t0 to i8
+ %t2 = add i8 %t1, 1
+ store i8 %t2, i8* %p
+ ret void
+}
+
+define void @bar32(i32 inreg %x, i8* inreg %p) nounwind {
+ %t0 = lshr i32 %x, 8
+ %t1 = trunc i32 %t0 to i8
+ %t2 = add i8 %t1, 1
+ store i8 %t2, i8* %p
+ ret void
+}
+
+define void @bar16(i16 inreg %x, i8* inreg %p) nounwind {
+ %t0 = lshr i16 %x, 8
+ %t1 = trunc i16 %t0 to i8
+ %t2 = add i8 %t1, 1
+ store i8 %t2, i8* %p
+ ret void
+}
+
+define i64 @qux64(i64 inreg %x) nounwind {
+ %t0 = lshr i64 %x, 8
+ %t1 = and i64 %t0, 255
+ ret i64 %t1
+}
+
+define i32 @qux32(i32 inreg %x) nounwind {
+ %t0 = lshr i32 %x, 8
+ %t1 = and i32 %t0, 255
+ ret i32 %t1
+}
+
+define i16 @qux16(i16 inreg %x) nounwind {
+ %t0 = lshr i16 %x, 8
+ ret i16 %t0
+}
diff --git a/llvm/test/CodeGen/X86/inline-asm-out-regs.ll b/llvm/test/CodeGen/X86/inline-asm-out-regs.ll
index 3a84bad94d0..01f1397830a 100644
--- a/llvm/test/CodeGen/X86/inline-asm-out-regs.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-out-regs.ll
@@ -1,6 +1,4 @@
; RUN: llvm-as < %s | llc -mtriple=i386-unknown-linux-gnu
-; XFAIL: *
-; Expected to run out of registers during allocation.
; PR3391
@pci_indirect = external global { } ; <{ }*> [#uses=1]