diff options
author | JF Bastien <jfb@google.com> | 2015-08-05 21:04:59 +0000 |
---|---|---|
committer | JF Bastien <jfb@google.com> | 2015-08-05 21:04:59 +0000 |
commit | 8662083770b34a7e62c8a3ae01f6b2e5be417417 (patch) | |
tree | ccd4d0d937fd080842d39d17cff41444e95234d4 /llvm/lib/Target/X86 | |
parent | 8ef3cda3983842531d09e3ef7b445141b626677c (diff) | |
download | bcm5719-llvm-8662083770b34a7e62c8a3ae01f6b2e5be417417.tar.gz bcm5719-llvm-8662083770b34a7e62c8a3ae01f6b2e5be417417.zip |
x86 atomic: optimize a.store(reg op a.load(acquire), release)
Summary: PR24191 finds that the expected memory-register operations aren't generated when relaxed { load ; modify ; store } is used. This is similar to PR17281 which was addressed in D4796, but only for memory-immediate operations (and for memory orderings up to acquire and release). This patch also handles some floating-point operations.
Reviewers: reames, kcc, dvyukov, nadav, morisset, chandlerc, t.p.northover, pete
Subscribers: llvm-commits
Differential Revision: http://reviews.llvm.org/D11382
llvm-svn: 244128
Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 43 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 3 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86InstrCompiler.td | 75 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86MCInstLower.cpp | 12 |
4 files changed, 114 insertions, 19 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index c9244efdc39..b8f132c28f2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -20133,6 +20133,45 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, } MachineBasicBlock * +X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI, + MachineBasicBlock *BB) const { + // Combine the following atomic floating-point modification pattern: + // a.store(reg OP a.load(acquire), release) + // Transform them into: + // OPss (%gpr), %xmm + // movss %xmm, (%gpr) + // Or sd equivalent for 64-bit operations. + unsigned MOp, FOp; + switch (MI->getOpcode()) { + default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); + case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break; + case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break; + } + const X86InstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + unsigned MSrc = MI->getOperand(0).getReg(); + unsigned VSrc = MI->getOperand(5).getReg(); + MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp)) + .addReg(/*Base=*/MSrc) + .addImm(/*Scale=*/1) + .addReg(/*Index=*/0) + .addImm(0) + .addReg(0); + MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp), + MRI.createVirtualRegister(MRI.getRegClass(VSrc))) + .addReg(VSrc) + .addReg(/*Base=*/MSrc) + .addImm(/*Scale=*/1) + .addReg(/*Index=*/0) + .addImm(/*Disp=*/0) + .addReg(/*Segment=*/0); + MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill); + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); @@ -20687,6 +20726,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::CMOV_V64I1: return EmitLoweredSelect(MI, BB); + case X86::RELEASE_FADD32mr: + case X86::RELEASE_FADD64mr: + return EmitLoweredAtomicFP(MI, BB); + case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index edf942121f8..837fe8cfcf7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1080,6 +1080,9 @@ namespace llvm { MachineBasicBlock *EmitLoweredSelect(MachineInstr *I, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr *I, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const; diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 49dc3189f09..083e80de7e8 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -752,6 +752,8 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", /* The following multiclass tries to make sure that in code like * x.store (immediate op x.load(acquire), release) + * and + * x.store (register op x.load(acquire), release) * an operation directly on memory is generated instead of wasting a register. * It is not automatic as atomic_store/load are only lowered to MOV instructions * extremely late to prevent them from being accidentally reordered in the backend @@ -759,19 +761,31 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", */ multiclass RELEASE_BINOP_MI<string op> { def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), - "#RELEASE_BINOP PSEUDO!", + "#BINOP "#NAME#"8mi PSEUDO!", [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op) (atomic_load_8 addr:$dst), (i8 imm:$src)))]>; + def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src), + "#BINOP "#NAME#"8mr PSEUDO!", + [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op) + (atomic_load_8 addr:$dst), GR8:$src))]>; // NAME#16 is not generated as 16-bit arithmetic instructions are considered // costly and avoided as far as possible by this backend anyway def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), - "#RELEASE_BINOP PSEUDO!", + "#BINOP "#NAME#"32mi PSEUDO!", [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op) (atomic_load_32 addr:$dst), (i32 imm:$src)))]>; + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op) + (atomic_load_32 addr:$dst), GR32:$src))]>; def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), - "#RELEASE_BINOP PSEUDO!", + "#BINOP "#NAME#"64mi32 PSEUDO!", [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op) (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op) + (atomic_load_64 addr:$dst), GR64:$src))]>; } defm RELEASE_ADD : RELEASE_BINOP_MI<"add">; defm RELEASE_AND : RELEASE_BINOP_MI<"and">; @@ -780,18 +794,41 @@ defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">; // Note: we don't deal with sub, because substractions of constants are // optimized into additions before this code can run +// Same as above, but for floating-point. +// FIXME: imm version. +// FIXME: Version that doesn't clobber $src, using AVX's VADDSS. +// FIXME: This could also handle SIMD operations with *ps and *pd instructions. +let usesCustomInserter = 1 in { +multiclass RELEASE_FP_BINOP_MI<string op> { + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, + (i32 (bitconvert (!cast<PatFrag>(op) + (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))), + FR32:$src))))]>, Requires<[HasSSE1]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, + (i64 (bitconvert (!cast<PatFrag>(op) + (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))), + FR64:$src))))]>, Requires<[HasSSE2]>; +} +defm RELEASE_FADD : RELEASE_FP_BINOP_MI<"fadd">; +// FIXME: Add fsub, fmul, fdiv, ... +} + multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> { def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"8m PSEUDO!", [(atomic_store_8 addr:$dst, dag8)]>; def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"16m PSEUDO!", [(atomic_store_16 addr:$dst, dag16)]>; def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"32m PSEUDO!", [(atomic_store_32 addr:$dst, dag32)]>; def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"64m PSEUDO!", [(atomic_store_64 addr:$dst, dag64)]>; } @@ -821,42 +858,42 @@ defm RELEASE_NOT : RELEASE_UNOP< */ def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV8mi PSEUDO!", [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV16mi PSEUDO!", [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV32mi PSEUDO!", [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV64mi32 PSEUDO!", [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV8mr PSEUDO!", [(atomic_store_8 addr:$dst, GR8 :$src)]>; def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV16mr PSEUDO!", [(atomic_store_16 addr:$dst, GR16:$src)]>; def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV32mr PSEUDO!", [(atomic_store_32 addr:$dst, GR32:$src)]>; def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV64mr PSEUDO!", [(atomic_store_64 addr:$dst, GR64:$src)]>; def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV8rm PSEUDO!", [(set GR8:$dst, (atomic_load_8 addr:$src))]>; def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV16rm PSEUDO!", [(set GR16:$dst, (atomic_load_16 addr:$src))]>; def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV32rm PSEUDO!", [(set GR32:$dst, (atomic_load_32 addr:$src))]>; def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV64rm PSEUDO!", [(set GR64:$dst, (atomic_load_64 addr:$src))]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 68977d38e15..2b30ce6d362 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -598,17 +598,29 @@ ReSimplify: case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify; case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify; case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify; + case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify; case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify; + case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify; case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify; + case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify; case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify; + case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify; case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify; + case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify; case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify; + case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify; case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify; + case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify; case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify; + case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify; case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify; + case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify; case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify; + case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify; case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify; + case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify; case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify; + case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify; case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify; case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify; case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify; |