Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JIT ARM64-SVE: Add AddAcross #101674

Merged
merged 5 commits into from
May 1, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
16 changes: 8 additions & 8 deletions src/coreclr/jit/codegenarm64test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5314,11 +5314,11 @@ void CodeGen::genArm64EmitterUnitTestsSve()
#endif // ALL_ARM64_EMITTER_UNIT_TESTS_SVE_UNSUPPORTED

// IF_SVE_AI_3A
theEmitter->emitIns_R_R_R(INS_sve_saddv, EA_1BYTE, REG_V1, REG_P4, REG_V2,
theEmitter->emitIns_R_R_R(INS_sve_saddv, EA_SCALABLE, REG_V1, REG_P4, REG_V2,
Copy link
Contributor Author

@a74nh a74nh Apr 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All the codegen changes:

For these instructions, arg2 (EA_1BYTE, etc.) is never used, because the return value depends on the input type, which is already specified in opt.
Switching arg2 to EA_SCALABLE means there is no need to write special hwintrinsiccodegen code.

I've changed the bare minimum of instructions needed to make this patch work. There are quite a few more reduction-like instructions; we should do those as we get to them in the API.

INS_OPTS_SCALABLE_B); // SADDV <Dd>, <Pg>, <Zn>.<T>
theEmitter->emitIns_R_R_R(INS_sve_saddv, EA_2BYTE, REG_V2, REG_P5, REG_V3,
theEmitter->emitIns_R_R_R(INS_sve_saddv, EA_SCALABLE, REG_V2, REG_P5, REG_V3,
INS_OPTS_SCALABLE_H); // SADDV <Dd>, <Pg>, <Zn>.<T>
theEmitter->emitIns_R_R_R(INS_sve_uaddv, EA_4BYTE, REG_V3, REG_P6, REG_V4,
theEmitter->emitIns_R_R_R(INS_sve_uaddv, EA_SCALABLE, REG_V3, REG_P6, REG_V4,
INS_OPTS_SCALABLE_S); // UADDV <Dd>, <Pg>, <Zn>.<T>

// IF_SVE_AJ_3A
Expand Down Expand Up @@ -6768,15 +6768,15 @@ void CodeGen::genArm64EmitterUnitTestsSve()
#endif // ALL_ARM64_EMITTER_UNIT_TESTS_SVE_UNSUPPORTED

// IF_SVE_HE_3A
theEmitter->emitIns_R_R_R(INS_sve_faddv, EA_2BYTE, REG_V21, REG_P7, REG_V7,
theEmitter->emitIns_R_R_R(INS_sve_faddv, EA_SCALABLE, REG_V21, REG_P7, REG_V7,
INS_OPTS_SCALABLE_H); // FADDV <V><d>, <Pg>, <Zn>.<T>
theEmitter->emitIns_R_R_R(INS_sve_fmaxnmv, EA_2BYTE, REG_V22, REG_P6, REG_V6,
theEmitter->emitIns_R_R_R(INS_sve_fmaxnmv, EA_SCALABLE, REG_V22, REG_P6, REG_V6,
INS_OPTS_SCALABLE_H); // FMAXNMV <V><d>, <Pg>, <Zn>.<T>
theEmitter->emitIns_R_R_R(INS_sve_fmaxv, EA_4BYTE, REG_V23, REG_P5, REG_V5,
theEmitter->emitIns_R_R_R(INS_sve_fmaxv, EA_SCALABLE, REG_V23, REG_P5, REG_V5,
INS_OPTS_SCALABLE_S); // FMAXV <V><d>, <Pg>, <Zn>.<T>
theEmitter->emitIns_R_R_R(INS_sve_fminnmv, EA_8BYTE, REG_V24, REG_P4, REG_V4,
theEmitter->emitIns_R_R_R(INS_sve_fminnmv, EA_SCALABLE, REG_V24, REG_P4, REG_V4,
INS_OPTS_SCALABLE_D); // FMINNMV <V><d>, <Pg>, <Zn>.<T>
theEmitter->emitIns_R_R_R(INS_sve_fminv, EA_4BYTE, REG_V25, REG_P3, REG_V3,
theEmitter->emitIns_R_R_R(INS_sve_fminv, EA_SCALABLE, REG_V25, REG_P3, REG_V3,
INS_OPTS_SCALABLE_S); // FMINV <V><d>, <Pg>, <Zn>.<T>

// IF_SVE_HQ_3A
Expand Down
29 changes: 23 additions & 6 deletions src/coreclr/jit/emitarm64sve.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3060,7 +3060,6 @@ void emitter::emitInsSve_R_R_R(instruction ins,
break;

case INS_sve_saddv:
case INS_sve_uaddv:
assert(isFloatReg(reg1));
assert(isLowPredicateRegister(reg2));
assert(isVectorRegister(reg3));
Expand All @@ -3069,6 +3068,15 @@ void emitter::emitInsSve_R_R_R(instruction ins,
fmt = IF_SVE_AI_3A;
break;

case INS_sve_uaddv:
assert(isFloatReg(reg1));
assert(isLowPredicateRegister(reg2));
assert(isVectorRegister(reg3));
assert(insOptsScalableStandard(opt));
assert(insScalableOptsNone(sopt));
fmt = IF_SVE_AI_3A;
break;

case INS_sve_addqv:
unreached(); // TODO-SVE: Not yet supported.
assert(isVectorRegister(reg1));
Expand Down Expand Up @@ -4059,7 +4067,7 @@ void emitter::emitInsSve_R_R_R(instruction ins,
assert(isLowPredicateRegister(reg2));
assert(isVectorRegister(reg3));
assert(insOptsScalableFloat(opt));
assert(isValidVectorElemsizeSveFloat(size));
assert(isScalableVectorSize(size));
assert(insScalableOptsNone(sopt));
fmt = IF_SVE_HE_3A;
break;
Expand All @@ -4069,7 +4077,7 @@ void emitter::emitInsSve_R_R_R(instruction ins,
assert(isLowPredicateRegister(reg2));
assert(isVectorRegister(reg3));
assert(insOptsScalableFloat(opt));
assert(isValidVectorElemsizeSveFloat(size));
assert(isScalableVectorSize(size));
assert(insScalableOptsNone(sopt));
fmt = IF_SVE_HJ_3A;
break;
Expand Down Expand Up @@ -12618,7 +12626,7 @@ void emitter::emitInsSveSanityCheck(instrDesc* id)
assert(isVectorRegister(id->idReg1())); // ddddd
assert(isLowPredicateRegister(id->idReg2())); // ggg
assert(isVectorRegister(id->idReg3())); // mmmmm
assert(isValidVectorElemsizeSveFloat(id->idOpSize()));
assert(isScalableVectorSize(id->idOpSize()));
break;

// Scalable to general register.
Expand Down Expand Up @@ -13211,11 +13219,20 @@ void emitter::emitInsSveSanityCheck(instrDesc* id)

// Scalable, widening to scalar SIMD.
case IF_SVE_AI_3A: // ........xx...... ...gggnnnnnddddd -- SVE integer add reduction (predicated)
assert(insOptsScalableWide(id->idInsOpt())); // xx
switch (id->idIns())
{
case INS_sve_saddv:
assert(insOptsScalableWide(id->idInsOpt())); // xx
break;

default:
assert(insOptsScalableStandard(id->idInsOpt())); // xx
break;
}
assert(isVectorRegister(id->idReg1())); // ddddd
assert(isLowPredicateRegister(id->idReg2())); // ggg
assert(isVectorRegister(id->idReg3())); // mmmmm
assert(isValidVectorElemsizeWidening(id->idOpSize()));
assert(isScalableVectorSize(id->idOpSize()));
break;

// Scalable, possibly FP.
Expand Down
145 changes: 73 additions & 72 deletions src/coreclr/jit/hwintrinsic.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,175 +70,176 @@ enum HWIntrinsicCategory : uint8_t
#else
#error Unsupported platform
#endif

enum HWIntrinsicFlag : unsigned int
{
HW_Flag_NoFlag = 0,

// Commutative
// - if a binary-op intrinsic is commutative (e.g., Add, Multiply), its op1 can be contained
HW_Flag_Commutative = 0x1,
HW_Flag_Commutative = (1 << 0),
a74nh marked this conversation as resolved.
Show resolved Hide resolved

// NoCodeGen
// - should be transformed in the compiler front-end, cannot reach CodeGen
HW_Flag_NoCodeGen = 0x2,
HW_Flag_NoCodeGen = (1 << 1),

// Multi-instruction
// - that one intrinsic can generate multiple instructions
HW_Flag_MultiIns = 0x4,
HW_Flag_MultiIns = (1 << 2),

// Select base type using the first argument type
HW_Flag_BaseTypeFromFirstArg = 0x8,
HW_Flag_BaseTypeFromFirstArg = (1 << 3),

// Select base type using the second argument type
HW_Flag_BaseTypeFromSecondArg = 0x10,
HW_Flag_BaseTypeFromSecondArg = (1 << 4),

// Indicates compFloatingPointUsed does not need to be set.
HW_Flag_NoFloatingPointUsed = 0x20,
HW_Flag_NoFloatingPointUsed = (1 << 5),

// NoJmpTable IMM
// the imm intrinsic does not need jumptable fallback when it gets non-const argument
HW_Flag_NoJmpTableIMM = 0x40,
HW_Flag_NoJmpTableIMM = (1 << 6),

// Special codegen
// the intrinsics need special rules in CodeGen,
// but may be table-driven in the front-end
HW_Flag_SpecialCodeGen = 0x80,
HW_Flag_SpecialCodeGen = (1 << 7),

// Special import
// the intrinsics need special rules in importer,
// but may be table-driven in the back-end
HW_Flag_SpecialImport = 0x100,
HW_Flag_SpecialImport = (1 << 8),

// The intrinsic returns result in multiple registers.
HW_Flag_MultiReg = 0x200,
HW_Flag_MultiReg = (1 << 9),

// The intrinsic has some barrier special side effect that should be tracked
HW_Flag_SpecialSideEffect_Barrier = (1 << 10),

// The intrinsic has some other special side effect that should be tracked
HW_Flag_SpecialSideEffect_Other = (1 << 11),

HW_Flag_SpecialSideEffectMask = (HW_Flag_SpecialSideEffect_Barrier | HW_Flag_SpecialSideEffect_Other),

// The below is for defining platform-specific flags
// MaybeNoJmpTable IMM
// the imm intrinsic may not need jumptable fallback when it gets non-const argument
HW_Flag_MaybeNoJmpTableIMM = (1 << 12),

HW_Flag_CanBenefitFromConstantProp = (1 << 13),

// Used as a base for shifting the platform specific flags.
HW_Flag_PlatformBase = 13,
#define HW_TARGET_FLAG(id) (unsigned int)(1 << (id + HW_Flag_PlatformBase))

// Platform-specific flags
#if defined(TARGET_XARCH)
// Full range IMM intrinsic
// - the immediate value is valid on the full range of imm8 (0-255)
HW_Flag_FullRangeIMM = 0x400,
HW_Flag_FullRangeIMM = HW_TARGET_FLAG(1),

// Maybe IMM
// the intrinsic has either imm or Vector overloads
HW_Flag_MaybeIMM = 0x800,
HW_Flag_MaybeIMM = HW_TARGET_FLAG(2),

// Copy Upper bits
// some SIMD scalar intrinsics need the semantics of copying upper bits from the source operand
HW_Flag_CopyUpperBits = 0x1000,
HW_Flag_CopyUpperBits = HW_TARGET_FLAG(3),

// Maybe Memory Load/Store
// - some intrinsics may have pointer overloads but without HW_Category_MemoryLoad/HW_Category_MemoryStore
HW_Flag_MaybeMemoryLoad = 0x2000,
HW_Flag_MaybeMemoryStore = 0x4000,
HW_Flag_MaybeMemoryLoad = HW_TARGET_FLAG(4),
HW_Flag_MaybeMemoryStore = HW_TARGET_FLAG(5),

// No Read/Modify/Write Semantics
// the intrinsic doesn't have read/modify/write semantics in two/three-operand form.
HW_Flag_NoRMWSemantics = 0x8000,
HW_Flag_NoRMWSemantics = HW_TARGET_FLAG(6),

// NoContainment
// the intrinsic cannot be handled by containment,
// all the intrinsic that have explicit memory load/store semantics should have this flag
HW_Flag_NoContainment = 0x10000,
HW_Flag_NoContainment = HW_TARGET_FLAG(7),

// Returns Per-Element Mask
// the intrinsic returns a vector containing elements that are either "all bits set" or "all bits clear"
// this output can be used as a per-element mask
HW_Flag_ReturnsPerElementMask = 0x20000,
HW_Flag_ReturnsPerElementMask = HW_TARGET_FLAG(8),

// AvxOnlyCompatible
// the intrinsic can be used on hardware with AVX but not AVX2 support
HW_Flag_AvxOnlyCompatible = 0x40000,
HW_Flag_AvxOnlyCompatible = HW_TARGET_FLAG(9),

// MaybeCommutative
// - if a binary-op intrinsic is maybe commutative (e.g., Max or Min for float/double), its op1 can possibly be
// contained
HW_Flag_MaybeCommutative = 0x80000,
HW_Flag_MaybeCommutative = HW_TARGET_FLAG(10),

// The intrinsic has no EVEX compatible form
HW_Flag_NoEvexSemantics = 0x100000,
HW_Flag_NoEvexSemantics = HW_TARGET_FLAG(11),

// The intrinsic is an RMW intrinsic
HW_Flag_RmwIntrinsic = HW_TARGET_FLAG(12),

// The intrinsic is a FusedMultiplyAdd intrinsic
HW_Flag_FmaIntrinsic = HW_TARGET_FLAG(13),

// The intrinsic is a PermuteVar2x intrinsic
HW_Flag_PermuteVar2x = HW_TARGET_FLAG(14),

// The intrinsic is an embedded broadcast compatible intrinsic
HW_Flag_EmbBroadcastCompatible = HW_TARGET_FLAG(15),

// The intrinsic is an embedded rounding compatible intrinsic
HW_Flag_EmbRoundingCompatible = HW_TARGET_FLAG(16),

// The intrinsic is an embedded masking incompatible intrinsic
HW_Flag_EmbMaskingIncompatible = HW_TARGET_FLAG(17),

#elif defined(TARGET_ARM64)
// The intrinsic has an immediate operand
// - the value can be (and should be) encoded in a corresponding instruction when the operand value is constant
HW_Flag_HasImmediateOperand = 0x400,
HW_Flag_HasImmediateOperand = HW_TARGET_FLAG(1),

// The intrinsic has read/modify/write semantics in multiple-operands form.
HW_Flag_HasRMWSemantics = 0x800,
HW_Flag_HasRMWSemantics = HW_TARGET_FLAG(2),

// The intrinsic operates on the lower part of a SIMD register
// - the upper part of the source registers are ignored
// - the upper part of the destination register is zeroed
HW_Flag_SIMDScalar = 0x1000,
HW_Flag_SIMDScalar = HW_TARGET_FLAG(3),

// The intrinsic supports some sort of containment analysis
HW_Flag_SupportsContainment = 0x2000,
HW_Flag_SupportsContainment = HW_TARGET_FLAG(4),

// The intrinsic needs consecutive registers
HW_Flag_NeedsConsecutiveRegisters = 0x4000,
HW_Flag_NeedsConsecutiveRegisters = HW_TARGET_FLAG(5),

// The intrinsic uses scalable registers
HW_Flag_Scalable = 0x8000,
HW_Flag_Scalable = HW_TARGET_FLAG(6),

// Returns Per-Element Mask
// the intrinsic returns a vector containing elements that are either "all bits set" or "all bits clear"
// this output can be used as a per-element mask
HW_Flag_ReturnsPerElementMask = 0x10000,
HW_Flag_ReturnsPerElementMask = HW_TARGET_FLAG(7),

// The intrinsic uses a mask in arg1 to select elements present in the result
HW_Flag_ExplicitMaskedOperation = 0x20000,
HW_Flag_ExplicitMaskedOperation = HW_TARGET_FLAG(8),

// The intrinsic uses a mask in arg1 to select elements present in the result, and must use a low register.
HW_Flag_LowMaskedOperation = 0x40000,
HW_Flag_LowMaskedOperation = HW_TARGET_FLAG(9),

// The intrinsic can optionally use a mask in arg1 to select elements present in the result, which is not present in
// the API call
HW_Flag_OptionalEmbeddedMaskedOperation = 0x80000,
HW_Flag_OptionalEmbeddedMaskedOperation = HW_TARGET_FLAG(10),

// The intrinsic uses a mask in arg1 to select elements present in the result, which is not present in the API call
HW_Flag_EmbeddedMaskedOperation = 0x100000,
HW_Flag_EmbeddedMaskedOperation = HW_TARGET_FLAG(11),

// The intrinsic has an enum operand. Using this implies HW_Flag_HasImmediateOperand.
HW_Flag_HasEnumOperand = HW_TARGET_FLAG(12),

#else
#error Unsupported platform
#endif

// The intrinsic has some barrier special side effect that should be tracked
HW_Flag_SpecialSideEffect_Barrier = 0x200000,

// The intrinsic has some other special side effect that should be tracked
HW_Flag_SpecialSideEffect_Other = 0x400000,

HW_Flag_SpecialSideEffectMask = (HW_Flag_SpecialSideEffect_Barrier | HW_Flag_SpecialSideEffect_Other),

// MaybeNoJmpTable IMM
// the imm intrinsic may not need jumptable fallback when it gets non-const argument
HW_Flag_MaybeNoJmpTableIMM = 0x800000,

#if defined(TARGET_XARCH)
// The intrinsic is an RMW intrinsic
HW_Flag_RmwIntrinsic = 0x1000000,

// The intrinsic is a FusedMultiplyAdd intrinsic
HW_Flag_FmaIntrinsic = 0x2000000,

// The intrinsic is a PermuteVar2x intrinsic
HW_Flag_PermuteVar2x = 0x4000000,

// The intrinsic is an embedded broadcast compatible intrinsic
HW_Flag_EmbBroadcastCompatible = 0x8000000,

// The intrinsic is an embedded rounding compatible intrinsic
HW_Flag_EmbRoundingCompatible = 0x10000000,

// The intrinsic is an embedded masking incompatible intrinsic
HW_Flag_EmbMaskingIncompatible = 0x20000000,
#elif defined(TARGET_ARM64)

// The intrinsic has an enum operand. Using this implies HW_Flag_HasImmediateOperand.
HW_Flag_HasEnumOperand = 0x1000000,

#endif // TARGET_XARCH

HW_Flag_CanBenefitFromConstantProp = 0x80000000,
};

#if defined(TARGET_XARCH)
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
// Sve
HARDWARE_INTRINSIC(Sve, Abs, -1, -1, false, {INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_fabs, INS_sve_fabs}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation)
HARDWARE_INTRINSIC(Sve, Add, -1, -1, false, {INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_fadd, INS_sve_fadd}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, AddAcross, -1, 1, true, {INS_sve_saddv, INS_sve_uaddv, INS_sve_saddv, INS_sve_uaddv, INS_sve_saddv, INS_sve_uaddv, INS_sve_uaddv, INS_sve_uaddv, INS_sve_faddv, INS_sve_faddv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation)
HARDWARE_INTRINSIC(Sve, ConditionalSelect, -1, 3, true, {INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_SupportsContainment)
HARDWARE_INTRINSIC(Sve, Count16BitElements, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_cnth, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_Scalable|HW_Flag_HasEnumOperand|HW_Flag_SpecialCodeGen|HW_Flag_NoFloatingPointUsed)
HARDWARE_INTRINSIC(Sve, Count32BitElements, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_cntw, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_Scalable|HW_Flag_HasEnumOperand|HW_Flag_SpecialCodeGen|HW_Flag_NoFloatingPointUsed)
Expand Down