Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

zstd: Shorter and faster asm for decSymbol.newState #896

Merged
merged 2 commits into from
Dec 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
39 changes: 12 additions & 27 deletions zstd/_generate/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,30 +316,30 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute
lowBits := GP64()
BZHIQ(nBits, bits, lowBits) // lowBits = bits & ((1 << nBits) - 1)
SHRXQ(nBits, bits, bits) // bits >>= nBits
o.nextState(name+"_ofState", ofState, lowBits, "ofTable")
o.nextState(ofState, lowBits, "ofTable")
}
Comment("Update Match Length State")
{
nBits := mlState
lowBits := GP64()
BZHIQ(nBits, bits, lowBits) // lowBits = bits & ((1 << nBits) - 1)
SHRXQ(nBits, bits, bits) // bits >>= nBits
o.nextState(name+"_mlState", mlState, lowBits, "mlTable")
o.nextState(mlState, lowBits, "mlTable")
}
Comment("Update Literal Length State")
{
nBits := llState
lowBits := GP64()
BZHIQ(nBits, bits, lowBits) // lowBits = bits & ((1 << nBits) - 1)
o.nextState(name+"_llState", llState, lowBits, "llTable")
o.nextState(llState, lowBits, "llTable")
}
} else {
Comment("Update Literal Length State")
o.updateState(name+"_llState", llState, brValue, brBitsRead, "llTable")
o.updateState(llState, brValue, brBitsRead, "llTable")
Comment("Update Match Length State")
o.updateState(name+"_mlState", mlState, brValue, brBitsRead, "mlTable")
o.updateState(mlState, brValue, brBitsRead, "mlTable")
Comment("Update Offset State")
o.updateState(name+"_ofState", ofState, brValue, brBitsRead, "ofTable")
o.updateState(ofState, brValue, brBitsRead, "ofTable")
}
}
Label(name + "_skip_update")
Expand Down Expand Up @@ -631,8 +631,7 @@ func (o options) updateLength(name string, brValue, brBitsRead, state reg.GPVirt
}
}

func (o options) updateState(name string, state, brValue, brBitsRead reg.GPVirtual, table string) {
name = name + "_updateState"
func (o options) updateState(state, brValue, brBitsRead reg.GPVirtual, table string) {
AX := GP64()
MOVBQZX(state.As8(), AX) // AX = nBits
// Check we have a reasonable nBits
Expand All @@ -642,15 +641,8 @@ func (o options) updateState(name string, state, brValue, brBitsRead reg.GPVirtu
})

DX := GP64()
if o.bmi2 {
tmp := GP64()
MOVQ(U32(16|(16<<8)), tmp)
BEXTRQ(tmp, state, DX)
} else {
MOVQ(state, DX)
SHRQ(U8(16), DX)
MOVWQZX(DX.As16(), DX)
}
MOVL(state.As32(), DX.As32()) // Clear the top 32 bits.
SHRL(U8(16), DX.As32())

{
lowBits := o.getBits(AX, brValue, brBitsRead)
Expand Down Expand Up @@ -681,17 +673,10 @@ func (o options) updateState(name string, state, brValue, brBitsRead reg.GPVirtu
MOVQ(Mem{Base: tablePtr, Index: DX, Scale: 8}, state)
}

func (o options) nextState(name string, state, lowBits reg.GPVirtual, table string) {
func (o options) nextState(state, lowBits reg.GPVirtual, table string) {
DX := GP64()
if o.bmi2 {
tmp := GP64()
MOVQ(U32(16|(16<<8)), tmp)
BEXTRQ(tmp, state, DX)
} else {
MOVQ(state, DX)
SHRQ(U8(16), DX)
MOVWQZX(DX.As16(), DX)
}
MOVL(state.As32(), DX.As32()) // Clear the top 32 bits.
SHRL(U8(16), DX.As32())

ADDQ(lowBits, DX)

Expand Down
136 changes: 56 additions & 80 deletions zstd/seqdec_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,7 @@ sequenceDecs_decode_amd64_ll_update_zero:

// Update Literal Length State
MOVBQZX DI, R14
SHRQ $0x10, DI
MOVWQZX DI, DI
SHRL $0x10, DI
LEAQ (BX)(R14*1), CX
MOVQ DX, R15
MOVQ CX, BX
Expand All @@ -177,8 +176,7 @@ sequenceDecs_decode_amd64_ll_update_zero:

// Update Match Length State
MOVBQZX R8, R14
SHRQ $0x10, R8
MOVWQZX R8, R8
SHRL $0x10, R8
LEAQ (BX)(R14*1), CX
MOVQ DX, R15
MOVQ CX, BX
Expand All @@ -197,8 +195,7 @@ sequenceDecs_decode_amd64_ll_update_zero:

// Update Offset State
MOVBQZX R9, R14
SHRQ $0x10, R9
MOVWQZX R9, R9
SHRL $0x10, R9
LEAQ (BX)(R14*1), CX
MOVQ DX, R15
MOVQ CX, BX
Expand Down Expand Up @@ -459,8 +456,7 @@ sequenceDecs_decode_56_amd64_ll_update_zero:

// Update Literal Length State
MOVBQZX DI, R14
SHRQ $0x10, DI
MOVWQZX DI, DI
SHRL $0x10, DI
LEAQ (BX)(R14*1), CX
MOVQ DX, R15
MOVQ CX, BX
Expand All @@ -479,8 +475,7 @@ sequenceDecs_decode_56_amd64_ll_update_zero:

// Update Match Length State
MOVBQZX R8, R14
SHRQ $0x10, R8
MOVWQZX R8, R8
SHRL $0x10, R8
LEAQ (BX)(R14*1), CX
MOVQ DX, R15
MOVQ CX, BX
Expand All @@ -499,8 +494,7 @@ sequenceDecs_decode_56_amd64_ll_update_zero:

// Update Offset State
MOVBQZX R9, R14
SHRQ $0x10, R9
MOVWQZX R9, R9
SHRL $0x10, R9
LEAQ (BX)(R14*1), CX
MOVQ DX, R15
MOVQ CX, BX
Expand Down Expand Up @@ -772,34 +766,31 @@ sequenceDecs_decode_bmi2_fill_2_end:
BZHIQ R14, R15, R15

// Update Offset State
BZHIQ R8, R15, CX
SHRXQ R8, R15, R15
MOVQ $0x00001010, R14
BEXTRQ R14, R8, R8
ADDQ CX, R8
BZHIQ R8, R15, CX
SHRXQ R8, R15, R15
SHRL $0x10, R8
ADDQ CX, R8

// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R8*8), R8

// Update Match Length State
BZHIQ DI, R15, CX
SHRXQ DI, R15, R15
MOVQ $0x00001010, R14
BEXTRQ R14, DI, DI
ADDQ CX, DI
BZHIQ DI, R15, CX
SHRXQ DI, R15, R15
SHRL $0x10, DI
ADDQ CX, DI

// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI

// Update Literal Length State
BZHIQ SI, R15, CX
MOVQ $0x00001010, R14
BEXTRQ R14, SI, SI
ADDQ CX, SI
BZHIQ SI, R15, CX
SHRL $0x10, SI
ADDQ CX, SI

// Load ctx.llTable
MOVQ ctx+16(FP), CX
Expand Down Expand Up @@ -1032,34 +1023,31 @@ sequenceDecs_decode_56_bmi2_fill_end:
BZHIQ R14, R15, R15

// Update Offset State
BZHIQ R8, R15, CX
SHRXQ R8, R15, R15
MOVQ $0x00001010, R14
BEXTRQ R14, R8, R8
ADDQ CX, R8
BZHIQ R8, R15, CX
SHRXQ R8, R15, R15
SHRL $0x10, R8
ADDQ CX, R8

// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R8*8), R8

// Update Match Length State
BZHIQ DI, R15, CX
SHRXQ DI, R15, R15
MOVQ $0x00001010, R14
BEXTRQ R14, DI, DI
ADDQ CX, DI
BZHIQ DI, R15, CX
SHRXQ DI, R15, R15
SHRL $0x10, DI
ADDQ CX, DI

// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI

// Update Literal Length State
BZHIQ SI, R15, CX
MOVQ $0x00001010, R14
BEXTRQ R14, SI, SI
ADDQ CX, SI
BZHIQ SI, R15, CX
SHRL $0x10, SI
ADDQ CX, SI

// Load ctx.llTable
MOVQ ctx+16(FP), CX
Expand Down Expand Up @@ -1967,8 +1955,7 @@ sequenceDecs_decodeSync_amd64_ll_update_zero:

// Update Literal Length State
MOVBQZX DI, R13
SHRQ $0x10, DI
MOVWQZX DI, DI
SHRL $0x10, DI
LEAQ (BX)(R13*1), CX
MOVQ DX, R14
MOVQ CX, BX
Expand All @@ -1987,8 +1974,7 @@ sequenceDecs_decodeSync_amd64_ll_update_zero:

// Update Match Length State
MOVBQZX R8, R13
SHRQ $0x10, R8
MOVWQZX R8, R8
SHRL $0x10, R8
LEAQ (BX)(R13*1), CX
MOVQ DX, R14
MOVQ CX, BX
Expand All @@ -2007,8 +1993,7 @@ sequenceDecs_decodeSync_amd64_ll_update_zero:

// Update Offset State
MOVBQZX R9, R13
SHRQ $0x10, R9
MOVWQZX R9, R9
SHRL $0x10, R9
LEAQ (BX)(R13*1), CX
MOVQ DX, R14
MOVQ CX, BX
Expand Down Expand Up @@ -2514,34 +2499,31 @@ sequenceDecs_decodeSync_bmi2_fill_2_end:
BZHIQ R13, R14, R14

// Update Offset State
BZHIQ R8, R14, CX
SHRXQ R8, R14, R14
MOVQ $0x00001010, R13
BEXTRQ R13, R8, R8
ADDQ CX, R8
BZHIQ R8, R14, CX
SHRXQ R8, R14, R14
SHRL $0x10, R8
ADDQ CX, R8

// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R8*8), R8

// Update Match Length State
BZHIQ DI, R14, CX
SHRXQ DI, R14, R14
MOVQ $0x00001010, R13
BEXTRQ R13, DI, DI
ADDQ CX, DI
BZHIQ DI, R14, CX
SHRXQ DI, R14, R14
SHRL $0x10, DI
ADDQ CX, DI

// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI

// Update Literal Length State
BZHIQ SI, R14, CX
MOVQ $0x00001010, R13
BEXTRQ R13, SI, SI
ADDQ CX, SI
BZHIQ SI, R14, CX
SHRL $0x10, SI
ADDQ CX, SI

// Load ctx.llTable
MOVQ ctx+16(FP), CX
Expand Down Expand Up @@ -3055,8 +3037,7 @@ sequenceDecs_decodeSync_safe_amd64_ll_update_zero:

// Update Literal Length State
MOVBQZX DI, R13
SHRQ $0x10, DI
MOVWQZX DI, DI
SHRL $0x10, DI
LEAQ (BX)(R13*1), CX
MOVQ DX, R14
MOVQ CX, BX
Expand All @@ -3075,8 +3056,7 @@ sequenceDecs_decodeSync_safe_amd64_ll_update_zero:

// Update Match Length State
MOVBQZX R8, R13
SHRQ $0x10, R8
MOVWQZX R8, R8
SHRL $0x10, R8
LEAQ (BX)(R13*1), CX
MOVQ DX, R14
MOVQ CX, BX
Expand All @@ -3095,8 +3075,7 @@ sequenceDecs_decodeSync_safe_amd64_ll_update_zero:

// Update Offset State
MOVBQZX R9, R13
SHRQ $0x10, R9
MOVWQZX R9, R9
SHRL $0x10, R9
LEAQ (BX)(R13*1), CX
MOVQ DX, R14
MOVQ CX, BX
Expand Down Expand Up @@ -3704,34 +3683,31 @@ sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
BZHIQ R13, R14, R14

// Update Offset State
BZHIQ R8, R14, CX
SHRXQ R8, R14, R14
MOVQ $0x00001010, R13
BEXTRQ R13, R8, R8
ADDQ CX, R8
BZHIQ R8, R14, CX
SHRXQ R8, R14, R14
SHRL $0x10, R8
ADDQ CX, R8

// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R8*8), R8

// Update Match Length State
BZHIQ DI, R14, CX
SHRXQ DI, R14, R14
MOVQ $0x00001010, R13
BEXTRQ R13, DI, DI
ADDQ CX, DI
BZHIQ DI, R14, CX
SHRXQ DI, R14, R14
SHRL $0x10, DI
ADDQ CX, DI

// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI

// Update Literal Length State
BZHIQ SI, R14, CX
MOVQ $0x00001010, R13
BEXTRQ R13, SI, SI
ADDQ CX, SI
BZHIQ SI, R14, CX
SHRL $0x10, SI
ADDQ CX, SI

// Load ctx.llTable
MOVQ ctx+16(FP), CX
Expand Down