| // Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT. |
| |
| //go:build !appengine && !noasm && gc
| |
| // func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
| // Requires: CMOV
| //
| // Decodes sequences from the bitstream until the counter at ctx+96 goes
| // negative, writing one 24-byte record per sequence (literal length at +0,
| // match length at +8, match offset at +16) and updating the three FSE
| // states between sequences.  Returns 0 on success, otherwise one of the
| // labelled error codes at the bottom of the function.
| //
| // Register map established by the prologue:
| //   DX = bit buffer (br value), BX = bits consumed, SI = input bytes left,
| //   (SP) = read cursor (the stream is consumed back to front),
| //   DI / R8 / R9 = literal-length / match-length / offset FSE states,
| //   R10 = output pointer into ctx's sequence buffer (advanced by 24),
| //   R11 / R12 / R13 = repeat-offset history, most to least recent.
| // The numeric struct offsets mirror the Go-side layouts of s, br and ctx;
| // confirm against the Go declarations if those types ever change.
| TEXT ·sequenceDecs_decode_amd64(SB), $8-32
| MOVQ br+8(FP), CX
| MOVQ 24(CX), DX
| MOVBQZX 32(CX), BX
| // Read cursor = input base + bytes remaining; cached in the frame slot.
| MOVQ (CX), AX
| MOVQ 8(CX), SI
| ADDQ SI, AX
| MOVQ AX, (SP)
| MOVQ ctx+16(FP), AX
| MOVQ 72(AX), DI
| MOVQ 80(AX), R8
| MOVQ 88(AX), R9
| MOVQ 104(AX), R10
| MOVQ s+0(FP), AX
| MOVQ 144(AX), R11
| MOVQ 152(AX), R12
| MOVQ 160(AX), R13
|
| sequenceDecs_decode_amd64_main_loop:
| MOVQ (SP), R14
|
| // Fill bitreader to have enough for the offset and match length.
| CMPQ SI, $0x08
| JL sequenceDecs_decode_amd64_fill_byte_by_byte
| // Fast path: at least 8 input bytes left — consume whole bytes
| // (bitsRead/8) and reload the 64-bit buffer in one read.
| MOVQ BX, AX
| SHRQ $0x03, AX
| SUBQ AX, R14
| MOVQ (R14), DX
| SUBQ AX, SI
| ANDQ $0x07, BX
| JMP sequenceDecs_decode_amd64_fill_end
|
| sequenceDecs_decode_amd64_fill_byte_by_byte:
| // Slow path near the start of the buffer: pull in one byte at a time
| // while at least 8 bits of the buffer are free.
| CMPQ SI, $0x00
| JLE sequenceDecs_decode_amd64_fill_check_overread
| CMPQ BX, $0x07
| JLE sequenceDecs_decode_amd64_fill_end
| SHLQ $0x08, DX
| SUBQ $0x01, R14
| SUBQ $0x01, SI
| SUBQ $0x08, BX
| MOVBQZX (R14), AX
| ORQ AX, DX
| JMP sequenceDecs_decode_amd64_fill_byte_by_byte
|
| sequenceDecs_decode_amd64_fill_check_overread:
| // Input exhausted: more than 64 bits consumed means the stream was overread.
| CMPQ BX, $0x40
| JA error_overread
|
| sequenceDecs_decode_amd64_fill_end:
| // Update offset
| // State packing: (state>>32) is the baseline value and AH (byte 1 of the
| // state) the extra-bit count — presumably the FSE decode-table layout;
| // confirm against the Go-side table builder.  The extra bits are taken
| // from the top of the left-shifted bit buffer.
| MOVQ R9, AX
| MOVQ BX, CX
| MOVQ DX, R15
| SHLQ CL, R15
| MOVB AH, CL
| SHRQ $0x20, AX
| TESTQ CX, CX
| JZ sequenceDecs_decode_amd64_of_update_zero
| ADDQ CX, BX
| CMPQ BX, $0x40
| JA sequenceDecs_decode_amd64_of_update_zero
| CMPQ CX, $0x40
| JAE sequenceDecs_decode_amd64_of_update_zero
| NEGQ CX
| SHRQ CL, R15
| ADDQ R15, AX
|
| sequenceDecs_decode_amd64_of_update_zero:
| MOVQ AX, 16(R10)
|
| // Update match length
| MOVQ R8, AX
| MOVQ BX, CX
| MOVQ DX, R15
| SHLQ CL, R15
| MOVB AH, CL
| SHRQ $0x20, AX
| TESTQ CX, CX
| JZ sequenceDecs_decode_amd64_ml_update_zero
| ADDQ CX, BX
| CMPQ BX, $0x40
| JA sequenceDecs_decode_amd64_ml_update_zero
| CMPQ CX, $0x40
| JAE sequenceDecs_decode_amd64_ml_update_zero
| NEGQ CX
| SHRQ CL, R15
| ADDQ R15, AX
|
| sequenceDecs_decode_amd64_ml_update_zero:
| MOVQ AX, 8(R10)
|
| // Fill bitreader to have enough for the remaining
| CMPQ SI, $0x08
| JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
| MOVQ BX, AX
| SHRQ $0x03, AX
| SUBQ AX, R14
| MOVQ (R14), DX
| SUBQ AX, SI
| ANDQ $0x07, BX
| JMP sequenceDecs_decode_amd64_fill_2_end
|
| sequenceDecs_decode_amd64_fill_2_byte_by_byte:
| CMPQ SI, $0x00
| JLE sequenceDecs_decode_amd64_fill_2_check_overread
| CMPQ BX, $0x07
| JLE sequenceDecs_decode_amd64_fill_2_end
| SHLQ $0x08, DX
| SUBQ $0x01, R14
| SUBQ $0x01, SI
| SUBQ $0x08, BX
| MOVBQZX (R14), AX
| ORQ AX, DX
| JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte
|
| sequenceDecs_decode_amd64_fill_2_check_overread:
| CMPQ BX, $0x40
| JA error_overread
|
| sequenceDecs_decode_amd64_fill_2_end:
| // Update literal length
| MOVQ DI, AX
| MOVQ BX, CX
| MOVQ DX, R15
| SHLQ CL, R15
| MOVB AH, CL
| SHRQ $0x20, AX
| TESTQ CX, CX
| JZ sequenceDecs_decode_amd64_ll_update_zero
| ADDQ CX, BX
| CMPQ BX, $0x40
| JA sequenceDecs_decode_amd64_ll_update_zero
| CMPQ CX, $0x40
| JAE sequenceDecs_decode_amd64_ll_update_zero
| NEGQ CX
| SHRQ CL, R15
| ADDQ R15, AX
|
| sequenceDecs_decode_amd64_ll_update_zero:
| MOVQ AX, (R10)
|
| // Fill bitreader for state updates
| MOVQ R14, (SP)
| // AX = second byte of the offset state (the offset code's bit count);
| // kept live across the state updates and used by the adjust step below.
| MOVQ R9, AX
| SHRQ $0x08, AX
| MOVBQZX AL, AX
| MOVQ ctx+16(FP), CX
| CMPQ 96(CX), $0x00
| JZ sequenceDecs_decode_amd64_skip_update
|
| // Update Literal Length State
| MOVBQZX DI, R14
| SHRL $0x10, DI
| LEAQ (BX)(R14*1), CX
| MOVQ DX, R15
| MOVQ CX, BX
| ROLQ CL, R15
| MOVL $0x00000001, BP
| MOVB R14, CL
| SHLL CL, BP
| DECL BP
| ANDQ BP, R15
| ADDQ R15, DI
|
| // Load ctx.llTable
| MOVQ ctx+16(FP), CX
| MOVQ (CX), CX
| MOVQ (CX)(DI*8), DI
|
| // Update Match Length State
| MOVBQZX R8, R14
| SHRL $0x10, R8
| LEAQ (BX)(R14*1), CX
| MOVQ DX, R15
| MOVQ CX, BX
| ROLQ CL, R15
| MOVL $0x00000001, BP
| MOVB R14, CL
| SHLL CL, BP
| DECL BP
| ANDQ BP, R15
| ADDQ R15, R8
|
| // Update Offset State
| MOVBQZX R9, R14
| SHRL $0x10, R9
| LEAQ (BX)(R14*1), CX
| MOVQ DX, R15
| MOVQ CX, BX
| ROLQ CL, R15
| MOVL $0x00000001, BP
| MOVB R14, CL
| SHLL CL, BP
| DECL BP
| ANDQ BP, R15
| ADDQ R15, R9
|
| // Load ctx.ofTable
| MOVQ ctx+16(FP), CX
| MOVQ 48(CX), CX
| MOVQ (CX)(R9*8), R9
|
| sequenceDecs_decode_amd64_skip_update:
| // Adjust offset
| // Repeat-offset handling: small offset codes (AX <= 1) address the
| // repeat history in R11/R12/R13, with the meaning shifted by one when
| // the literal length is zero — NOTE(review): matches the zstd
| // repeat-offset rules (RFC 8878); confirm against the spec.
| MOVQ 16(R10), CX
| CMPQ AX, $0x01
| JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
| MOVQ R12, R13
| MOVQ R11, R12
| MOVQ CX, R11
| JMP sequenceDecs_decode_amd64_after_adjust
|
| sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
| CMPQ (R10), $0x00000000
| JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
| INCQ CX
| JMP sequenceDecs_decode_amd64_adjust_offset_nonzero
|
| sequenceDecs_decode_amd64_adjust_offset_maybezero:
| TESTQ CX, CX
| JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
| MOVQ R11, CX
| JMP sequenceDecs_decode_amd64_after_adjust
|
| sequenceDecs_decode_amd64_adjust_offset_nonzero:
| CMPQ CX, $0x01
| JB sequenceDecs_decode_amd64_adjust_zero
| JEQ sequenceDecs_decode_amd64_adjust_one
| CMPQ CX, $0x02
| JA sequenceDecs_decode_amd64_adjust_three
| JMP sequenceDecs_decode_amd64_adjust_two
|
| sequenceDecs_decode_amd64_adjust_zero:
| MOVQ R11, AX
| JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
|
| sequenceDecs_decode_amd64_adjust_one:
| MOVQ R12, AX
| JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
|
| sequenceDecs_decode_amd64_adjust_two:
| MOVQ R13, AX
| JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
|
| sequenceDecs_decode_amd64_adjust_three:
| LEAQ -1(R11), AX
|
| sequenceDecs_decode_amd64_adjust_test_temp_valid:
| // A resolved offset of zero is clamped to one before it enters the history.
| TESTQ AX, AX
| JNZ sequenceDecs_decode_amd64_adjust_temp_valid
| MOVQ $0x00000001, AX
|
| sequenceDecs_decode_amd64_adjust_temp_valid:
| CMPQ CX, $0x01
| CMOVQNE R12, R13
| MOVQ R11, R12
| MOVQ AX, R11
| MOVQ AX, CX
|
| sequenceDecs_decode_amd64_after_adjust:
| MOVQ CX, 16(R10)
|
| // Check values
| // Accumulate ll+ml into s (offset 256), deduct the literal length from
| // the remaining-literals counter in ctx (offset 128), then validate
| // match length and the ml/offset-zero pairing.
| MOVQ 8(R10), AX
| MOVQ (R10), R14
| LEAQ (AX)(R14*1), R15
| MOVQ s+0(FP), BP
| ADDQ R15, 256(BP)
| MOVQ ctx+16(FP), R15
| SUBQ R14, 128(R15)
| JS error_not_enough_literals
| CMPQ AX, $0x00020002
| JA sequenceDecs_decode_amd64_error_match_len_too_big
| TESTQ CX, CX
| JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
| TESTQ AX, AX
| JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch
|
| sequenceDecs_decode_amd64_match_len_ofs_ok:
| // Advance to the next 24-byte output record; loop while the counter
| // at ctx+96 stays non-negative, then flush state back to s and br.
| ADDQ $0x18, R10
| MOVQ ctx+16(FP), AX
| DECQ 96(AX)
| JNS sequenceDecs_decode_amd64_main_loop
| MOVQ s+0(FP), AX
| MOVQ R11, 144(AX)
| MOVQ R12, 152(AX)
| MOVQ R13, 160(AX)
| MOVQ br+8(FP), AX
| MOVQ DX, 24(AX)
| MOVB BL, 32(AX)
| MOVQ SI, 8(AX)
|
| // Return success
| MOVQ $0x00000000, ret+24(FP)
| RET
|
| // Return with match length error
| sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
| MOVQ $0x00000001, ret+24(FP)
| RET
|
| // Return with match too long error
| sequenceDecs_decode_amd64_error_match_len_too_big:
| MOVQ $0x00000002, ret+24(FP)
| RET
|
| // Return with match offset too long error
| // NOTE(review): unreachable — no label precedes this return and the
| // path above ends in RET; emitted by the generator for an error the
| // assembly never takes.
| MOVQ $0x00000003, ret+24(FP)
| RET
|
| // Return with not enough literals error
| error_not_enough_literals:
| MOVQ $0x00000004, ret+24(FP)
| RET
|
| // Return with overread error
| error_overread:
| MOVQ $0x00000006, ret+24(FP)
| RET
| |
| // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
| // Requires: CMOV
| //
| // Variant of the generic decoder that refills the bit buffer only ONCE per
| // sequence (before all three field reads) instead of twice — usable when a
| // whole sequence is guaranteed to fit in the buffer, presumably <= 56 bits
| // as the name suggests; confirm against the Go-side caller's selection
| // logic.  Otherwise identical: decodes sequences until the counter at
| // ctx+96 goes negative, writing 24-byte records (ll/ml/mo at +0/+8/+16)
| // and updating the three FSE states.  Returns 0 or a labelled error code.
| //
| // Register map: DX = bit buffer, BX = bits consumed, SI = input bytes
| // left, (SP) = backwards read cursor, DI/R8/R9 = ll/ml/of FSE states,
| // R10 = sequence output pointer, R11/R12/R13 = repeat-offset history.
| TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
| MOVQ br+8(FP), CX
| MOVQ 24(CX), DX
| MOVBQZX 32(CX), BX
| // Read cursor = input base + bytes remaining.
| MOVQ (CX), AX
| MOVQ 8(CX), SI
| ADDQ SI, AX
| MOVQ AX, (SP)
| MOVQ ctx+16(FP), AX
| MOVQ 72(AX), DI
| MOVQ 80(AX), R8
| MOVQ 88(AX), R9
| MOVQ 104(AX), R10
| MOVQ s+0(FP), AX
| MOVQ 144(AX), R11
| MOVQ 152(AX), R12
| MOVQ 160(AX), R13
|
| sequenceDecs_decode_56_amd64_main_loop:
| MOVQ (SP), R14
|
| // Fill bitreader to have enough for the offset and match length.
| CMPQ SI, $0x08
| JL sequenceDecs_decode_56_amd64_fill_byte_by_byte
| // Fast path: consume whole bytes and reload the buffer in one 8-byte read.
| MOVQ BX, AX
| SHRQ $0x03, AX
| SUBQ AX, R14
| MOVQ (R14), DX
| SUBQ AX, SI
| ANDQ $0x07, BX
| JMP sequenceDecs_decode_56_amd64_fill_end
|
| sequenceDecs_decode_56_amd64_fill_byte_by_byte:
| // Slow path near the start of the input: one byte at a time.
| CMPQ SI, $0x00
| JLE sequenceDecs_decode_56_amd64_fill_check_overread
| CMPQ BX, $0x07
| JLE sequenceDecs_decode_56_amd64_fill_end
| SHLQ $0x08, DX
| SUBQ $0x01, R14
| SUBQ $0x01, SI
| SUBQ $0x08, BX
| MOVBQZX (R14), AX
| ORQ AX, DX
| JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte
|
| sequenceDecs_decode_56_amd64_fill_check_overread:
| CMPQ BX, $0x40
| JA error_overread
|
| sequenceDecs_decode_56_amd64_fill_end:
| // Update offset
| // (state>>32) is the baseline, AH the extra-bit count; bits come from
| // the top of the left-shifted bit buffer.
| MOVQ R9, AX
| MOVQ BX, CX
| MOVQ DX, R15
| SHLQ CL, R15
| MOVB AH, CL
| SHRQ $0x20, AX
| TESTQ CX, CX
| JZ sequenceDecs_decode_56_amd64_of_update_zero
| ADDQ CX, BX
| CMPQ BX, $0x40
| JA sequenceDecs_decode_56_amd64_of_update_zero
| CMPQ CX, $0x40
| JAE sequenceDecs_decode_56_amd64_of_update_zero
| NEGQ CX
| SHRQ CL, R15
| ADDQ R15, AX
|
| sequenceDecs_decode_56_amd64_of_update_zero:
| MOVQ AX, 16(R10)
|
| // Update match length
| MOVQ R8, AX
| MOVQ BX, CX
| MOVQ DX, R15
| SHLQ CL, R15
| MOVB AH, CL
| SHRQ $0x20, AX
| TESTQ CX, CX
| JZ sequenceDecs_decode_56_amd64_ml_update_zero
| ADDQ CX, BX
| CMPQ BX, $0x40
| JA sequenceDecs_decode_56_amd64_ml_update_zero
| CMPQ CX, $0x40
| JAE sequenceDecs_decode_56_amd64_ml_update_zero
| NEGQ CX
| SHRQ CL, R15
| ADDQ R15, AX
|
| sequenceDecs_decode_56_amd64_ml_update_zero:
| MOVQ AX, 8(R10)
|
| // Update literal length
| // No second refill here — this is what distinguishes the _56_ variant.
| MOVQ DI, AX
| MOVQ BX, CX
| MOVQ DX, R15
| SHLQ CL, R15
| MOVB AH, CL
| SHRQ $0x20, AX
| TESTQ CX, CX
| JZ sequenceDecs_decode_56_amd64_ll_update_zero
| ADDQ CX, BX
| CMPQ BX, $0x40
| JA sequenceDecs_decode_56_amd64_ll_update_zero
| CMPQ CX, $0x40
| JAE sequenceDecs_decode_56_amd64_ll_update_zero
| NEGQ CX
| SHRQ CL, R15
| ADDQ R15, AX
|
| sequenceDecs_decode_56_amd64_ll_update_zero:
| MOVQ AX, (R10)
|
| // Fill bitreader for state updates
| MOVQ R14, (SP)
| // AX = second byte of the offset state (the offset code's bit count),
| // used by the adjust step below.
| MOVQ R9, AX
| SHRQ $0x08, AX
| MOVBQZX AL, AX
| MOVQ ctx+16(FP), CX
| CMPQ 96(CX), $0x00
| JZ sequenceDecs_decode_56_amd64_skip_update
|
| // Update Literal Length State
| MOVBQZX DI, R14
| SHRL $0x10, DI
| LEAQ (BX)(R14*1), CX
| MOVQ DX, R15
| MOVQ CX, BX
| ROLQ CL, R15
| MOVL $0x00000001, BP
| MOVB R14, CL
| SHLL CL, BP
| DECL BP
| ANDQ BP, R15
| ADDQ R15, DI
|
| // Load ctx.llTable
| MOVQ ctx+16(FP), CX
| MOVQ (CX), CX
| MOVQ (CX)(DI*8), DI
|
| // Update Match Length State
| MOVBQZX R8, R14
| SHRL $0x10, R8
| LEAQ (BX)(R14*1), CX
| MOVQ DX, R15
| MOVQ CX, BX
| ROLQ CL, R15
| MOVL $0x00000001, BP
| MOVB R14, CL
| SHLL CL, BP
| DECL BP
| ANDQ BP, R15
| ADDQ R15, R8
|
| // Load ctx.mlTable
| MOVQ ctx+16(FP), CX
| MOVQ 24(CX), CX
| MOVQ (CX)(R8*8), R8
|
| // Update Offset State
| MOVBQZX R9, R14
| SHRL $0x10, R9
| LEAQ (BX)(R14*1), CX
| MOVQ DX, R15
| MOVQ CX, BX
| ROLQ CL, R15
| MOVL $0x00000001, BP
| MOVB R14, CL
| SHLL CL, BP
| DECL BP
| ANDQ BP, R15
| ADDQ R15, R9
|
| // Load ctx.ofTable
| MOVQ ctx+16(FP), CX
| MOVQ 48(CX), CX
| MOVQ (CX)(R9*8), R9
|
| sequenceDecs_decode_56_amd64_skip_update:
| // Adjust offset
| // Repeat-offset resolution against R11/R12/R13 (shifted by one when the
| // literal length is zero) — matches zstd repeat-offset rules; confirm
| // against RFC 8878.
| MOVQ 16(R10), CX
| CMPQ AX, $0x01
| JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
| MOVQ R12, R13
| MOVQ R11, R12
| MOVQ CX, R11
| JMP sequenceDecs_decode_56_amd64_after_adjust
|
| sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
| CMPQ (R10), $0x00000000
| JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
| INCQ CX
| JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero
|
| sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
| TESTQ CX, CX
| JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
| MOVQ R11, CX
| JMP sequenceDecs_decode_56_amd64_after_adjust
|
| sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
| CMPQ CX, $0x01
| JB sequenceDecs_decode_56_amd64_adjust_zero
| JEQ sequenceDecs_decode_56_amd64_adjust_one
| CMPQ CX, $0x02
| JA sequenceDecs_decode_56_amd64_adjust_three
| JMP sequenceDecs_decode_56_amd64_adjust_two
|
| sequenceDecs_decode_56_amd64_adjust_zero:
| MOVQ R11, AX
| JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
|
| sequenceDecs_decode_56_amd64_adjust_one:
| MOVQ R12, AX
| JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
|
| sequenceDecs_decode_56_amd64_adjust_two:
| MOVQ R13, AX
| JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
|
| sequenceDecs_decode_56_amd64_adjust_three:
| LEAQ -1(R11), AX
|
| sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
| // A resolved offset of zero is clamped to one.
| TESTQ AX, AX
| JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
| MOVQ $0x00000001, AX
|
| sequenceDecs_decode_56_amd64_adjust_temp_valid:
| CMPQ CX, $0x01
| CMOVQNE R12, R13
| MOVQ R11, R12
| MOVQ AX, R11
| MOVQ AX, CX
|
| sequenceDecs_decode_56_amd64_after_adjust:
| MOVQ CX, 16(R10)
|
| // Check values
| // Accumulate ll+ml into s+256, deduct ll from the literal budget at
| // ctx+128, then validate match length and ml/offset-zero pairing.
| MOVQ 8(R10), AX
| MOVQ (R10), R14
| LEAQ (AX)(R14*1), R15
| MOVQ s+0(FP), BP
| ADDQ R15, 256(BP)
| MOVQ ctx+16(FP), R15
| SUBQ R14, 128(R15)
| JS error_not_enough_literals
| CMPQ AX, $0x00020002
| JA sequenceDecs_decode_56_amd64_error_match_len_too_big
| TESTQ CX, CX
| JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
| TESTQ AX, AX
| JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch
|
| sequenceDecs_decode_56_amd64_match_len_ofs_ok:
| // Next 24-byte record; loop while ctx+96 stays non-negative, then flush
| // repeat offsets and bitreader state back to s and br.
| ADDQ $0x18, R10
| MOVQ ctx+16(FP), AX
| DECQ 96(AX)
| JNS sequenceDecs_decode_56_amd64_main_loop
| MOVQ s+0(FP), AX
| MOVQ R11, 144(AX)
| MOVQ R12, 152(AX)
| MOVQ R13, 160(AX)
| MOVQ br+8(FP), AX
| MOVQ DX, 24(AX)
| MOVB BL, 32(AX)
| MOVQ SI, 8(AX)
|
| // Return success
| MOVQ $0x00000000, ret+24(FP)
| RET
|
| // Return with match length error
| sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
| MOVQ $0x00000001, ret+24(FP)
| RET
|
| // Return with match too long error
| sequenceDecs_decode_56_amd64_error_match_len_too_big:
| MOVQ $0x00000002, ret+24(FP)
| RET
|
| // Return with match offset too long error
| // NOTE(review): unreachable — nothing jumps here and the path above
| // ends in RET; emitted by the generator.
| MOVQ $0x00000003, ret+24(FP)
| RET
|
| // Return with not enough literals error
| error_not_enough_literals:
| MOVQ $0x00000004, ret+24(FP)
| RET
|
| // Return with overread error
| error_overread:
| MOVQ $0x00000006, ret+24(FP)
| RET
| |
<doc_update>
| // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
| // Requires: BMI, BMI2, CMOV
| //
| // BMI2 variant of the sequence decoder: uses BEXTR/BZHI/SHRX to extract
| // bit fields branchlessly, and fuses the three FSE state updates into a
| // single bit extraction.  Same contract as the generic version: decodes
| // until the counter at ctx+96 goes negative, writing 24-byte records
| // (ll/ml/mo at +0/+8/+16); returns 0 or a labelled error code.
| //
| // Register map (different from the generic version): AX = bit buffer,
| // DX = bits consumed, BX = input bytes left, (SP) = backwards read
| // cursor, SI/DI/R8 = ll/ml/of FSE states, R9 = sequence output pointer,
| // R10/R11/R12 = repeat-offset history, R13 = scratch / offset bit count.
| TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
| MOVQ br+8(FP), BX
| MOVQ 24(BX), AX
| MOVBQZX 32(BX), DX
| // Read cursor = input base + bytes remaining.
| MOVQ (BX), CX
| MOVQ 8(BX), BX
| ADDQ BX, CX
| MOVQ CX, (SP)
| MOVQ ctx+16(FP), CX
| MOVQ 72(CX), SI
| MOVQ 80(CX), DI
| MOVQ 88(CX), R8
| MOVQ 104(CX), R9
| MOVQ s+0(FP), CX
| MOVQ 144(CX), R10
| MOVQ 152(CX), R11
| MOVQ 160(CX), R12
|
| sequenceDecs_decode_bmi2_main_loop:
| MOVQ (SP), R13
|
| // Fill bitreader to have enough for the offset and match length.
| CMPQ BX, $0x08
| JL sequenceDecs_decode_bmi2_fill_byte_by_byte
| // Fast path: consume whole bytes and reload the buffer in one 8-byte read.
| MOVQ DX, CX
| SHRQ $0x03, CX
| SUBQ CX, R13
| MOVQ (R13), AX
| SUBQ CX, BX
| ANDQ $0x07, DX
| JMP sequenceDecs_decode_bmi2_fill_end
|
| sequenceDecs_decode_bmi2_fill_byte_by_byte:
| // Slow path near the start of the input: one byte at a time.
| CMPQ BX, $0x00
| JLE sequenceDecs_decode_bmi2_fill_check_overread
| CMPQ DX, $0x07
| JLE sequenceDecs_decode_bmi2_fill_end
| SHLQ $0x08, AX
| SUBQ $0x01, R13
| SUBQ $0x01, BX
| SUBQ $0x08, DX
| MOVBQZX (R13), CX
| ORQ CX, AX
| JMP sequenceDecs_decode_bmi2_fill_byte_by_byte
|
| sequenceDecs_decode_bmi2_fill_check_overread:
| CMPQ DX, $0x40
| JA error_overread
|
| sequenceDecs_decode_bmi2_fill_end:
| // Update offset
| // BEXTR control 0x0808 = extract 8 bits starting at bit 8, i.e. byte 1
| // of the state: the extra-bit count.  ROLQ+BZHI pulls that many bits off
| // the top of the buffer; (state>>32) supplies the baseline.
| MOVQ $0x00000808, CX
| BEXTRQ CX, R8, R14
| MOVQ AX, R15
| LEAQ (DX)(R14*1), CX
| ROLQ CL, R15
| BZHIQ R14, R15, R15
| MOVQ CX, DX
| MOVQ R8, CX
| SHRQ $0x20, CX
| ADDQ R15, CX
| MOVQ CX, 16(R9)
|
| // Update match length
| MOVQ $0x00000808, CX
| BEXTRQ CX, DI, R14
| MOVQ AX, R15
| LEAQ (DX)(R14*1), CX
| ROLQ CL, R15
| BZHIQ R14, R15, R15
| MOVQ CX, DX
| MOVQ DI, CX
| SHRQ $0x20, CX
| ADDQ R15, CX
| MOVQ CX, 8(R9)
|
| // Fill bitreader to have enough for the remaining
| CMPQ BX, $0x08
| JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
| MOVQ DX, CX
| SHRQ $0x03, CX
| SUBQ CX, R13
| MOVQ (R13), AX
| SUBQ CX, BX
| ANDQ $0x07, DX
| JMP sequenceDecs_decode_bmi2_fill_2_end
|
| sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
| CMPQ BX, $0x00
| JLE sequenceDecs_decode_bmi2_fill_2_check_overread
| CMPQ DX, $0x07
| JLE sequenceDecs_decode_bmi2_fill_2_end
| SHLQ $0x08, AX
| SUBQ $0x01, R13
| SUBQ $0x01, BX
| SUBQ $0x08, DX
| MOVBQZX (R13), CX
| ORQ CX, AX
| JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte
|
| sequenceDecs_decode_bmi2_fill_2_check_overread:
| CMPQ DX, $0x40
| JA error_overread
|
| sequenceDecs_decode_bmi2_fill_2_end:
| // Update literal length
| MOVQ $0x00000808, CX
| BEXTRQ CX, SI, R14
| MOVQ AX, R15
| LEAQ (DX)(R14*1), CX
| ROLQ CL, R15
| BZHIQ R14, R15, R15
| MOVQ CX, DX
| MOVQ SI, CX
| SHRQ $0x20, CX
| ADDQ R15, CX
| MOVQ CX, (R9)
|
| // Fill bitreader for state updates
| MOVQ R13, (SP)
| // R13 = byte 1 of the offset state (the offset code's bit count); used
| // by the adjust step below.
| MOVQ $0x00000808, CX
| BEXTRQ CX, R8, R13
| MOVQ ctx+16(FP), CX
| CMPQ 96(CX), $0x00
| JZ sequenceDecs_decode_bmi2_skip_update
| // Fused refill for all three state updates: read the summed bit widths
| // of the three states in one extraction, then slice R15 per state below.
| LEAQ (SI)(DI*1), R14
| ADDQ R8, R14
| MOVBQZX R14, R14
| LEAQ (DX)(R14*1), CX
| MOVQ AX, R15
| MOVQ CX, DX
| ROLQ CL, R15
| BZHIQ R14, R15, R15
|
| // Update Offset State
| BZHIQ R8, R15, CX
| SHRXQ R8, R15, R15
| SHRL $0x10, R8
| ADDQ CX, R8
|
| // Load ctx.ofTable
| MOVQ ctx+16(FP), CX
| MOVQ 48(CX), CX
| MOVQ (CX)(R8*8), R8
|
| // Update Match Length State
| BZHIQ DI, R15, CX
| SHRXQ DI, R15, R15
| SHRL $0x10, DI
| ADDQ CX, DI
|
| // Load ctx.mlTable
| MOVQ ctx+16(FP), CX
| MOVQ 24(CX), CX
| MOVQ (CX)(DI*8), DI
|
| // Update Literal Length State
| BZHIQ SI, R15, CX
| SHRL $0x10, SI
| ADDQ CX, SI
|
| // Load ctx.llTable
| MOVQ ctx+16(FP), CX
| MOVQ (CX), CX
| MOVQ (CX)(SI*8), SI
|
| sequenceDecs_decode_bmi2_skip_update:
| // Adjust offset
| // Repeat-offset resolution against R10/R11/R12 (shifted by one when the
| // literal length is zero) — matches zstd repeat-offset rules; confirm
| // against RFC 8878.
| MOVQ 16(R9), CX
| CMPQ R13, $0x01
| JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
| MOVQ R11, R12
| MOVQ R10, R11
| MOVQ CX, R10
| JMP sequenceDecs_decode_bmi2_after_adjust
|
| sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
| CMPQ (R9), $0x00000000
| JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
| INCQ CX
| JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero
|
| sequenceDecs_decode_bmi2_adjust_offset_maybezero:
| TESTQ CX, CX
| JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
| MOVQ R10, CX
| JMP sequenceDecs_decode_bmi2_after_adjust
|
| sequenceDecs_decode_bmi2_adjust_offset_nonzero:
| CMPQ CX, $0x01
| JB sequenceDecs_decode_bmi2_adjust_zero
| JEQ sequenceDecs_decode_bmi2_adjust_one
| CMPQ CX, $0x02
| JA sequenceDecs_decode_bmi2_adjust_three
| JMP sequenceDecs_decode_bmi2_adjust_two
|
| sequenceDecs_decode_bmi2_adjust_zero:
| MOVQ R10, R13
| JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
|
| sequenceDecs_decode_bmi2_adjust_one:
| MOVQ R11, R13
| JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
|
| sequenceDecs_decode_bmi2_adjust_two:
| MOVQ R12, R13
| JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
|
| sequenceDecs_decode_bmi2_adjust_three:
| LEAQ -1(R10), R13
|
| sequenceDecs_decode_bmi2_adjust_test_temp_valid:
| // A resolved offset of zero is clamped to one.
| TESTQ R13, R13
| JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
| MOVQ $0x00000001, R13
|
| sequenceDecs_decode_bmi2_adjust_temp_valid:
| CMPQ CX, $0x01
| CMOVQNE R11, R12
| MOVQ R10, R11
| MOVQ R13, R10
| MOVQ R13, CX
|
| sequenceDecs_decode_bmi2_after_adjust:
| MOVQ CX, 16(R9)
|
| // Check values
| // Accumulate ll+ml into s+256, deduct ll from the literal budget at
| // ctx+128, then validate match length and ml/offset-zero pairing.
| MOVQ 8(R9), R13
| MOVQ (R9), R14
| LEAQ (R13)(R14*1), R15
| MOVQ s+0(FP), BP
| ADDQ R15, 256(BP)
| MOVQ ctx+16(FP), R15
| SUBQ R14, 128(R15)
| JS error_not_enough_literals
| CMPQ R13, $0x00020002
| JA sequenceDecs_decode_bmi2_error_match_len_too_big
| TESTQ CX, CX
| JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
| TESTQ R13, R13
| JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch
|
| sequenceDecs_decode_bmi2_match_len_ofs_ok:
| // Next 24-byte record; loop while ctx+96 stays non-negative, then flush
| // repeat offsets and bitreader state back to s and br.
| ADDQ $0x18, R9
| MOVQ ctx+16(FP), CX
| DECQ 96(CX)
| JNS sequenceDecs_decode_bmi2_main_loop
| MOVQ s+0(FP), CX
| MOVQ R10, 144(CX)
| MOVQ R11, 152(CX)
| MOVQ R12, 160(CX)
| MOVQ br+8(FP), CX
| MOVQ AX, 24(CX)
| MOVB DL, 32(CX)
| MOVQ BX, 8(CX)
|
| // Return success
| MOVQ $0x00000000, ret+24(FP)
| RET
|
| // Return with match length error
| sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
| MOVQ $0x00000001, ret+24(FP)
| RET
|
| // Return with match too long error
| sequenceDecs_decode_bmi2_error_match_len_too_big:
| MOVQ $0x00000002, ret+24(FP)
| RET
|
| // Return with match offset too long error
| // NOTE(review): unreachable — nothing jumps here and the path above
| // ends in RET; emitted by the generator.
| MOVQ $0x00000003, ret+24(FP)
| RET
|
| // Return with not enough literals error
| error_not_enough_literals:
| MOVQ $0x00000004, ret+24(FP)
| RET
|
| // Return with overread error
| error_overread:
| MOVQ $0x00000006, ret+24(FP)
| RET
| |
| // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
| // Requires: BMI, BMI2, CMOV
| //
| // BMI2 + single-refill variant: like sequenceDecs_decode_bmi2 but refills
| // the bit buffer only once per sequence, usable when a whole sequence is
| // guaranteed to fit in the buffer (presumably <= 56 bits, per the name;
| // confirm against the Go-side caller's selection logic).  Decodes until
| // the counter at ctx+96 goes negative, writing 24-byte records
| // (ll/ml/mo at +0/+8/+16); returns 0 or a labelled error code.
| //
| // Register map: AX = bit buffer, DX = bits consumed, BX = input bytes
| // left, (SP) = backwards read cursor, SI/DI/R8 = ll/ml/of FSE states,
| // R9 = sequence output pointer, R10/R11/R12 = repeat-offset history,
| // R13 = scratch / offset bit count.
| TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
| MOVQ br+8(FP), BX
| MOVQ 24(BX), AX
| MOVBQZX 32(BX), DX
| // Read cursor = input base + bytes remaining.
| MOVQ (BX), CX
| MOVQ 8(BX), BX
| ADDQ BX, CX
| MOVQ CX, (SP)
| MOVQ ctx+16(FP), CX
| MOVQ 72(CX), SI
| MOVQ 80(CX), DI
| MOVQ 88(CX), R8
| MOVQ 104(CX), R9
| MOVQ s+0(FP), CX
| MOVQ 144(CX), R10
| MOVQ 152(CX), R11
| MOVQ 160(CX), R12
|
| sequenceDecs_decode_56_bmi2_main_loop:
| MOVQ (SP), R13
|
| // Fill bitreader to have enough for the offset and match length.
| CMPQ BX, $0x08
| JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte
| // Fast path: consume whole bytes and reload the buffer in one 8-byte read.
| MOVQ DX, CX
| SHRQ $0x03, CX
| SUBQ CX, R13
| MOVQ (R13), AX
| SUBQ CX, BX
| ANDQ $0x07, DX
| JMP sequenceDecs_decode_56_bmi2_fill_end
|
| sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
| // Slow path near the start of the input: one byte at a time.
| CMPQ BX, $0x00
| JLE sequenceDecs_decode_56_bmi2_fill_check_overread
| CMPQ DX, $0x07
| JLE sequenceDecs_decode_56_bmi2_fill_end
| SHLQ $0x08, AX
| SUBQ $0x01, R13
| SUBQ $0x01, BX
| SUBQ $0x08, DX
| MOVBQZX (R13), CX
| ORQ CX, AX
| JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte
|
| sequenceDecs_decode_56_bmi2_fill_check_overread:
| CMPQ DX, $0x40
| JA error_overread
|
| sequenceDecs_decode_56_bmi2_fill_end:
| // Update offset
| // BEXTR control 0x0808 = extract byte 1 of the state (extra-bit count);
| // ROLQ+BZHI pulls the bits off the top of the buffer, (state>>32) is
| // the baseline.
| MOVQ $0x00000808, CX
| BEXTRQ CX, R8, R14
| MOVQ AX, R15
| LEAQ (DX)(R14*1), CX
| ROLQ CL, R15
| BZHIQ R14, R15, R15
| MOVQ CX, DX
| MOVQ R8, CX
| SHRQ $0x20, CX
| ADDQ R15, CX
| MOVQ CX, 16(R9)
|
| // Update match length
| MOVQ $0x00000808, CX
| BEXTRQ CX, DI, R14
| MOVQ AX, R15
| LEAQ (DX)(R14*1), CX
| ROLQ CL, R15
| BZHIQ R14, R15, R15
| MOVQ CX, DX
| MOVQ DI, CX
| SHRQ $0x20, CX
| ADDQ R15, CX
| MOVQ CX, 8(R9)
|
| // Update literal length
| // No second refill — this is what distinguishes the _56_ variant.
| MOVQ $0x00000808, CX
| BEXTRQ CX, SI, R14
| MOVQ AX, R15
| LEAQ (DX)(R14*1), CX
| ROLQ CL, R15
| BZHIQ R14, R15, R15
| MOVQ CX, DX
| MOVQ SI, CX
| SHRQ $0x20, CX
| ADDQ R15, CX
| MOVQ CX, (R9)
|
| // Fill bitreader for state updates
| MOVQ R13, (SP)
| // R13 = byte 1 of the offset state (the offset code's bit count).
| MOVQ $0x00000808, CX
| BEXTRQ CX, R8, R13
| MOVQ ctx+16(FP), CX
| CMPQ 96(CX), $0x00
| JZ sequenceDecs_decode_56_bmi2_skip_update
| // Fused refill for all three state updates: read the summed bit widths
| // in one extraction, then slice R15 per state below.
| LEAQ (SI)(DI*1), R14
| ADDQ R8, R14
| MOVBQZX R14, R14
| LEAQ (DX)(R14*1), CX
| MOVQ AX, R15
| MOVQ CX, DX
| ROLQ CL, R15
| BZHIQ R14, R15, R15
|
| // Update Offset State
| BZHIQ R8, R15, CX
| SHRXQ R8, R15, R15
| SHRL $0x10, R8
| ADDQ CX, R8
|
| // Load ctx.ofTable
| MOVQ ctx+16(FP), CX
| MOVQ 48(CX), CX
| MOVQ (CX)(R8*8), R8
|
| // Update Match Length State
| BZHIQ DI, R15, CX
| SHRXQ DI, R15, R15
| SHRL $0x10, DI
| ADDQ CX, DI
|
| // Load ctx.mlTable
| MOVQ ctx+16(FP), CX
| MOVQ 24(CX), CX
| MOVQ (CX)(DI*8), DI
|
| // Update Literal Length State
| BZHIQ SI, R15, CX
| SHRL $0x10, SI
| ADDQ CX, SI
|
| // Load ctx.llTable
| MOVQ ctx+16(FP), CX
| MOVQ (CX), CX
| MOVQ (CX)(SI*8), SI
|
| sequenceDecs_decode_56_bmi2_skip_update:
| // Adjust offset
| // Repeat-offset resolution against R10/R11/R12 (shifted by one when the
| // literal length is zero) — matches zstd repeat-offset rules; confirm
| // against RFC 8878.
| MOVQ 16(R9), CX
| CMPQ R13, $0x01
| JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
| MOVQ R11, R12
| MOVQ R10, R11
| MOVQ CX, R10
| JMP sequenceDecs_decode_56_bmi2_after_adjust
|
| sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
| CMPQ (R9), $0x00000000
| JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
| INCQ CX
| JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
|
| sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
| TESTQ CX, CX
| JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
| MOVQ R10, CX
| JMP sequenceDecs_decode_56_bmi2_after_adjust
|
| sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
| CMPQ CX, $0x01
| JB sequenceDecs_decode_56_bmi2_adjust_zero
| JEQ sequenceDecs_decode_56_bmi2_adjust_one
| CMPQ CX, $0x02
| JA sequenceDecs_decode_56_bmi2_adjust_three
| JMP sequenceDecs_decode_56_bmi2_adjust_two
|
| sequenceDecs_decode_56_bmi2_adjust_zero:
| MOVQ R10, R13
| JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
|
| sequenceDecs_decode_56_bmi2_adjust_one:
| MOVQ R11, R13
| JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
|
| sequenceDecs_decode_56_bmi2_adjust_two:
| MOVQ R12, R13
| JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
|
| sequenceDecs_decode_56_bmi2_adjust_three:
| LEAQ -1(R10), R13
|
| sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
| // A resolved offset of zero is clamped to one.
| TESTQ R13, R13
| JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
| MOVQ $0x00000001, R13
|
| sequenceDecs_decode_56_bmi2_adjust_temp_valid:
| CMPQ CX, $0x01
| CMOVQNE R11, R12
| MOVQ R10, R11
| MOVQ R13, R10
| MOVQ R13, CX
|
| sequenceDecs_decode_56_bmi2_after_adjust:
| MOVQ CX, 16(R9)
|
| // Check values
| // Accumulate ll+ml into s+256, deduct ll from the literal budget at
| // ctx+128, then validate match length and ml/offset-zero pairing.
| MOVQ 8(R9), R13
| MOVQ (R9), R14
| LEAQ (R13)(R14*1), R15
| MOVQ s+0(FP), BP
| ADDQ R15, 256(BP)
| MOVQ ctx+16(FP), R15
| SUBQ R14, 128(R15)
| JS error_not_enough_literals
| CMPQ R13, $0x00020002
| JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
| TESTQ CX, CX
| JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
| TESTQ R13, R13
| JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch
|
| sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
| // Next 24-byte record; loop while ctx+96 stays non-negative, then flush
| // repeat offsets and bitreader state back to s and br.
| ADDQ $0x18, R9
| MOVQ ctx+16(FP), CX
| DECQ 96(CX)
| JNS sequenceDecs_decode_56_bmi2_main_loop
| MOVQ s+0(FP), CX
| MOVQ R10, 144(CX)
| MOVQ R11, 152(CX)
| MOVQ R12, 160(CX)
| MOVQ br+8(FP), CX
| MOVQ AX, 24(CX)
| MOVB DL, 32(CX)
| MOVQ BX, 8(CX)
|
| // Return success
| MOVQ $0x00000000, ret+24(FP)
| RET
|
| // Return with match length error
| sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
| MOVQ $0x00000001, ret+24(FP)
| RET
|
| // Return with match too long error
| sequenceDecs_decode_56_bmi2_error_match_len_too_big:
| MOVQ $0x00000002, ret+24(FP)
| RET
|
| // Return with match offset too long error
| // NOTE(review): unreachable — nothing jumps here and the path above
| // ends in RET; emitted by the generator.
| MOVQ $0x00000003, ret+24(FP)
| RET
|
| // Return with not enough literals error
| error_not_enough_literals:
| MOVQ $0x00000004, ret+24(FP)
| RET
|
| // Return with overread error
| error_overread:
| MOVQ $0x00000006, ret+24(FP)
| RET
| |
| // func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
| // Requires: SSE
| //
| // Executes the already-decoded sequences: for each 24-byte record
| // (ll at +0, ml at +8, mo at +16) it copies ll literal bytes and then
| // the ml-byte match — from the history buffer and/or the output written
| // so far — into the output.  Returns true on success, false when a match
| // offset exceeds the window or the data produced so far.
| //
| // Register map (from the prologue): AX = current sequence record,
| // CX = sequence count, DX = sequence index, BX = output write pointer,
| // SI = literals read pointer, DI = output position, R8 = window size,
| // R9 = end of the history buffer, R10 = history length.
| // Field offsets mirror the Go-side executeAsmContext layout; confirm
| // against the Go declaration if it changes.
| TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
| MOVQ ctx+0(FP), R10
| MOVQ 8(R10), CX
| TESTQ CX, CX
| JZ empty_seqs
| MOVQ (R10), AX
| MOVQ 24(R10), DX
| MOVQ 32(R10), BX
| MOVQ 80(R10), SI
| MOVQ 104(R10), DI
| MOVQ 120(R10), R8
| MOVQ 56(R10), R9
| MOVQ 64(R10), R10
| ADDQ R10, R9
|
| // seqsBase += 24 * seqIndex
| LEAQ (DX)(DX*2), R11
| SHLQ $0x03, R11
| ADDQ R11, AX
|
| // outBase += outPosition
| ADDQ DI, BX
|
| main_loop:
| MOVQ (AX), R11
| MOVQ 16(AX), R12
| MOVQ 8(AX), R13
|
| // Copy literals
| // Copies in 16-byte MOVUPS chunks, rounding the length UP — may write
| // up to 15 bytes past the literal length, so the destination needs
| // slack; the _safe_ variant presumably avoids this (confirm).
| TESTQ R11, R11
| JZ check_offset
| XORQ R14, R14
|
| copy_1:
| MOVUPS (SI)(R14*1), X0
| MOVUPS X0, (BX)(R14*1)
| ADDQ $0x10, R14
| CMPQ R14, R11
| JB copy_1
| ADDQ R11, SI
| ADDQ R11, BX
| ADDQ R11, DI
|
| // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
| check_offset:
| LEAQ (DI)(R10*1), R11
| CMPQ R12, R11
| JG error_match_off_too_big
| CMPQ R12, R8
| JG error_match_off_too_big
|
| // Copy match from history
| // R11 = mo - outPosition; positive means part (or all) of the match
| // lies in the history buffer and must be copied from there first.
| MOVQ R12, R11
| SUBQ DI, R11
| JLS copy_match
| MOVQ R9, R14
| SUBQ R11, R14
| CMPQ R13, R11
| JG copy_all_from_history
| MOVQ R13, R11
| SUBQ $0x10, R11
| JB copy_4_small
|
| copy_4_loop:
| MOVUPS (R14), X0
| MOVUPS X0, (BX)
| ADDQ $0x10, R14
| ADDQ $0x10, BX
| SUBQ $0x10, R11
| JAE copy_4_loop
| LEAQ 16(R14)(R11*1), R14
| LEAQ 16(BX)(R11*1), BX
| MOVUPS -16(R14), X0
| MOVUPS X0, -16(BX)
| JMP copy_4_end
|
| copy_4_small:
| // Short-length dispatch: overlapping head/tail loads cover 3-16 bytes
| // exactly without running past the copy length.
| CMPQ R13, $0x03
| JE copy_4_move_3
| CMPQ R13, $0x08
| JB copy_4_move_4through7
| JMP copy_4_move_8through16
|
| copy_4_move_3:
| MOVW (R14), R11
| MOVB 2(R14), R12
| MOVW R11, (BX)
| MOVB R12, 2(BX)
| ADDQ R13, R14
| ADDQ R13, BX
| JMP copy_4_end
|
| copy_4_move_4through7:
| MOVL (R14), R11
| MOVL -4(R14)(R13*1), R12
| MOVL R11, (BX)
| MOVL R12, -4(BX)(R13*1)
| ADDQ R13, R14
| ADDQ R13, BX
| JMP copy_4_end
|
| copy_4_move_8through16:
| MOVQ (R14), R11
| MOVQ -8(R14)(R13*1), R12
| MOVQ R11, (BX)
| MOVQ R12, -8(BX)(R13*1)
| ADDQ R13, R14
| ADDQ R13, BX
|
| copy_4_end:
| ADDQ R13, DI
| ADDQ $0x18, AX
| INCQ DX
| CMPQ DX, CX
| JB main_loop
| JMP loop_finished
|
| copy_all_from_history:
| // Match starts in history and continues into the current output: copy
| // the in-history part here, then fall through to copy_match for the rest.
| MOVQ R11, R15
| SUBQ $0x10, R15
| JB copy_5_small
|
| copy_5_loop:
| MOVUPS (R14), X0
| MOVUPS X0, (BX)
| ADDQ $0x10, R14
| ADDQ $0x10, BX
| SUBQ $0x10, R15
| JAE copy_5_loop
| LEAQ 16(R14)(R15*1), R14
| LEAQ 16(BX)(R15*1), BX
| MOVUPS -16(R14), X0
| MOVUPS X0, -16(BX)
| JMP copy_5_end
|
| copy_5_small:
| CMPQ R11, $0x03
| JE copy_5_move_3
| JB copy_5_move_1or2
| CMPQ R11, $0x08
| JB copy_5_move_4through7
| JMP copy_5_move_8through16
|
| copy_5_move_1or2:
| MOVB (R14), R15
| MOVB -1(R14)(R11*1), BP
| MOVB R15, (BX)
| MOVB BP, -1(BX)(R11*1)
| ADDQ R11, R14
| ADDQ R11, BX
| JMP copy_5_end
|
| copy_5_move_3:
| MOVW (R14), R15
| MOVB 2(R14), BP
| MOVW R15, (BX)
| MOVB BP, 2(BX)
| ADDQ R11, R14
| ADDQ R11, BX
| JMP copy_5_end
|
| copy_5_move_4through7:
| MOVL (R14), R15
| MOVL -4(R14)(R11*1), BP
| MOVL R15, (BX)
| MOVL BP, -4(BX)(R11*1)
| ADDQ R11, R14
| ADDQ R11, BX
| JMP copy_5_end
|
| copy_5_move_8through16:
| MOVQ (R14), R15
| MOVQ -8(R14)(R11*1), BP
| MOVQ R15, (BX)
| MOVQ BP, -8(BX)(R11*1)
| ADDQ R11, R14
| ADDQ R11, BX
|
| copy_5_end:
| ADDQ R11, DI
| SUBQ R11, R13
|
| // Copy match from the current buffer
| copy_match:
| MOVQ BX, R11
| SUBQ R12, R11
|
| // ml <= mo
| CMPQ R13, R12
| JA copy_overlapping_match
|
| // Copy non-overlapping match
| // 16-byte chunks, rounded up — again relies on destination slack.
| ADDQ R13, DI
| MOVQ BX, R12
| ADDQ R13, BX
|
| copy_2:
| MOVUPS (R11), X0
| MOVUPS X0, (R12)
| ADDQ $0x10, R11
| ADDQ $0x10, R12
| SUBQ $0x10, R13
| JHI copy_2
| JMP handle_loop
|
| // Copy overlapping match
| copy_overlapping_match:
| // Source and destination overlap (ml > mo): a byte-at-a-time copy is
| // required so earlier output bytes feed later ones.
| ADDQ R13, DI
|
| copy_slow_3:
| MOVB (R11), R12
| MOVB R12, (BX)
| INCQ R11
| INCQ BX
| DECQ R13
| JNZ copy_slow_3
|
| handle_loop:
| ADDQ $0x18, AX
| INCQ DX
| CMPQ DX, CX
| JB main_loop
|
| loop_finished:
| // Return value
| MOVB $0x01, ret+8(FP)
|
| // Update the context
| // Write back seqIndex, outPosition and the literal read position
| // (SI relative to the literals base at ctx+80).
| MOVQ ctx+0(FP), AX
| MOVQ DX, 24(AX)
| MOVQ DI, 104(AX)
| SUBQ 80(AX), SI
| MOVQ SI, 112(AX)
| RET
|
| error_match_off_too_big:
| // Return value
| MOVB $0x00, ret+8(FP)
|
| // Update the context
| MOVQ ctx+0(FP), AX
| MOVQ DX, 24(AX)
| MOVQ DI, 104(AX)
| SUBQ 80(AX), SI
| MOVQ SI, 112(AX)
| RET
|
| empty_seqs:
| // Return value
| // Nothing to execute: report success without touching the context.
| MOVB $0x01, ret+8(FP)
| RET
| |
// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
// Requires: SSE
//
// Executes the already-decoded sequences held in ctx: for each sequence it
// copies the literals, then the match (taken either from the history buffer
// or from the output produced so far) into the output buffer. This is the
// "safe" variant: copies shorter than 16 bytes are dispatched to exact-size
// move sequences, so the routine never writes past the destination.
//
// Register assignment after the prologue (field offsets mirror the Go-side
// executeAsmContext layout — NOTE(review): inferred from the loads and the
// generator comments below; confirm against the Go struct definition):
//   AX seqs base, CX number of seqs, DX seq index, BX output write pointer,
//   SI literals pointer, DI output position, R8 window size,
//   R9 pointer past the end of the history buffer.
// Returns true (1) on success, false (0) when a match offset is out of range.
TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
	MOVQ  ctx+0(FP), R10
	MOVQ  8(R10), CX
	TESTQ CX, CX
	JZ    empty_seqs
	MOVQ  (R10), AX
	MOVQ  24(R10), DX
	MOVQ  32(R10), BX
	MOVQ  80(R10), SI
	MOVQ  104(R10), DI
	MOVQ  120(R10), R8
	MOVQ  56(R10), R9
	MOVQ  64(R10), R10
	ADDQ  R10, R9

	// seqsBase += 24 * seqIndex
	LEAQ (DX)(DX*2), R11
	SHLQ $0x03, R11
	ADDQ R11, AX

	// outBase += outPosition
	ADDQ DI, BX

main_loop:
	// Per-sequence fields: R11 = literal length, R12 = match offset,
	// R13 = match length (24-byte records, see the stride below).
	MOVQ (AX), R11
	MOVQ 16(AX), R12
	MOVQ 8(AX), R13

	// Copy literals
	TESTQ R11, R11
	JZ    check_offset
	MOVQ  R11, R14
	SUBQ  $0x10, R14
	JB    copy_1_small

copy_1_loop:
	// Bulk 16-byte copy; the tail is finished with one final (possibly
	// overlapping with the previous chunk) 16-byte load/store pair.
	MOVUPS (SI), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, SI
	ADDQ   $0x10, BX
	SUBQ   $0x10, R14
	JAE    copy_1_loop
	LEAQ   16(SI)(R14*1), SI
	LEAQ   16(BX)(R14*1), BX
	MOVUPS -16(SI), X0
	MOVUPS X0, -16(BX)
	JMP    copy_1_end

copy_1_small:
	// Fewer than 16 bytes: dispatch on the exact length so no byte past the
	// destination is written.
	CMPQ R11, $0x03
	JE   copy_1_move_3
	JB   copy_1_move_1or2
	CMPQ R11, $0x08
	JB   copy_1_move_4through7
	JMP  copy_1_move_8through16

copy_1_move_1or2:
	MOVB (SI), R14
	MOVB -1(SI)(R11*1), R15
	MOVB R14, (BX)
	MOVB R15, -1(BX)(R11*1)
	ADDQ R11, SI
	ADDQ R11, BX
	JMP  copy_1_end

copy_1_move_3:
	MOVW (SI), R14
	MOVB 2(SI), R15
	MOVW R14, (BX)
	MOVB R15, 2(BX)
	ADDQ R11, SI
	ADDQ R11, BX
	JMP  copy_1_end

copy_1_move_4through7:
	MOVL (SI), R14
	MOVL -4(SI)(R11*1), R15
	MOVL R14, (BX)
	MOVL R15, -4(BX)(R11*1)
	ADDQ R11, SI
	ADDQ R11, BX
	JMP  copy_1_end

copy_1_move_8through16:
	MOVQ (SI), R14
	MOVQ -8(SI)(R11*1), R15
	MOVQ R14, (BX)
	MOVQ R15, -8(BX)(R11*1)
	ADDQ R11, SI
	ADDQ R11, BX

copy_1_end:
	ADDQ R11, DI

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	LEAQ (DI)(R10*1), R11
	CMPQ R12, R11
	JG   error_match_off_too_big
	CMPQ R12, R8
	JG   error_match_off_too_big

	// Copy match from history
	MOVQ R12, R11
	SUBQ DI, R11
	JLS  copy_match
	MOVQ R9, R14
	SUBQ R11, R14
	CMPQ R13, R11
	JG   copy_all_from_history
	MOVQ R13, R11
	SUBQ $0x10, R11
	JB   copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, R14
	ADDQ   $0x10, BX
	SUBQ   $0x10, R11
	JAE    copy_4_loop
	LEAQ   16(R14)(R11*1), R14
	LEAQ   16(BX)(R11*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP    copy_4_end

copy_4_small:
	CMPQ R13, $0x03
	JE   copy_4_move_3
	CMPQ R13, $0x08
	JB   copy_4_move_4through7
	JMP  copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), R11
	MOVB 2(R14), R12
	MOVW R11, (BX)
	MOVB R12, 2(BX)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP  copy_4_end

copy_4_move_4through7:
	MOVL (R14), R11
	MOVL -4(R14)(R13*1), R12
	MOVL R11, (BX)
	MOVL R12, -4(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP  copy_4_end

copy_4_move_8through16:
	MOVQ (R14), R11
	MOVQ -8(R14)(R13*1), R12
	MOVQ R11, (BX)
	MOVQ R12, -8(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX

copy_4_end:
	ADDQ R13, DI
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB   main_loop
	JMP  loop_finished

copy_all_from_history:
	// The match spans the history/output boundary: first copy the part that
	// lives in history (R11 bytes), then fall through to copy_match for the
	// remainder out of the current output buffer.
	MOVQ R11, R15
	SUBQ $0x10, R15
	JB   copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, R14
	ADDQ   $0x10, BX
	SUBQ   $0x10, R15
	JAE    copy_5_loop
	LEAQ   16(R14)(R15*1), R14
	LEAQ   16(BX)(R15*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP    copy_5_end

copy_5_small:
	CMPQ R11, $0x03
	JE   copy_5_move_3
	JB   copy_5_move_1or2
	CMPQ R11, $0x08
	JB   copy_5_move_4through7
	JMP  copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(R11*1), BP
	MOVB R15, (BX)
	MOVB BP, -1(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (BX)
	MOVB BP, 2(BX)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(R11*1), BP
	MOVL R15, (BX)
	MOVL BP, -4(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(R11*1), BP
	MOVQ R15, (BX)
	MOVQ BP, -8(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX

copy_5_end:
	ADDQ R11, DI
	SUBQ R11, R13

	// Copy match from the current buffer
copy_match:
	MOVQ BX, R11
	SUBQ R12, R11

	// ml <= mo
	CMPQ R13, R12
	JA   copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, DI
	MOVQ R13, R12
	SUBQ $0x10, R12
	JB   copy_2_small

copy_2_loop:
	MOVUPS (R11), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, R11
	ADDQ   $0x10, BX
	SUBQ   $0x10, R12
	JAE    copy_2_loop
	LEAQ   16(R11)(R12*1), R11
	LEAQ   16(BX)(R12*1), BX
	MOVUPS -16(R11), X0
	MOVUPS X0, -16(BX)
	JMP    copy_2_end

copy_2_small:
	CMPQ R13, $0x03
	JE   copy_2_move_3
	JB   copy_2_move_1or2
	CMPQ R13, $0x08
	JB   copy_2_move_4through7
	JMP  copy_2_move_8through16

copy_2_move_1or2:
	MOVB (R11), R12
	MOVB -1(R11)(R13*1), R14
	MOVB R12, (BX)
	MOVB R14, -1(BX)(R13*1)
	ADDQ R13, R11
	ADDQ R13, BX
	JMP  copy_2_end

copy_2_move_3:
	MOVW (R11), R12
	MOVB 2(R11), R14
	MOVW R12, (BX)
	MOVB R14, 2(BX)
	ADDQ R13, R11
	ADDQ R13, BX
	JMP  copy_2_end

copy_2_move_4through7:
	MOVL (R11), R12
	MOVL -4(R11)(R13*1), R14
	MOVL R12, (BX)
	MOVL R14, -4(BX)(R13*1)
	ADDQ R13, R11
	ADDQ R13, BX
	JMP  copy_2_end

copy_2_move_8through16:
	MOVQ (R11), R12
	MOVQ -8(R11)(R13*1), R14
	MOVQ R12, (BX)
	MOVQ R14, -8(BX)(R13*1)
	ADDQ R13, R11
	ADDQ R13, BX

copy_2_end:
	JMP handle_loop

	// Copy overlapping match
copy_overlapping_match:
	// mo < ml: source and destination overlap, so a byte-at-a-time copy is
	// required to reproduce the run-extension semantics.
	ADDQ R13, DI

copy_slow_3:
	MOVB (R11), R12
	MOVB R12, (BX)
	INCQ R11
	INCQ BX
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	// Advance to the next 24-byte sequence record.
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB   main_loop

loop_finished:
	// Return value
	MOVB $0x01, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

error_match_off_too_big:
	// Return value
	MOVB $0x00, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

empty_seqs:
	// Return value
	MOVB $0x01, ret+8(FP)
	RET
| |
// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
//
// Decodes sequences from the bitstream and executes them immediately in the
// same loop: each iteration refills the bit reader, extracts the offset,
// match-length and literal-length values, updates the three FSE states,
// applies the zstd repeat-offset adjustment, then copies the literals and
// the match into s.out. Copies may use 16-byte SSE chunks that write past
// the exact end (non-"safe" variant).
//
// Stack slots used for the decoded sequence (NOTE(review): inferred from the
// stores/comparisons below — confirm against the generator):
//   8(SP) offset, 16(SP) match length, 24(SP) literal length,
//   32(SP) pointer past s.out capacity, 40(SP) history length,
//   48(SP) pointer past the end of the history buffer, 56(SP) window size.
//
// Return codes (see the error labels at the bottom): 0 success,
// 1 match-length/offset mismatch, 2 match length too big, 3 match offset
// too big, 4 not enough literals, 5 not enough output space, 6 overread.
TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
	MOVQ    br+8(FP), CX
	MOVQ    24(CX), DX
	MOVBQZX 32(CX), BX
	MOVQ    (CX), AX
	MOVQ    8(CX), SI
	ADDQ    SI, AX
	MOVQ    AX, (SP)
	MOVQ    ctx+16(FP), AX
	MOVQ    72(AX), DI
	MOVQ    80(AX), R8
	MOVQ    88(AX), R9
	XORQ    CX, CX
	MOVQ    CX, 8(SP)
	MOVQ    CX, 16(SP)
	MOVQ    CX, 24(SP)
	MOVQ    112(AX), R10
	MOVQ    128(AX), CX
	MOVQ    CX, 32(SP)
	MOVQ    144(AX), R11
	MOVQ    136(AX), R12
	MOVQ    200(AX), CX
	MOVQ    CX, 56(SP)
	MOVQ    176(AX), CX
	MOVQ    CX, 48(SP)
	MOVQ    184(AX), AX
	MOVQ    AX, 40(SP)
	MOVQ    40(SP), AX
	ADDQ    AX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R10, 32(SP)

	// outBase += outPosition
	ADDQ R12, R10

sequenceDecs_decodeSync_amd64_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	CMPQ SI, $0x08
	JL   sequenceDecs_decodeSync_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decodeSync_amd64_fill_end

sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decodeSync_amd64_fill_check_overread
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decodeSync_amd64_fill_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R13
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R13), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte

sequenceDecs_decodeSync_amd64_fill_check_overread:
	CMPQ BX, $0x40
	JA   error_overread

sequenceDecs_decodeSync_amd64_fill_end:
	// Update offset
	MOVQ  R9, AX
	MOVQ  BX, CX
	MOVQ  DX, R14
	SHLQ  CL, R14
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decodeSync_amd64_of_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decodeSync_amd64_of_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decodeSync_amd64_of_update_zero
	NEGQ  CX
	SHRQ  CL, R14
	ADDQ  R14, AX

sequenceDecs_decodeSync_amd64_of_update_zero:
	MOVQ AX, 8(SP)

	// Update match length
	MOVQ  R8, AX
	MOVQ  BX, CX
	MOVQ  DX, R14
	SHLQ  CL, R14
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decodeSync_amd64_ml_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decodeSync_amd64_ml_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decodeSync_amd64_ml_update_zero
	NEGQ  CX
	SHRQ  CL, R14
	ADDQ  R14, AX

sequenceDecs_decodeSync_amd64_ml_update_zero:
	MOVQ AX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ SI, $0x08
	JL   sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decodeSync_amd64_fill_2_end

sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decodeSync_amd64_fill_2_check_overread
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R13
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R13), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte

sequenceDecs_decodeSync_amd64_fill_2_check_overread:
	CMPQ BX, $0x40
	JA   error_overread

sequenceDecs_decodeSync_amd64_fill_2_end:
	// Update literal length
	MOVQ  DI, AX
	MOVQ  BX, CX
	MOVQ  DX, R14
	SHLQ  CL, R14
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decodeSync_amd64_ll_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decodeSync_amd64_ll_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decodeSync_amd64_ll_update_zero
	NEGQ  CX
	SHRQ  CL, R14
	ADDQ  R14, AX

sequenceDecs_decodeSync_amd64_ll_update_zero:
	MOVQ AX, 24(SP)

	// Fill bitreader for state updates
	MOVQ    R13, (SP)
	MOVQ    R9, AX
	SHRQ    $0x08, AX
	MOVBQZX AL, AX
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00
	JZ      sequenceDecs_decodeSync_amd64_skip_update

	// Update Literal Length State
	MOVBQZX DI, R13
	SHRL    $0x10, DI
	LEAQ    (BX)(R13*1), CX
	MOVQ    DX, R14
	MOVQ    CX, BX
	ROLQ    CL, R14
	MOVL    $0x00000001, R15
	MOVB    R13, CL
	SHLL    CL, R15
	DECL    R15
	ANDQ    R15, R14
	ADDQ    R14, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R13
	SHRL    $0x10, R8
	LEAQ    (BX)(R13*1), CX
	MOVQ    DX, R14
	MOVQ    CX, BX
	ROLQ    CL, R14
	MOVL    $0x00000001, R15
	MOVB    R13, CL
	SHLL    CL, R15
	DECL    R15
	ANDQ    R15, R14
	ADDQ    R14, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R13
	SHRL    $0x10, R9
	LEAQ    (BX)(R13*1), CX
	MOVQ    DX, R14
	MOVQ    CX, BX
	ROLQ    CL, R14
	MOVL    $0x00000001, R15
	MOVB    R13, CL
	SHLL    CL, R15
	DECL    R15
	ANDQ    R15, R14
	ADDQ    R14, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decodeSync_amd64_skip_update:
	// Adjust offset
	MOVQ   s+0(FP), CX
	MOVQ   8(SP), R13
	CMPQ   AX, $0x01
	JBE    sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0
	MOVQ   R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP    sequenceDecs_decodeSync_amd64_after_adjust

sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
	CMPQ 24(SP), $0x00000000
	JNE  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
	INCQ R13
	JMP  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero

sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
	MOVQ  144(CX), R13
	JMP   sequenceDecs_decodeSync_amd64_after_adjust

sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
	// Repeat-offset resolution without branches: CMOV selects between
	// prevOffset[0]-1 (offset==3 case) and prevOffset[offset-1].
	MOVQ     R13, AX
	XORQ     R14, R14
	MOVQ     $-1, R15
	CMPQ     R13, $0x03
	CMOVQEQ  R14, AX
	CMOVQEQ  R15, R14
	ADDQ     144(CX)(AX*8), R14
	JNZ      sequenceDecs_decodeSync_amd64_adjust_temp_valid
	MOVQ     $0x00000001, R14

sequenceDecs_decodeSync_amd64_adjust_temp_valid:
	CMPQ R13, $0x01
	JZ   sequenceDecs_decodeSync_amd64_adjust_skip
	MOVQ 152(CX), AX
	MOVQ AX, 160(CX)

sequenceDecs_decodeSync_amd64_adjust_skip:
	MOVQ 144(CX), AX
	MOVQ AX, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_amd64_after_adjust:
	MOVQ R13, 8(SP)

	// Check values
	MOVQ  16(SP), AX
	MOVQ  24(SP), CX
	LEAQ  (AX)(CX*1), R14
	MOVQ  s+0(FP), R15
	ADDQ  R14, 256(R15)
	MOVQ  ctx+16(FP), R14
	SUBQ  CX, 104(R14)
	JS    error_not_enough_literals
	CMPQ  AX, $0x00020002
	JA    sequenceDecs_decodeSync_amd64_error_match_len_too_big
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ   sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
	MOVQ 24(SP), AX
	MOVQ 8(SP), CX
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (AX)(R13*1), R14
	ADDQ R10, R14
	CMPQ R14, 32(SP)
	JA   error_not_enough_space

	// Copy literals
	TESTQ AX, AX
	JZ    check_offset
	XORQ  R14, R14

copy_1:
	// 16-byte chunked copy; may over-copy up to 15 bytes past the literal
	// length (non-safe variant).
	MOVUPS (R11)(R14*1), X0
	MOVUPS X0, (R10)(R14*1)
	ADDQ   $0x10, R14
	CMPQ   R14, AX
	JB     copy_1
	ADDQ   AX, R11
	ADDQ   AX, R10
	ADDQ   AX, R12

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ R12, AX
	ADDQ 40(SP), AX
	CMPQ CX, AX
	JG   error_match_off_too_big
	CMPQ CX, 56(SP)
	JG   error_match_off_too_big

	// Copy match from history
	MOVQ CX, AX
	SUBQ R12, AX
	JLS  copy_match
	MOVQ 48(SP), R14
	SUBQ AX, R14
	CMPQ R13, AX
	JG   copy_all_from_history
	MOVQ R13, AX
	SUBQ $0x10, AX
	JB   copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R10)
	ADDQ   $0x10, R14
	ADDQ   $0x10, R10
	SUBQ   $0x10, AX
	JAE    copy_4_loop
	LEAQ   16(R14)(AX*1), R14
	LEAQ   16(R10)(AX*1), R10
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R10)
	JMP    copy_4_end

copy_4_small:
	CMPQ R13, $0x03
	JE   copy_4_move_3
	CMPQ R13, $0x08
	JB   copy_4_move_4through7
	JMP  copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), AX
	MOVB 2(R14), CL
	MOVW AX, (R10)
	MOVB CL, 2(R10)
	ADDQ R13, R14
	ADDQ R13, R10
	JMP  copy_4_end

copy_4_move_4through7:
	MOVL (R14), AX
	MOVL -4(R14)(R13*1), CX
	MOVL AX, (R10)
	MOVL CX, -4(R10)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R10
	JMP  copy_4_end

copy_4_move_8through16:
	MOVQ (R14), AX
	MOVQ -8(R14)(R13*1), CX
	MOVQ AX, (R10)
	MOVQ CX, -8(R10)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R10

copy_4_end:
	ADDQ R13, R12
	JMP  handle_loop
	JMP loop_finished // unreachable; kept as emitted by the generator

copy_all_from_history:
	// Match spans the history/output boundary: copy the history part (AX
	// bytes) first, then fall through to copy_match for the remainder.
	MOVQ AX, R15
	SUBQ $0x10, R15
	JB   copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R10)
	ADDQ   $0x10, R14
	ADDQ   $0x10, R10
	SUBQ   $0x10, R15
	JAE    copy_5_loop
	LEAQ   16(R14)(R15*1), R14
	LEAQ   16(R10)(R15*1), R10
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R10)
	JMP    copy_5_end

copy_5_small:
	CMPQ AX, $0x03
	JE   copy_5_move_3
	JB   copy_5_move_1or2
	CMPQ AX, $0x08
	JB   copy_5_move_4through7
	JMP  copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(AX*1), BP
	MOVB R15, (R10)
	MOVB BP, -1(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP  copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (R10)
	MOVB BP, 2(R10)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP  copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(AX*1), BP
	MOVL R15, (R10)
	MOVL BP, -4(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP  copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(AX*1), BP
	MOVQ R15, (R10)
	MOVQ BP, -8(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10

copy_5_end:
	ADDQ AX, R12
	SUBQ AX, R13

	// Copy match from the current buffer
copy_match:
	MOVQ R10, AX
	SUBQ CX, AX

	// ml <= mo
	CMPQ R13, CX
	JA   copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, R12
	MOVQ R10, CX
	ADDQ R13, R10

copy_2:
	MOVUPS (AX), X0
	MOVUPS X0, (CX)
	ADDQ   $0x10, AX
	ADDQ   $0x10, CX
	SUBQ   $0x10, R13
	JHI    copy_2
	JMP    handle_loop

	// Copy overlapping match
copy_overlapping_match:
	// mo < ml: byte-by-byte copy reproduces the run-extension semantics.
	ADDQ R13, R12

copy_slow_3:
	MOVB (AX), CL
	MOVB CL, (R10)
	INCQ AX
	INCQ R10
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS  sequenceDecs_decodeSync_amd64_main_loop

loop_finished:
	// Write back the bit reader position before returning.
	MOVQ br+8(FP), AX
	MOVQ DX, 24(AX)
	MOVB BL, 32(AX)
	MOVQ SI, 8(AX)

	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R12, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R11
	MOVQ R11, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_amd64_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET
| |
// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
//
// BMI2 variant of sequenceDecs_decodeSync_amd64: decodes sequences from the
// bitstream and executes them (literal + match copy into s.out) in one loop.
// Bit extraction uses BEXTR/BZHI/SHRX in place of the shift/mask sequences
// of the baseline version; the copy paths and error handling are otherwise
// structured the same way.
//
// Stack slots used for the decoded sequence (NOTE(review): inferred from the
// stores/comparisons below — confirm against the generator):
//   8(SP) offset, 16(SP) match length, 24(SP) literal length,
//   32(SP) pointer past s.out capacity, 40(SP) history length,
//   48(SP) pointer past the end of the history buffer, 56(SP) window size.
//
// Return codes match the amd64 variant: 0 success, 1 match-length/offset
// mismatch, 2 match length too big, 3 match offset too big, 4 not enough
// literals, 5 not enough output space, 6 overread.
TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
	MOVQ    br+8(FP), BX
	MOVQ    24(BX), AX
	MOVBQZX 32(BX), DX
	MOVQ    (BX), CX
	MOVQ    8(BX), BX
	ADDQ    BX, CX
	MOVQ    CX, (SP)
	MOVQ    ctx+16(FP), CX
	MOVQ    72(CX), SI
	MOVQ    80(CX), DI
	MOVQ    88(CX), R8
	XORQ    R9, R9
	MOVQ    R9, 8(SP)
	MOVQ    R9, 16(SP)
	MOVQ    R9, 24(SP)
	MOVQ    112(CX), R9
	MOVQ    128(CX), R10
	MOVQ    R10, 32(SP)
	MOVQ    144(CX), R10
	MOVQ    136(CX), R11
	MOVQ    200(CX), R12
	MOVQ    R12, 56(SP)
	MOVQ    176(CX), R12
	MOVQ    R12, 48(SP)
	MOVQ    184(CX), CX
	MOVQ    CX, 40(SP)
	MOVQ    40(SP), CX
	ADDQ    CX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R9, 32(SP)

	// outBase += outPosition
	ADDQ R11, R9

sequenceDecs_decodeSync_bmi2_main_loop:
	MOVQ (SP), R12

	// Fill bitreader to have enough for the offset and match length.
	CMPQ BX, $0x08
	JL   sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decodeSync_bmi2_fill_end

sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decodeSync_bmi2_fill_check_overread
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decodeSync_bmi2_fill_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R12
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R12), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte

sequenceDecs_decodeSync_bmi2_fill_check_overread:
	CMPQ DX, $0x40
	JA   error_overread

sequenceDecs_decodeSync_bmi2_fill_end:
	// Update offset
	// BEXTR pulls the bit-count field out of the state word; BZHI masks the
	// extracted bits.
	MOVQ   $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ   AX, R14
	LEAQ   (DX)(R13*1), CX
	ROLQ   CL, R14
	BZHIQ  R13, R14, R14
	MOVQ   CX, DX
	MOVQ   R8, CX
	SHRQ   $0x20, CX
	ADDQ   R14, CX
	MOVQ   CX, 8(SP)

	// Update match length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, DI, R13
	MOVQ   AX, R14
	LEAQ   (DX)(R13*1), CX
	ROLQ   CL, R14
	BZHIQ  R13, R14, R14
	MOVQ   CX, DX
	MOVQ   DI, CX
	SHRQ   $0x20, CX
	ADDQ   R14, CX
	MOVQ   CX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ BX, $0x08
	JL   sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decodeSync_bmi2_fill_2_end

sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decodeSync_bmi2_fill_2_check_overread
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R12
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R12), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte

sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
	CMPQ DX, $0x40
	JA   error_overread

sequenceDecs_decodeSync_bmi2_fill_2_end:
	// Update literal length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, SI, R13
	MOVQ   AX, R14
	LEAQ   (DX)(R13*1), CX
	ROLQ   CL, R14
	BZHIQ  R13, R14, R14
	MOVQ   CX, DX
	MOVQ   SI, CX
	SHRQ   $0x20, CX
	ADDQ   R14, CX
	MOVQ   CX, 24(SP)

	// Fill bitreader for state updates
	MOVQ   R12, (SP)
	MOVQ   $0x00000808, CX
	BEXTRQ CX, R8, R12
	MOVQ   ctx+16(FP), CX
	CMPQ   96(CX), $0x00
	JZ     sequenceDecs_decodeSync_bmi2_skip_update
	LEAQ   (SI)(DI*1), R13
	ADDQ   R8, R13
	MOVBQZX R13, R13
	LEAQ   (DX)(R13*1), CX
	MOVQ   AX, R14
	MOVQ   CX, DX
	ROLQ   CL, R14
	BZHIQ  R13, R14, R14

	// Update Offset State
	BZHIQ R8, R14, CX
	SHRXQ R8, R14, R14
	SHRL  $0x10, R8
	ADDQ  CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R14, CX
	SHRXQ DI, R14, R14
	SHRL  $0x10, DI
	ADDQ  CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R14, CX
	SHRL  $0x10, SI
	ADDQ  CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decodeSync_bmi2_skip_update:
	// Adjust offset
	MOVQ   s+0(FP), CX
	MOVQ   8(SP), R13
	CMPQ   R12, $0x01
	JBE    sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0
	MOVQ   R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP    sequenceDecs_decodeSync_bmi2_after_adjust

sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
	CMPQ 24(SP), $0x00000000
	JNE  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
	INCQ R13
	JMP  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero

sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
	MOVQ  144(CX), R13
	JMP   sequenceDecs_decodeSync_bmi2_after_adjust

sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
	// Repeat-offset resolution without branches: CMOV selects between
	// prevOffset[0]-1 (offset==3 case) and prevOffset[offset-1].
	MOVQ    R13, R12
	XORQ    R14, R14
	MOVQ    $-1, R15
	CMPQ    R13, $0x03
	CMOVQEQ R14, R12
	CMOVQEQ R15, R14
	ADDQ    144(CX)(R12*8), R14
	JNZ     sequenceDecs_decodeSync_bmi2_adjust_temp_valid
	MOVQ    $0x00000001, R14

sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
	CMPQ R13, $0x01
	JZ   sequenceDecs_decodeSync_bmi2_adjust_skip
	MOVQ 152(CX), R12
	MOVQ R12, 160(CX)

sequenceDecs_decodeSync_bmi2_adjust_skip:
	MOVQ 144(CX), R12
	MOVQ R12, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_bmi2_after_adjust:
	MOVQ R13, 8(SP)

	// Check values
	MOVQ  16(SP), CX
	MOVQ  24(SP), R12
	LEAQ  (CX)(R12*1), R14
	MOVQ  s+0(FP), R15
	ADDQ  R14, 256(R15)
	MOVQ  ctx+16(FP), R14
	SUBQ  R12, 104(R14)
	JS    error_not_enough_literals
	CMPQ  CX, $0x00020002
	JA    sequenceDecs_decodeSync_bmi2_error_match_len_too_big
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
	TESTQ CX, CX
	JNZ   sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
	MOVQ 24(SP), CX
	MOVQ 8(SP), R12
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (CX)(R13*1), R14
	ADDQ R9, R14
	CMPQ R14, 32(SP)
	JA   error_not_enough_space

	// Copy literals
	TESTQ CX, CX
	JZ    check_offset
	XORQ  R14, R14

copy_1:
	// 16-byte chunked copy; may over-copy up to 15 bytes past the literal
	// length (non-safe variant).
	MOVUPS (R10)(R14*1), X0
	MOVUPS X0, (R9)(R14*1)
	ADDQ   $0x10, R14
	CMPQ   R14, CX
	JB     copy_1
	ADDQ   CX, R10
	ADDQ   CX, R9
	ADDQ   CX, R11

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ R11, CX
	ADDQ 40(SP), CX
	CMPQ R12, CX
	JG   error_match_off_too_big
	CMPQ R12, 56(SP)
	JG   error_match_off_too_big

	// Copy match from history
	MOVQ R12, CX
	SUBQ R11, CX
	JLS  copy_match
	MOVQ 48(SP), R14
	SUBQ CX, R14
	CMPQ R13, CX
	JG   copy_all_from_history
	MOVQ R13, CX
	SUBQ $0x10, CX
	JB   copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R9)
	ADDQ   $0x10, R14
	ADDQ   $0x10, R9
	SUBQ   $0x10, CX
	JAE    copy_4_loop
	LEAQ   16(R14)(CX*1), R14
	LEAQ   16(R9)(CX*1), R9
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R9)
	JMP    copy_4_end

copy_4_small:
	CMPQ R13, $0x03
	JE   copy_4_move_3
	CMPQ R13, $0x08
	JB   copy_4_move_4through7
	JMP  copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), CX
	MOVB 2(R14), R12
	MOVW CX, (R9)
	MOVB R12, 2(R9)
	ADDQ R13, R14
	ADDQ R13, R9
	JMP  copy_4_end

copy_4_move_4through7:
	MOVL (R14), CX
	MOVL -4(R14)(R13*1), R12
	MOVL CX, (R9)
	MOVL R12, -4(R9)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R9
	JMP  copy_4_end

copy_4_move_8through16:
	MOVQ (R14), CX
	MOVQ -8(R14)(R13*1), R12
	MOVQ CX, (R9)
	MOVQ R12, -8(R9)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R9

copy_4_end:
	ADDQ R13, R11
	JMP  handle_loop
	JMP loop_finished // unreachable; kept as emitted by the generator

copy_all_from_history:
	// Match spans the history/output boundary: copy the history part (CX
	// bytes) first, then fall through to copy_match for the remainder.
	MOVQ CX, R15
	SUBQ $0x10, R15
	JB   copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R9)
	ADDQ   $0x10, R14
	ADDQ   $0x10, R9
	SUBQ   $0x10, R15
	JAE    copy_5_loop
	LEAQ   16(R14)(R15*1), R14
	LEAQ   16(R9)(R15*1), R9
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R9)
	JMP    copy_5_end

copy_5_small:
	CMPQ CX, $0x03
	JE   copy_5_move_3
	JB   copy_5_move_1or2
	CMPQ CX, $0x08
	JB   copy_5_move_4through7
	JMP  copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(CX*1), BP
	MOVB R15, (R9)
	MOVB BP, -1(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP  copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (R9)
	MOVB BP, 2(R9)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP  copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(CX*1), BP
	MOVL R15, (R9)
	MOVL BP, -4(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP  copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(CX*1), BP
	MOVQ R15, (R9)
	MOVQ BP, -8(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9

copy_5_end:
	ADDQ CX, R11
	SUBQ CX, R13

	// Copy match from the current buffer
copy_match:
	MOVQ R9, CX
	SUBQ R12, CX

	// ml <= mo
	CMPQ R13, R12
	JA   copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, R11
	MOVQ R9, R12
	ADDQ R13, R9

copy_2:
	MOVUPS (CX), X0
	MOVUPS X0, (R12)
	ADDQ   $0x10, CX
	ADDQ   $0x10, R12
	SUBQ   $0x10, R13
	JHI    copy_2
	JMP    handle_loop

	// Copy overlapping match
copy_overlapping_match:
	// mo < ml: byte-by-byte copy reproduces the run-extension semantics.
	ADDQ R13, R11

copy_slow_3:
	MOVB (CX), R12
	MOVB R12, (R9)
	INCQ CX
	INCQ R9
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS  sequenceDecs_decodeSync_bmi2_main_loop

loop_finished:
	// Write back the bit reader position before returning.
	MOVQ br+8(FP), CX
	MOVQ AX, 24(CX)
	MOVB DL, 32(CX)
	MOVQ BX, 8(CX)

	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R11, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R10
	MOVQ R10, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET
| |
// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
//
// Register / stack-slot map (as established by the prologue loads below):
//   DX   - bit buffer (br.value)
//   BX   - bits already consumed from the buffer (br.bitsRead)
//   SI   - input bytes remaining (br.off)
//   (SP) - refill pointer into the input (br base + off)
//   DI   - literal-length FSE state (72(ctx))
//   R8   - match-length FSE state   (80(ctx))
//   R9   - offset FSE state         (88(ctx))
//   R10  - output write pointer (outBase + outPosition)
//   R11  - literals read pointer (144(ctx))
//   R12  - bytes written so far (outPosition, 136(ctx))
//   8(SP)/16(SP)/24(SP) - decoded offset / match length / literal length
//   32(SP) - past-end pointer of the output buffer
//   40(SP) - len(hist); 48(SP) - pointer past the end of the history buffer
//   56(SP) - 200(ctx); compared against the offset, so presumably s.windowSize — verify against gen.go
TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
	MOVQ br+8(FP), CX
	MOVQ 24(CX), DX
	MOVBQZX 32(CX), BX
	MOVQ (CX), AX
	MOVQ 8(CX), SI
	ADDQ SI, AX
	MOVQ AX, (SP)
	MOVQ ctx+16(FP), AX
	MOVQ 72(AX), DI
	MOVQ 80(AX), R8
	MOVQ 88(AX), R9
	XORQ CX, CX
	MOVQ CX, 8(SP)
	MOVQ CX, 16(SP)
	MOVQ CX, 24(SP)
	MOVQ 112(AX), R10
	MOVQ 128(AX), CX
	MOVQ CX, 32(SP)
	MOVQ 144(AX), R11
	MOVQ 136(AX), R12
	MOVQ 200(AX), CX
	MOVQ CX, 56(SP)
	MOVQ 176(AX), CX
	MOVQ CX, 48(SP)
	MOVQ 184(AX), AX
	MOVQ AX, 40(SP)
	MOVQ 40(SP), AX
	ADDQ AX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R10, 32(SP)

	// outBase += outPosition
	ADDQ R12, R10

sequenceDecs_decodeSync_safe_amd64_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	CMPQ SI, $0x08
	JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decodeSync_safe_amd64_fill_end

sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
	CMPQ SI, $0x00
	JLE sequenceDecs_decodeSync_safe_amd64_fill_check_overread
	CMPQ BX, $0x07
	JLE sequenceDecs_decodeSync_safe_amd64_fill_end
	SHLQ $0x08, DX
	SUBQ $0x01, R13
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R13), AX
	ORQ AX, DX
	JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte

sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
	CMPQ BX, $0x40
	JA error_overread

sequenceDecs_decodeSync_safe_amd64_fill_end:
	// Update offset
	MOVQ R9, AX
	MOVQ BX, CX
	MOVQ DX, R14
	SHLQ CL, R14
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
	NEGQ CX
	SHRQ CL, R14
	ADDQ R14, AX

sequenceDecs_decodeSync_safe_amd64_of_update_zero:
	MOVQ AX, 8(SP)

	// Update match length
	MOVQ R8, AX
	MOVQ BX, CX
	MOVQ DX, R14
	SHLQ CL, R14
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
	NEGQ CX
	SHRQ CL, R14
	ADDQ R14, AX

sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
	MOVQ AX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ SI, $0x08
	JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end

sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
	CMPQ SI, $0x00
	JLE sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
	CMPQ BX, $0x07
	JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
	SHLQ $0x08, DX
	SUBQ $0x01, R13
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R13), AX
	ORQ AX, DX
	JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte

sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
	CMPQ BX, $0x40
	JA error_overread

sequenceDecs_decodeSync_safe_amd64_fill_2_end:
	// Update literal length
	MOVQ DI, AX
	MOVQ BX, CX
	MOVQ DX, R14
	SHLQ CL, R14
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
	NEGQ CX
	SHRQ CL, R14
	ADDQ R14, AX

sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
	MOVQ AX, 24(SP)

	// Fill bitreader for state updates
	MOVQ R13, (SP)
	MOVQ R9, AX
	SHRQ $0x08, AX
	MOVBQZX AL, AX
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decodeSync_safe_amd64_skip_update

	// Update Literal Length State
	MOVBQZX DI, R13
	SHRL $0x10, DI
	LEAQ (BX)(R13*1), CX
	MOVQ DX, R14
	MOVQ CX, BX
	ROLQ CL, R14
	MOVL $0x00000001, R15
	MOVB R13, CL
	SHLL CL, R15
	DECL R15
	ANDQ R15, R14
	ADDQ R14, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R13
	SHRL $0x10, R8
	LEAQ (BX)(R13*1), CX
	MOVQ DX, R14
	MOVQ CX, BX
	ROLQ CL, R14
	MOVL $0x00000001, R15
	MOVB R13, CL
	SHLL CL, R15
	DECL R15
	ANDQ R15, R14
	ADDQ R14, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R13
	SHRL $0x10, R9
	LEAQ (BX)(R13*1), CX
	MOVQ DX, R14
	MOVQ CX, BX
	ROLQ CL, R14
	MOVL $0x00000001, R15
	MOVB R13, CL
	SHLL CL, R15
	DECL R15
	ANDQ R15, R14
	ADDQ R14, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decodeSync_safe_amd64_skip_update:
	// Adjust offset
	MOVQ s+0(FP), CX
	MOVQ 8(SP), R13
	CMPQ AX, $0x01
	JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0
	MOVQ R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP sequenceDecs_decodeSync_safe_amd64_after_adjust

sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
	CMPQ 24(SP), $0x00000000
	JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
	INCQ R13
	JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero

sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
	MOVQ 144(CX), R13
	JMP sequenceDecs_decodeSync_safe_amd64_after_adjust

	// Resolve a repeat offset: codes 1-3 index the previous-offset slots at
	// 144..160(s); the CMOV pair maps code 3 to prevOffset[0]-1 (R14 starts at
	// -1 in that case), and a zero result is replaced with 1 below.
sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
	MOVQ R13, AX
	XORQ R14, R14
	MOVQ $-1, R15
	CMPQ R13, $0x03
	CMOVQEQ R14, AX
	CMOVQEQ R15, R14
	ADDQ 144(CX)(AX*8), R14
	JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
	MOVQ $0x00000001, R14

sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
	CMPQ R13, $0x01
	JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
	MOVQ 152(CX), AX
	MOVQ AX, 160(CX)

sequenceDecs_decodeSync_safe_amd64_adjust_skip:
	MOVQ 144(CX), AX
	MOVQ AX, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_safe_amd64_after_adjust:
	MOVQ R13, 8(SP)

	// Check values
	MOVQ 16(SP), AX
	MOVQ 24(SP), CX
	LEAQ (AX)(CX*1), R14
	MOVQ s+0(FP), R15
	ADDQ R14, 256(R15)
	MOVQ ctx+16(FP), R14
	SUBQ CX, 104(R14)
	JS error_not_enough_literals
	CMPQ AX, $0x00020002
	JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
	MOVQ 24(SP), AX
	MOVQ 8(SP), CX
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (AX)(R13*1), R14
	ADDQ R10, R14
	CMPQ R14, 32(SP)
	JA error_not_enough_space

	// Copy literals
	TESTQ AX, AX
	JZ check_offset
	MOVQ AX, R14
	SUBQ $0x10, R14
	JB copy_1_small

copy_1_loop:
	MOVUPS (R11), X0
	MOVUPS X0, (R10)
	ADDQ $0x10, R11
	ADDQ $0x10, R10
	SUBQ $0x10, R14
	JAE copy_1_loop
	LEAQ 16(R11)(R14*1), R11
	LEAQ 16(R10)(R14*1), R10
	MOVUPS -16(R11), X0
	MOVUPS X0, -16(R10)
	JMP copy_1_end

copy_1_small:
	CMPQ AX, $0x03
	JE copy_1_move_3
	JB copy_1_move_1or2
	CMPQ AX, $0x08
	JB copy_1_move_4through7
	JMP copy_1_move_8through16

copy_1_move_1or2:
	MOVB (R11), R14
	MOVB -1(R11)(AX*1), R15
	MOVB R14, (R10)
	MOVB R15, -1(R10)(AX*1)
	ADDQ AX, R11
	ADDQ AX, R10
	JMP copy_1_end

copy_1_move_3:
	MOVW (R11), R14
	MOVB 2(R11), R15
	MOVW R14, (R10)
	MOVB R15, 2(R10)
	ADDQ AX, R11
	ADDQ AX, R10
	JMP copy_1_end

copy_1_move_4through7:
	MOVL (R11), R14
	MOVL -4(R11)(AX*1), R15
	MOVL R14, (R10)
	MOVL R15, -4(R10)(AX*1)
	ADDQ AX, R11
	ADDQ AX, R10
	JMP copy_1_end

copy_1_move_8through16:
	MOVQ (R11), R14
	MOVQ -8(R11)(AX*1), R15
	MOVQ R14, (R10)
	MOVQ R15, -8(R10)(AX*1)
	ADDQ AX, R11
	ADDQ AX, R10

copy_1_end:
	ADDQ AX, R12

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ R12, AX
	ADDQ 40(SP), AX
	CMPQ CX, AX
	JG error_match_off_too_big
	CMPQ CX, 56(SP)
	JG error_match_off_too_big

	// Copy match from history
	MOVQ CX, AX
	SUBQ R12, AX
	JLS copy_match
	MOVQ 48(SP), R14
	SUBQ AX, R14
	CMPQ R13, AX
	JG copy_all_from_history
	MOVQ R13, AX
	SUBQ $0x10, AX
	JB copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R10)
	ADDQ $0x10, R14
	ADDQ $0x10, R10
	SUBQ $0x10, AX
	JAE copy_4_loop
	LEAQ 16(R14)(AX*1), R14
	LEAQ 16(R10)(AX*1), R10
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R10)
	JMP copy_4_end

copy_4_small:
	CMPQ R13, $0x03
	JE copy_4_move_3
	CMPQ R13, $0x08
	JB copy_4_move_4through7
	JMP copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), AX
	MOVB 2(R14), CL
	MOVW AX, (R10)
	MOVB CL, 2(R10)
	ADDQ R13, R14
	ADDQ R13, R10
	JMP copy_4_end

copy_4_move_4through7:
	MOVL (R14), AX
	MOVL -4(R14)(R13*1), CX
	MOVL AX, (R10)
	MOVL CX, -4(R10)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R10
	JMP copy_4_end

copy_4_move_8through16:
	MOVQ (R14), AX
	MOVQ -8(R14)(R13*1), CX
	MOVQ AX, (R10)
	MOVQ CX, -8(R10)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R10

copy_4_end:
	ADDQ R13, R12
	JMP handle_loop
	// unreachable: the preceding JMP is unconditional (generator artifact)
	JMP loop_finished

copy_all_from_history:
	MOVQ AX, R15
	SUBQ $0x10, R15
	JB copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R10)
	ADDQ $0x10, R14
	ADDQ $0x10, R10
	SUBQ $0x10, R15
	JAE copy_5_loop
	LEAQ 16(R14)(R15*1), R14
	LEAQ 16(R10)(R15*1), R10
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R10)
	JMP copy_5_end

copy_5_small:
	CMPQ AX, $0x03
	JE copy_5_move_3
	JB copy_5_move_1or2
	CMPQ AX, $0x08
	JB copy_5_move_4through7
	JMP copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(AX*1), BP
	MOVB R15, (R10)
	MOVB BP, -1(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (R10)
	MOVB BP, 2(R10)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(AX*1), BP
	MOVL R15, (R10)
	MOVL BP, -4(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(AX*1), BP
	MOVQ R15, (R10)
	MOVQ BP, -8(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10

copy_5_end:
	ADDQ AX, R12
	SUBQ AX, R13

	// Copy match from the current buffer
copy_match:
	MOVQ R10, AX
	SUBQ CX, AX

	// ml <= mo
	CMPQ R13, CX
	JA copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, R12
	MOVQ R13, CX
	SUBQ $0x10, CX
	JB copy_2_small

copy_2_loop:
	MOVUPS (AX), X0
	MOVUPS X0, (R10)
	ADDQ $0x10, AX
	ADDQ $0x10, R10
	SUBQ $0x10, CX
	JAE copy_2_loop
	LEAQ 16(AX)(CX*1), AX
	LEAQ 16(R10)(CX*1), R10
	MOVUPS -16(AX), X0
	MOVUPS X0, -16(R10)
	JMP copy_2_end

copy_2_small:
	CMPQ R13, $0x03
	JE copy_2_move_3
	JB copy_2_move_1or2
	CMPQ R13, $0x08
	JB copy_2_move_4through7
	JMP copy_2_move_8through16

copy_2_move_1or2:
	MOVB (AX), CL
	MOVB -1(AX)(R13*1), R14
	MOVB CL, (R10)
	MOVB R14, -1(R10)(R13*1)
	ADDQ R13, AX
	ADDQ R13, R10
	JMP copy_2_end

copy_2_move_3:
	MOVW (AX), CX
	MOVB 2(AX), R14
	MOVW CX, (R10)
	MOVB R14, 2(R10)
	ADDQ R13, AX
	ADDQ R13, R10
	JMP copy_2_end

copy_2_move_4through7:
	MOVL (AX), CX
	MOVL -4(AX)(R13*1), R14
	MOVL CX, (R10)
	MOVL R14, -4(R10)(R13*1)
	ADDQ R13, AX
	ADDQ R13, R10
	JMP copy_2_end

copy_2_move_8through16:
	MOVQ (AX), CX
	MOVQ -8(AX)(R13*1), R14
	MOVQ CX, (R10)
	MOVQ R14, -8(R10)(R13*1)
	ADDQ R13, AX
	ADDQ R13, R10

copy_2_end:
	JMP handle_loop

	// Copy overlapping match
copy_overlapping_match:
	ADDQ R13, R12

copy_slow_3:
	MOVB (AX), CL
	MOVB CL, (R10)
	INCQ AX
	INCQ R10
	DECQ R13
	JNZ copy_slow_3

handle_loop:
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS sequenceDecs_decodeSync_safe_amd64_main_loop

loop_finished:
	MOVQ br+8(FP), AX
	MOVQ DX, 24(AX)
	MOVB BL, 32(AX)
	MOVQ SI, 8(AX)

	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R12, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R11
	MOVQ R11, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET
| |
// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
//
// BMI2 variant: bit extraction uses BEXTR/BZHI/SHRX instead of the SHL/SHR
// sequences in the plain-AMD64 version. Register allocation also differs:
//   AX   - bit buffer (br.value)
//   DX   - bits already consumed from the buffer (br.bitsRead)
//   BX   - input bytes remaining (br.off)
//   (SP) - refill pointer into the input (br base + off)
//   SI   - literal-length FSE state (72(ctx))
//   DI   - match-length FSE state   (80(ctx))
//   R8   - offset FSE state         (88(ctx))
//   R9   - output write pointer (outBase + outPosition)
//   R10  - literals read pointer (144(ctx))
//   R11  - bytes written so far (outPosition, 136(ctx))
//   8(SP)/16(SP)/24(SP) - decoded offset / match length / literal length
//   32(SP) - past-end pointer of the output buffer
//   40(SP) - len(hist); 48(SP) - pointer past the end of the history buffer
//   56(SP) - 200(ctx); compared against the offset, so presumably s.windowSize — verify against gen.go
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
	MOVQ br+8(FP), BX
	MOVQ 24(BX), AX
	MOVBQZX 32(BX), DX
	MOVQ (BX), CX
	MOVQ 8(BX), BX
	ADDQ BX, CX
	MOVQ CX, (SP)
	MOVQ ctx+16(FP), CX
	MOVQ 72(CX), SI
	MOVQ 80(CX), DI
	MOVQ 88(CX), R8
	XORQ R9, R9
	MOVQ R9, 8(SP)
	MOVQ R9, 16(SP)
	MOVQ R9, 24(SP)
	MOVQ 112(CX), R9
	MOVQ 128(CX), R10
	MOVQ R10, 32(SP)
	MOVQ 144(CX), R10
	MOVQ 136(CX), R11
	MOVQ 200(CX), R12
	MOVQ R12, 56(SP)
	MOVQ 176(CX), R12
	MOVQ R12, 48(SP)
	MOVQ 184(CX), CX
	MOVQ CX, 40(SP)
	MOVQ 40(SP), CX
	ADDQ CX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R9, 32(SP)

	// outBase += outPosition
	ADDQ R11, R9

sequenceDecs_decodeSync_safe_bmi2_main_loop:
	MOVQ (SP), R12

	// Fill bitreader to have enough for the offset and match length.
	CMPQ BX, $0x08
	JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_end

sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
	CMPQ DX, $0x07
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
	SHLQ $0x08, AX
	SUBQ $0x01, R12
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R12), CX
	ORQ CX, AX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte

sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
	CMPQ DX, $0x40
	JA error_overread

sequenceDecs_decodeSync_safe_bmi2_fill_end:
	// Update offset
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	MOVQ CX, DX
	MOVQ R8, CX
	SHRQ $0x20, CX
	ADDQ R14, CX
	MOVQ CX, 8(SP)

	// Update match length
	MOVQ $0x00000808, CX
	BEXTRQ CX, DI, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	MOVQ CX, DX
	MOVQ DI, CX
	SHRQ $0x20, CX
	ADDQ R14, CX
	MOVQ CX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ BX, $0x08
	JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end

sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
	CMPQ DX, $0x07
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
	SHLQ $0x08, AX
	SUBQ $0x01, R12
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R12), CX
	ORQ CX, AX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte

sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
	CMPQ DX, $0x40
	JA error_overread

sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
	// Update literal length
	MOVQ $0x00000808, CX
	BEXTRQ CX, SI, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	MOVQ CX, DX
	MOVQ SI, CX
	SHRQ $0x20, CX
	ADDQ R14, CX
	MOVQ CX, 24(SP)

	// Fill bitreader for state updates
	MOVQ R12, (SP)
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R12
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
	LEAQ (SI)(DI*1), R13
	ADDQ R8, R13
	MOVBQZX R13, R13
	LEAQ (DX)(R13*1), CX
	MOVQ AX, R14
	MOVQ CX, DX
	ROLQ CL, R14
	BZHIQ R13, R14, R14

	// Update Offset State
	BZHIQ R8, R14, CX
	SHRXQ R8, R14, R14
	SHRL $0x10, R8
	ADDQ CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R14, CX
	SHRXQ DI, R14, R14
	SHRL $0x10, DI
	ADDQ CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R14, CX
	SHRL $0x10, SI
	ADDQ CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decodeSync_safe_bmi2_skip_update:
	// Adjust offset
	MOVQ s+0(FP), CX
	MOVQ 8(SP), R13
	CMPQ R12, $0x01
	JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0
	MOVQ R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust

sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
	CMPQ 24(SP), $0x00000000
	JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
	INCQ R13
	JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero

sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
	MOVQ 144(CX), R13
	JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust

	// Resolve a repeat offset: codes 1-3 index the previous-offset slots at
	// 144..160(s); the CMOV pair maps code 3 to prevOffset[0]-1 (R14 starts at
	// -1 in that case), and a zero result is replaced with 1 below.
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
	MOVQ R13, R12
	XORQ R14, R14
	MOVQ $-1, R15
	CMPQ R13, $0x03
	CMOVQEQ R14, R12
	CMOVQEQ R15, R14
	ADDQ 144(CX)(R12*8), R14
	JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
	MOVQ $0x00000001, R14

sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
	CMPQ R13, $0x01
	JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
	MOVQ 152(CX), R12
	MOVQ R12, 160(CX)

sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
	MOVQ 144(CX), R12
	MOVQ R12, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_safe_bmi2_after_adjust:
	MOVQ R13, 8(SP)

	// Check values
	MOVQ 16(SP), CX
	MOVQ 24(SP), R12
	LEAQ (CX)(R12*1), R14
	MOVQ s+0(FP), R15
	ADDQ R14, 256(R15)
	MOVQ ctx+16(FP), R14
	SUBQ R12, 104(R14)
	JS error_not_enough_literals
	CMPQ CX, $0x00020002
	JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
	TESTQ CX, CX
	JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
	MOVQ 24(SP), CX
	MOVQ 8(SP), R12
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (CX)(R13*1), R14
	ADDQ R9, R14
	CMPQ R14, 32(SP)
	JA error_not_enough_space

	// Copy literals
	TESTQ CX, CX
	JZ check_offset
	MOVQ CX, R14
	SUBQ $0x10, R14
	JB copy_1_small

copy_1_loop:
	MOVUPS (R10), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, R10
	ADDQ $0x10, R9
	SUBQ $0x10, R14
	JAE copy_1_loop
	LEAQ 16(R10)(R14*1), R10
	LEAQ 16(R9)(R14*1), R9
	MOVUPS -16(R10), X0
	MOVUPS X0, -16(R9)
	JMP copy_1_end

copy_1_small:
	CMPQ CX, $0x03
	JE copy_1_move_3
	JB copy_1_move_1or2
	CMPQ CX, $0x08
	JB copy_1_move_4through7
	JMP copy_1_move_8through16

copy_1_move_1or2:
	MOVB (R10), R14
	MOVB -1(R10)(CX*1), R15
	MOVB R14, (R9)
	MOVB R15, -1(R9)(CX*1)
	ADDQ CX, R10
	ADDQ CX, R9
	JMP copy_1_end

copy_1_move_3:
	MOVW (R10), R14
	MOVB 2(R10), R15
	MOVW R14, (R9)
	MOVB R15, 2(R9)
	ADDQ CX, R10
	ADDQ CX, R9
	JMP copy_1_end

copy_1_move_4through7:
	MOVL (R10), R14
	MOVL -4(R10)(CX*1), R15
	MOVL R14, (R9)
	MOVL R15, -4(R9)(CX*1)
	ADDQ CX, R10
	ADDQ CX, R9
	JMP copy_1_end

copy_1_move_8through16:
	MOVQ (R10), R14
	MOVQ -8(R10)(CX*1), R15
	MOVQ R14, (R9)
	MOVQ R15, -8(R9)(CX*1)
	ADDQ CX, R10
	ADDQ CX, R9

copy_1_end:
	ADDQ CX, R11

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ R11, CX
	ADDQ 40(SP), CX
	CMPQ R12, CX
	JG error_match_off_too_big
	CMPQ R12, 56(SP)
	JG error_match_off_too_big

	// Copy match from history
	MOVQ R12, CX
	SUBQ R11, CX
	JLS copy_match
	MOVQ 48(SP), R14
	SUBQ CX, R14
	CMPQ R13, CX
	JG copy_all_from_history
	MOVQ R13, CX
	SUBQ $0x10, CX
	JB copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, R14
	ADDQ $0x10, R9
	SUBQ $0x10, CX
	JAE copy_4_loop
	LEAQ 16(R14)(CX*1), R14
	LEAQ 16(R9)(CX*1), R9
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R9)
	JMP copy_4_end

copy_4_small:
	CMPQ R13, $0x03
	JE copy_4_move_3
	CMPQ R13, $0x08
	JB copy_4_move_4through7
	JMP copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), CX
	MOVB 2(R14), R12
	MOVW CX, (R9)
	MOVB R12, 2(R9)
	ADDQ R13, R14
	ADDQ R13, R9
	JMP copy_4_end

copy_4_move_4through7:
	MOVL (R14), CX
	MOVL -4(R14)(R13*1), R12
	MOVL CX, (R9)
	MOVL R12, -4(R9)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R9
	JMP copy_4_end

copy_4_move_8through16:
	MOVQ (R14), CX
	MOVQ -8(R14)(R13*1), R12
	MOVQ CX, (R9)
	MOVQ R12, -8(R9)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R9

copy_4_end:
	ADDQ R13, R11
	JMP handle_loop
	// unreachable: the preceding JMP is unconditional (generator artifact)
	JMP loop_finished

copy_all_from_history:
	MOVQ CX, R15
	SUBQ $0x10, R15
	JB copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, R14
	ADDQ $0x10, R9
	SUBQ $0x10, R15
	JAE copy_5_loop
	LEAQ 16(R14)(R15*1), R14
	LEAQ 16(R9)(R15*1), R9
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R9)
	JMP copy_5_end

copy_5_small:
	CMPQ CX, $0x03
	JE copy_5_move_3
	JB copy_5_move_1or2
	CMPQ CX, $0x08
	JB copy_5_move_4through7
	JMP copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(CX*1), BP
	MOVB R15, (R9)
	MOVB BP, -1(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (R9)
	MOVB BP, 2(R9)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(CX*1), BP
	MOVL R15, (R9)
	MOVL BP, -4(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(CX*1), BP
	MOVQ R15, (R9)
	MOVQ BP, -8(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9

copy_5_end:
	ADDQ CX, R11
	SUBQ CX, R13

	// Copy match from the current buffer
copy_match:
	MOVQ R9, CX
	SUBQ R12, CX

	// ml <= mo
	CMPQ R13, R12
	JA copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, R11
	MOVQ R13, R12
	SUBQ $0x10, R12
	JB copy_2_small

copy_2_loop:
	MOVUPS (CX), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, CX
	ADDQ $0x10, R9
	SUBQ $0x10, R12
	JAE copy_2_loop
	LEAQ 16(CX)(R12*1), CX
	LEAQ 16(R9)(R12*1), R9
	MOVUPS -16(CX), X0
	MOVUPS X0, -16(R9)
	JMP copy_2_end

copy_2_small:
	CMPQ R13, $0x03
	JE copy_2_move_3
	JB copy_2_move_1or2
	CMPQ R13, $0x08
	JB copy_2_move_4through7
	JMP copy_2_move_8through16

copy_2_move_1or2:
	MOVB (CX), R12
	MOVB -1(CX)(R13*1), R14
	MOVB R12, (R9)
	MOVB R14, -1(R9)(R13*1)
	ADDQ R13, CX
	ADDQ R13, R9
	JMP copy_2_end

copy_2_move_3:
	MOVW (CX), R12
	MOVB 2(CX), R14
	MOVW R12, (R9)
	MOVB R14, 2(R9)
	ADDQ R13, CX
	ADDQ R13, R9
	JMP copy_2_end

copy_2_move_4through7:
	MOVL (CX), R12
	MOVL -4(CX)(R13*1), R14
	MOVL R12, (R9)
	MOVL R14, -4(R9)(R13*1)
	ADDQ R13, CX
	ADDQ R13, R9
	JMP copy_2_end

copy_2_move_8through16:
	MOVQ (CX), R12
	MOVQ -8(CX)(R13*1), R14
	MOVQ R12, (R9)
	MOVQ R14, -8(R9)(R13*1)
	ADDQ R13, CX
	ADDQ R13, R9

copy_2_end:
	JMP handle_loop

	// Copy overlapping match
copy_overlapping_match:
	ADDQ R13, R11

copy_slow_3:
	MOVB (CX), R12
	MOVB R12, (R9)
	INCQ CX
	INCQ R9
	DECQ R13
	JNZ copy_slow_3

handle_loop:
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS sequenceDecs_decodeSync_safe_bmi2_main_loop

loop_finished:
	MOVQ br+8(FP), CX
	MOVQ AX, 24(CX)
	MOVB DL, 32(CX)
	MOVQ BX, 8(CX)

	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R11, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R10
	MOVQ R10, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET