[ 3.8.y.z extended stable ] Patch "ARM: 7626/1: arm/crypto: Make asm SHA-1 and AES code Thumb-2" has been added to staging queue
Kamal Mostafa
kamal at canonical.com
Thu May 30 20:34:53 UTC 2013
This is a note to let you know that I have just added a patch titled
ARM: 7626/1: arm/crypto: Make asm SHA-1 and AES code Thumb-2
to the linux-3.8.y-queue branch of the 3.8.y.z extended stable tree
which can be found at:
http://kernel.ubuntu.com/git?p=ubuntu/linux.git;a=shortlog;h=refs/heads/linux-3.8.y-queue
This patch is scheduled to be released in version 3.8.13.2.
If you, or anyone else, feels it should not be added to this tree, please
reply to this email.
For more information about the 3.8.y.z tree, see
https://wiki.ubuntu.com/Kernel/Dev/ExtendedStable
Thanks.
-Kamal
------
>From 6b068c099954ca74b19aacda8fedc9e3f26687dc Mon Sep 17 00:00:00 2001
From: Dave Martin <dave.martin at linaro.org>
Date: Thu, 10 Jan 2013 12:20:15 +0100
Subject: ARM: 7626/1: arm/crypto: Make asm SHA-1 and AES code Thumb-2
compatible
commit 638591cd7b601d403ed703d55062b48c32ea8cfb upstream.
This patch fixes aes-armv4.S and sha1-armv4-large.S to work
natively in Thumb. This allows ARM/Thumb interworking workarounds
to be removed.
I also take the opportunity to convert some explicit assembler
directives for exported functions to the standard
ENTRY()/ENDPROC().
For the code itself:
* In sha1_block_data_order, use of TEQ with sp is deprecated in
ARMv7 and not supported in Thumb. For the branches back to
.L_00_15 and .L_40_59, the TEQ is converted to a CMP, under the
assumption that clobbering the C flag here will not cause
incorrect behaviour.
For the first branch back to .L_20_39_or_60_79 the C flag is
important, so sp is moved temporarily into another register so
that TEQ can be used for the comparison.
* In the AES code, most forms of register-indexed addressing with
shifts and rotates are not permitted for loads and stores in
Thumb, so the address calculation is done using a separate
instruction for the Thumb case.
The resulting code is unlikely to be optimally scheduled, but it
should not have a large impact given the overall size of the code.
I haven't run any benchmarks.
Signed-off-by: Dave Martin <dave.martin at linaro.org>
Tested-by: David McCullough <ucdevel at gmail.com> (ARM only)
Acked-by: David McCullough <ucdevel at gmail.com>
Acked-by: Nicolas Pitre <nico at linaro.org>
Signed-off-by: Russell King <rmk+kernel at arm.linux.org.uk>
Signed-off-by: Kamal Mostafa <kamal at canonical.com>
---
arch/arm/crypto/aes-armv4.S | 64 ++++++++++++--------------------------
arch/arm/crypto/sha1-armv4-large.S | 24 ++++++--------
2 files changed, 29 insertions(+), 59 deletions(-)
diff --git a/arch/arm/crypto/aes-armv4.S b/arch/arm/crypto/aes-armv4.S
index e59b1d5..19d6cd6 100644
--- a/arch/arm/crypto/aes-armv4.S
+++ b/arch/arm/crypto/aes-armv4.S
@@ -34,8 +34,9 @@
@ A little glue here to select the correct code below for the ARM CPU
@ that is being targetted.
+#include <linux/linkage.h>
+
.text
-.code 32
.type AES_Te,%object
.align 5
@@ -145,10 +146,8 @@ AES_Te:
@ void AES_encrypt(const unsigned char *in, unsigned char *out,
@ const AES_KEY *key) {
-.global AES_encrypt
-.type AES_encrypt,%function
.align 5
-AES_encrypt:
+ENTRY(AES_encrypt)
sub r3,pc,#8 @ AES_encrypt
stmdb sp!,{r1,r4-r12,lr}
mov r12,r0 @ inp
@@ -239,15 +238,8 @@ AES_encrypt:
strb r6,[r12,#14]
strb r3,[r12,#15]
#endif
-#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
-#else
- ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-.size AES_encrypt,.-AES_encrypt
+ENDPROC(AES_encrypt)
.type _armv4_AES_encrypt,%function
.align 2
@@ -386,10 +378,8 @@ _armv4_AES_encrypt:
ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
-.global private_AES_set_encrypt_key
-.type private_AES_set_encrypt_key,%function
.align 5
-private_AES_set_encrypt_key:
+ENTRY(private_AES_set_encrypt_key)
_armv4_AES_set_encrypt_key:
sub r3,pc,#8 @ AES_set_encrypt_key
teq r0,#0
@@ -658,15 +648,11 @@ _armv4_AES_set_encrypt_key:
.Ldone: mov r0,#0
ldmia sp!,{r4-r12,lr}
-.Labrt: tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
+.Labrt: mov pc,lr
+ENDPROC(private_AES_set_encrypt_key)
-.global private_AES_set_decrypt_key
-.type private_AES_set_decrypt_key,%function
.align 5
-private_AES_set_decrypt_key:
+ENTRY(private_AES_set_decrypt_key)
str lr,[sp,#-4]! @ push lr
#if 0
@ kernel does both of these in setkey so optimise this bit out by
@@ -748,15 +734,8 @@ private_AES_set_decrypt_key:
bne .Lmix
mov r0,#0
-#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
-#else
- ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+ENDPROC(private_AES_set_decrypt_key)
.type AES_Td,%object
.align 5
@@ -862,10 +841,8 @@ AES_Td:
@ void AES_decrypt(const unsigned char *in, unsigned char *out,
@ const AES_KEY *key) {
-.global AES_decrypt
-.type AES_decrypt,%function
.align 5
-AES_decrypt:
+ENTRY(AES_decrypt)
sub r3,pc,#8 @ AES_decrypt
stmdb sp!,{r1,r4-r12,lr}
mov r12,r0 @ inp
@@ -956,15 +933,8 @@ AES_decrypt:
strb r6,[r12,#14]
strb r3,[r12,#15]
#endif
-#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
-#else
- ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-.size AES_decrypt,.-AES_decrypt
+ENDPROC(AES_decrypt)
.type _armv4_AES_decrypt,%function
.align 2
@@ -1064,7 +1034,9 @@ _armv4_AES_decrypt:
and r9,lr,r1,lsr#8
ldrb r7,[r10,r7] @ Td4[s1>>0]
- ldrb r1,[r10,r1,lsr#24] @ Td4[s1>>24]
+ ARM( ldrb r1,[r10,r1,lsr#24] ) @ Td4[s1>>24]
+ THUMB( add r1,r10,r1,lsr#24 ) @ Td4[s1>>24]
+ THUMB( ldrb r1,[r1] )
ldrb r8,[r10,r8] @ Td4[s1>>16]
eor r0,r7,r0,lsl#24
ldrb r9,[r10,r9] @ Td4[s1>>8]
@@ -1077,7 +1049,9 @@ _armv4_AES_decrypt:
ldrb r8,[r10,r8] @ Td4[s2>>0]
and r9,lr,r2,lsr#16
- ldrb r2,[r10,r2,lsr#24] @ Td4[s2>>24]
+ ARM( ldrb r2,[r10,r2,lsr#24] ) @ Td4[s2>>24]
+ THUMB( add r2,r10,r2,lsr#24 ) @ Td4[s2>>24]
+ THUMB( ldrb r2,[r2] )
eor r0,r0,r7,lsl#8
ldrb r9,[r10,r9] @ Td4[s2>>16]
eor r1,r8,r1,lsl#16
@@ -1090,7 +1064,9 @@ _armv4_AES_decrypt:
and r9,lr,r3 @ i2
ldrb r9,[r10,r9] @ Td4[s3>>0]
- ldrb r3,[r10,r3,lsr#24] @ Td4[s3>>24]
+ ARM( ldrb r3,[r10,r3,lsr#24] ) @ Td4[s3>>24]
+ THUMB( add r3,r10,r3,lsr#24 ) @ Td4[s3>>24]
+ THUMB( ldrb r3,[r3] )
eor r0,r0,r7,lsl#16
ldr r7,[r11,#0]
eor r1,r1,r8,lsl#8
diff --git a/arch/arm/crypto/sha1-armv4-large.S b/arch/arm/crypto/sha1-armv4-large.S
index 7050ab1..92c6eed 100644
--- a/arch/arm/crypto/sha1-armv4-large.S
+++ b/arch/arm/crypto/sha1-armv4-large.S
@@ -51,13 +51,12 @@
@ Profiler-assisted and platform-specific optimization resulted in 10%
@ improvement on Cortex A8 core and 12.2 cycles per byte.
-.text
+#include <linux/linkage.h>
-.global sha1_block_data_order
-.type sha1_block_data_order,%function
+.text
.align 2
-sha1_block_data_order:
+ENTRY(sha1_block_data_order)
stmdb sp!,{r4-r12,lr}
add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
ldmia r0,{r3,r4,r5,r6,r7}
@@ -194,7 +193,7 @@ sha1_block_data_order:
eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r3,r3,r10 @ E+=F_00_19(B,C,D)
- teq r14,sp
+ cmp r14,sp
bne .L_00_15 @ [((11+4)*5+2)*3]
#if __ARM_ARCH__<7
ldrb r10,[r1,#2]
@@ -374,7 +373,9 @@ sha1_block_data_order:
@ F_xx_xx
add r3,r3,r9 @ E+=X[i]
add r3,r3,r10 @ E+=F_20_39(B,C,D)
- teq r14,sp @ preserve carry
+ ARM( teq r14,sp ) @ preserve carry
+ THUMB( mov r11,sp )
+ THUMB( teq r14,r11 ) @ preserve carry
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
@@ -466,7 +467,7 @@ sha1_block_data_order:
add r3,r3,r9 @ E+=X[i]
add r3,r3,r10 @ E+=F_40_59(B,C,D)
add r3,r3,r11,ror#2
- teq r14,sp
+ cmp r14,sp
bne .L_40_59 @ [+((12+5)*5+2)*4]
ldr r8,.LK_60_79
@@ -485,19 +486,12 @@ sha1_block_data_order:
teq r1,r2
bne .Lloop @ [+18], total 1307
-#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
-#else
- ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
.align 2
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
-.size sha1_block_data_order,.-sha1_block_data_order
+ENDPROC(sha1_block_data_order)
.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro at openssl.org>"
.align 2
--
1.8.1.2
More information about the kernel-team
mailing list