Android NDK之使用 arm-v7a 彙編實現兩數之和
關鍵詞: NDK armv7a WebRTC arm彙編 CMake
最近適配對講程式,在編譯 WebRTC 庫的過程中,發現它為 ARM 平臺定製了彙編程式,用來最佳化平方根倒數演算法的速度。上次寫彙編還是 8086 的,藉此機會初步嘗試一下 Android 上的 ARM 彙編。
具體jni工程建立就不介紹了,Android Studio直接可以從模板建立
工程目錄如下
kryo@WSL1:/mnt/k/Android/NDK-Project/XXX/src/main$ tree
.
├── AndroidManifest.xml
├── cpp
│ ├── asm
│ │ ├── CMakeLists.txt
│ │ ├── asm_defines.h
│ │ ├── asm_jni.cpp
│ │ ├── asm_jni.h
│ │ ├── tow_sum_armv7a.S
│ │ └── tow_sum_cpp.cpp
└── java
└── com
└── kryo
├── asm
│ └── TowSumAsm.java
└── ...
1、C++介面編寫
asm_jni.h
#ifndef TOW_SUM_AMS_TEST_H
#define TOW_SUM_AMS_TEST_H

#include <jni.h>

// Exactly one brute-force two-sum backend is declared per build: the
// hand-written assembly routine when USE_ASM is defined, the C++ fallback
// otherwise. Both use C linkage so their symbol names are not mangled.
extern "C" {

#ifdef USE_ASM
/**
 * Assembly backend.
 *
 * @param data_in   input array of data_len elements
 * @param data_out  output array; receives two indices
 * @param data_len  number of elements in data_in
 * @param ret_len   extra argument that pushes target to the fifth position,
 *                  so that it is passed on the stack
 * @param target    sum to search for
 * @return intended to be 0 when a pair is found and -1 otherwise
 */
int32_t tow_sum_asm(int32_t *data_in, int32_t *data_out, int32_t data_len, int32_t ret_len,
                    int32_t target);
#else
/**
 * C++ backend.
 *
 * @param data_in   input array of data_len elements
 * @param data_out  output array; receives two indices
 * @param data_len  number of elements in data_in
 * @param target    sum to search for
 * @return 0 when a pair is found, -1 otherwise
 */
int32_t tow_sum_cpp(int32_t *data_in, int32_t *data_out, int32_t data_len, int32_t target);
#endif

}  // extern "C"

#endif  // TOW_SUM_AMS_TEST_H
這裡分別使用 asm 和 C++ 程式碼各自實現一個暴力搜尋版本的兩數之和介面。asm 版本特意設計成 5 個引數,是為了演示函式呼叫約定:在 armv7a(AAPCS)下,前 4 個引數依次透過暫存器 r0–r3 傳遞,從第 5 個引數開始透過棧傳遞。
2、彙編實現
寫彙編時我習慣先參考C程式碼去推導
tow_sum_cpp.cpp
#include "asm_jni.h"
/**
 * Brute-force two-sum: finds the first pair of indices i < j (scanning i, then
 * j, in ascending order) such that data_in[i] + data_in[j] equals target.
 *
 * The addition is performed with 32-bit wrap-around, i.e. the same result a
 * 32-bit hardware add (or Java int arithmetic) produces.
 *
 * @param data_in   input array holding data_len elements
 * @param data_out  output array with room for two elements; receives {i, j}
 *                  on success and {0, 0} when no pair exists
 * @param data_len  number of elements in data_in; values <= 1 never match
 * @param target    sum to search for
 * @return 0 when a pair was found, -1 otherwise
 */
extern "C" int32_t tow_sum_cpp(int32_t *data_in, int32_t *data_out, int32_t data_len,int32_t target) {
    // Signed overflow is undefined behaviour in C++, so add as unsigned: the
    // wrap-around is then well defined and bit-identical to a two's-complement add.
    const uint32_t wanted = static_cast<uint32_t>(target);

    for (int32_t i = 0; i < data_len; ++i) {
        const uint32_t first = static_cast<uint32_t>(data_in[i]);
        for (int32_t j = i + 1; j < data_len; ++j) {
            if (first + static_cast<uint32_t>(data_in[j]) == wanted) {
                data_out[0] = i;
                data_out[1] = j;
                return 0;
            }
        }
    }

    // No pair found: report the {0, 0} sentinel (never a valid pair, since j > i).
    data_out[0] = 0;
    data_out[1] = 0;
    return -1;
}
以下是具體彙編程式碼的實現,基本每行都給出了註釋
tow_sum_armv7a.S
@ tow_sum_asm: brute-force two-sum over an int32 array (ARMv7-A).
@
@ Input:
@   r0   = int32_t* data_in
@   r1   = int32_t* data_out   (receives two indices)
@   r2   = int32_t  data_len
@   r3   = int32_t  ret_len    (unused; only shifts target onto the stack)
@   [sp] = int32_t  target     (fifth argument, passed on the stack)
@ Output:
@   r0 =  0 and data_out = {i, j} when data_in[i] + data_in[j] == target
@   r0 = -1 and data_out = {0, 0} when no such pair exists
@
@ Register use:
@   r4: i index
@   r5: j index
@   r6: target
@   r7: data_in[i]
@   r8: data_in[j]
@   r9: sum scratch
#include "asm_defines.h"
@ The exported symbol must be tow_sum_asm: that is the name declared in
@ asm_jni.h and called from asm_jni.cpp.
GLOBAL_FUNCTION tow_sum_asm
.align 4
DEFINE_FUNCTION tow_sum_asm
    push {r4-r11}                @ save callee-saved registers (8 * 4 bytes)
    ldr r6, [sp, #32]            @ fifth argument sits just above the 8 saved registers
    mov r4, #0                   @ i = 0
LOOP_1:
    add r5, r4, #1               @ j = i + 1
    cmp r5, r2                   @ signed compare, so data_len <= 1 (or negative) is safe
    bge FAL                      @ no j left for this i, hence none for any later i
    ldr r7, [r0, r4, lsl #2]     @ r7 = data_in[i] (index scaled by 4 bytes)
LOOP_2:
    ldr r8, [r0, r5, lsl #2]     @ r8 = data_in[j]
    add r9, r7, r8               @ sum of the two candidates (32-bit wrap-around)
    cmp r9, r6                   @ compare with target
    beq SUC                      @ match found
    add r5, r5, #1               @ j++
    cmp r5, r2
    blt LOOP_2                   @ while j < data_len
    add r4, r4, #1               @ inner loop exhausted: i++
    b LOOP_1
SUC:
    str r4, [r1]                 @ data_out[0] = i
    str r5, [r1, #4]             @ data_out[1] = j
    mov r0, #0                   @ return 0
    b END
FAL:
    mov r4, #0
    str r4, [r1]                 @ data_out[0] = 0
    str r4, [r1, #4]             @ data_out[1] = 0
    mvn r0, #0                   @ return -1 (must not fall into SUC, which returns 0)
END:
    pop {r4-r11}                 @ restore callee-saved registers
    bx lr
3、JNI實現
asm_jni.cpp
#include "asm_jni.h"
#include <android/log.h>
#define TAG "ASM_TEST"
#define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, TAG, __VA_ARGS__)
#ifdef __cplusplus
extern "C" {
#endif
/**
 * JNI implementation of com.kryo.asm.TowSumAsm#towsum.
 *
 * Runs the two-sum backend selected at build time (assembly when USE_ASM is
 * defined, C++ otherwise) over the Java int array and returns the result as a
 * new two-element int array.
 *
 * @param env     JNI environment of the calling thread
 * @param thiz    the TowSumAsm instance (unused)
 * @param data    input array; a NullPointerException is raised when null
 * @param target  sum to search for
 * @return a new int[2] holding the two indices written by the backend, or
 *         null with a Java exception pending when an argument is null or a
 *         JNI allocation fails
 */
JNIEXPORT jintArray JNICALL
Java_com_kryo_asm_TowSumAsm_towsum(JNIEnv *env, jobject thiz, jintArray data, jint target) {
    (void) thiz;

    if (data == nullptr) {
        jclass npe = env->FindClass("java/lang/NullPointerException");
        if (npe != nullptr) {
            env->ThrowNew(npe, "data == null");
        }
        return nullptr;
    }

    jintArray r_array = env->NewIntArray(2);
    if (r_array == nullptr) {
        return nullptr;  // OutOfMemoryError is already pending
    }

    const jsize length = env->GetArrayLength(data);
    jint *elements_in = env->GetIntArrayElements(data, nullptr);
    if (elements_in == nullptr) {
        return nullptr;  // exception is already pending
    }

    // The backend writes the indices here; they are copied into r_array below,
    // which avoids pinning/copying the result array.
    jint indices[2] = {0, 0};
#ifdef USE_ASM
    LOGD("call tow_sum_asm !\n");
    tow_sum_asm(elements_in, indices, static_cast<int32_t>(length), 2,
                static_cast<int32_t>(target));
#else
    LOGD("call tow_sum_cpp !\n");
    tow_sum_cpp(elements_in, indices, static_cast<int32_t>(length),
                static_cast<int32_t>(target));
#endif

    // The input is only read, so release it without copying anything back.
    env->ReleaseIntArrayElements(data, elements_in, JNI_ABORT);
    env->SetIntArrayRegion(r_array, 0, 2, indices);
    return r_array;
}
#ifdef __cplusplus
}
#endif
TowSumAsm.java
/**
 * Java-side binding for the native two-sum search provided by the "asm"
 * shared library.
 */
public class TowSumAsm {
static {
// Load libasm.so so that the native method below can be resolved.
System.loadLibrary("asm");
}
/**
 * Searches {@code data} for two elements whose sum equals {@code target}.
 *
 * @param data   values to search
 * @param target sum to look for
 * @return a two-element array with the indices of the matching pair; both
 *         entries are 0 when the native search reports no match
 */
public native int[] towsum(int[] data, int target);
}
最後貼一下從webrtc開原始碼中copy來的asm_defines.h
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
// Helper macros for assembly sources that are run through the C preprocessor.
// They hide the platform differences in how function symbols are named,
// exported and called.
#ifndef KRYO_INCLUDE_ASM_DEFINES_H_
#define KRYO_INCLUDE_ASM_DEFINES_H_
// On Linux/ELF targets emit an empty .note.GNU-stack section, the conventional
// marker that this object does not need an executable stack.
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
// Define the macros used in ARM assembly code, so that for Mac or iOS builds
// we add leading underscores for the function names.
#ifdef __APPLE__
// GLOBAL_FUNCTION name: declare _name as a global, private-extern symbol.
.macro GLOBAL_FUNCTION name
.global _\name
.private_extern _\name
.endm
// DEFINE_FUNCTION name: emit the _name label that starts the function body.
.macro DEFINE_FUNCTION name
_\name:
.endm
// CALL_FUNCTION name: branch-with-link to _name.
.macro CALL_FUNCTION name
bl _\name
.endm
// GLOBAL_LABEL name: same directives as GLOBAL_FUNCTION, for non-function labels.
.macro GLOBAL_LABEL name
.global _\name
.private_extern _\name
.endm
#else
// GLOBAL_FUNCTION name: declare name as a global symbol with hidden visibility.
.macro GLOBAL_FUNCTION name
.global \name
.hidden \name
.endm
// DEFINE_FUNCTION name: emit the name label; on Linux/ELF also tag the symbol
// as a function.
.macro DEFINE_FUNCTION name
#if defined(__linux__) && defined(__ELF__)
.type \name,%function
#endif
\name:
.endm
// CALL_FUNCTION name: branch-with-link to name.
.macro CALL_FUNCTION name
bl \name
.endm
// GLOBAL_LABEL name: same directives as GLOBAL_FUNCTION, for non-function labels.
.macro GLOBAL_LABEL name
.global \name
.hidden \name
.endm
#endif
// With Apple's clang compiler, for instructions ldrb, strh, etc.,
// the condition code is after the width specifier. Here we define
// only the ones that are actually used in the assembly files.
#if (defined __llvm__) && (defined __APPLE__)
// streqh: spelled here with the condition first, expands to strheq.
.macro streqh reg1, reg2, num
strheq \reg1, \reg2, \num
.endm
#endif
// Everything that follows the include goes into the text section.
.text
#endif // KRYO_INCLUDE_ASM_DEFINES_H_
4、CMakeLists.txt編寫生成libasm.so
CMakeLists.txt
cmake_minimum_required(VERSION 3.10.2)
project("asm")
enable_language(ASM)  # required so that .S sources are assembled

# Select the two-sum backend for the target ABI. ANDROID_ABI is quoted so that
# an unset or empty value reaches the FATAL_ERROR branch instead of turning
# the if() into a syntax error.
if("${ANDROID_ABI}" STREQUAL "armeabi-v7a")
    # 32-bit ARM: use the hand-written assembly routine.
    add_library(asm SHARED
            asm_jni.cpp
            tow_sum_armv7a.S)
    # Scope the define to this target instead of the whole directory.
    target_compile_definitions(asm PRIVATE USE_ASM)
elseif("${ANDROID_ABI}" STREQUAL "arm64-v8a")
    # 64-bit ARM: no assembly version exists, fall back to the C++ routine.
    add_library(asm SHARED
            asm_jni.cpp
            tow_sum_cpp.cpp)
else()
    message(FATAL_ERROR "Unsupported ABI: ${ANDROID_ABI}")
endif()

# liblog provides __android_log_print used by asm_jni.cpp.
target_link_libraries(asm
        log)
5、執行測試
// Look for two elements of {1, 3, 5, 7, 9} that add up to 12 and log the
// indices returned by the native implementation.
TowSumAsm towSumAsm = new TowSumAsm();
int[] result = towSumAsm.towsum(new int[]{1, 3, 5, 7, 9}, 12);
Log.d(TAG, "result " + result[0] + " " + result[1]);
2024-04-05 10:24:29.269 19863-19863 ASM_TEST com.kryo.demo D call tow_sum_asm !
2024-04-05 10:24:29.269 19863-19863 JNI_Activity com.kryo.demo D result 1 4
Reference
- VS Code 上ARM指令集參考文件外掛: 32位Code4Leg ,64位ARM A64 Instruction Reference
- Android-Audio-Processing-Using-WebRTC spl_sqrt_floor_arm.S