参考这个帖子,我想编写一个将 Android YUV_420_888 格式转换为 NV21 的方法。虽然来自 camera2 API 的图像默认就是 NV21 排布,但这里需要一个更通用的实现。其内容如下:
/**
 * Holds an image in NV21 layout: a packed luma (Y) plane followed by an
 * interleaved V/U chroma plane. Both arrays are caller-visible and filled
 * by cvtYUV420ToNV21 / the JNI helpers below.
 */
class NV21Image{
    /** Luma plane; must hold width*height bytes. */
    public byte[] y;
    /** Interleaved V,U bytes (NV21 order); must hold width*height/2 bytes. */
    public byte[] uv;

    /** Preserves the original no-arg construction path (caller allocates planes). */
    public NV21Image() {}

    /**
     * Convenience constructor that allocates correctly sized planes.
     *
     * @param width  image width in pixels (expected even for 4:2:0 subsampling)
     * @param height image height in pixels (expected even)
     */
    public NV21Image(int width, int height) {
        y = new byte[width * height];
        uv = new byte[width * height / 2];
    }
}
/**
 * Copies a camera2 YUV_420_888 Image into NV21 layout (packed Y plane, then
 * interleaved V/U bytes). Handles arbitrary row strides and U/V pixel strides,
 * as reported by the Image planes.
 *
 * Preconditions (checked by the asserts below): the Y plane is tightly packed
 * per pixel, and the U and V planes share the same row and pixel strides.
 *
 * @param image source image; planes are ordered Y, U, V
 * @param nv21  destination; nv21.y must hold width*height bytes and nv21.uv
 *              width*height/2 bytes (caller-allocated)
 */
public static void cvtYUV420ToNV21(Image image, NV21Image nv21) {
int width = image.getWidth();
int height = image.getHeight();
int ySize = width*height;
ByteBuffer yBuffer = image.getPlanes()[0].getBuffer(); // Y
ByteBuffer uBuffer = image.getPlanes()[1].getBuffer(); // U
ByteBuffer vBuffer = image.getPlanes()[2].getBuffer(); // V
int yRowStride = image.getPlanes()[0].getRowStride();
int vRowStride = image.getPlanes()[2].getRowStride();
int pixelStride = image.getPlanes()[2].getPixelStride();
// The copy loops below rely on these layout guarantees.
assert(image.getPlanes()[0].getPixelStride() == 1);
assert(image.getPlanes()[2].getRowStride() == image.getPlanes()[1].getRowStride());
assert(image.getPlanes()[2].getPixelStride() == image.getPlanes()[1].getPixelStride());
int pos = 0;
int yBufferPos = -yRowStride; // not an actual position
// Y plane: bulk-copy one row at a time, skipping any per-row padding.
for (; pos<ySize; pos+=width) {
yBufferPos += yRowStride;
yBuffer.position(yBufferPos);
yBuffer.get(nv21.y, pos, width);
}
pos = 0;
// Chroma: interleave V then U (NV21 order) one sample pair at a time.
// NOTE(review): the column bound is vRowStride/pixelStride, not width/2 —
// presumably to tolerate padded rows; confirm nv21.uv is sized accordingly.
for (int row=0; row<height/2; row++) {
for (int col=0; col<vRowStride / pixelStride; col++) {
int vuPos = col*pixelStride + row * vRowStride;
nv21.uv[pos++] = vBuffer.get(vuPos);
nv21.uv[pos++] = uBuffer.get(vuPos);
}
}
}上面的代码可以正常工作,我的实时摄像头预览应用程序(在 Snapdragon 865 CPU 上,720p 每帧大约 12 ms)运行正常,所以我尝试用 JNI 实现来加速它,以便从字节访问和性能优势中获利:
/**
 * Fills the NV21 arrays (yArr / uvArr) from the three direct ByteBuffers of a
 * YUV_420_888 image.
 *
 * Fix vs. the naive version: the inner chroma loop previously issued one
 * SetByteArrayRegion JNI call PER BYTE, which dominated the runtime. We now
 * assemble each interleaved V/U row in a native scratch buffer and publish it
 * with a single JNI call per row.
 *
 * @param yArr/uvArr   destination Java arrays (y: w*h bytes, uv: w*h/2 bytes)
 * @param yBuf/uBuf/vBuf direct ByteBuffers of planes 0/1/2
 * @param yRowStride   row stride of the Y plane (Y pixel stride assumed 1)
 * @param vRowStride   shared row stride of the U/V planes
 * @param vPixelStride shared pixel stride of the U/V planes
 */
JNIEXPORT void JNICALL
Java_com_example_Utils_nFillYUVArray(JNIEnv *env, jclass clazz, jbyteArray yArr, jbyteArray uvArr,
                                     jobject yBuf, jobject uBuf, jobject vBuf,
                                     jint yRowStride, jint vRowStride, jint vPixelStride, jint w, jint h) {
    auto ySrcPtr = (jbyte const*)env->GetDirectBufferAddress(yBuf);
    auto uSrcPtr = (jbyte const*)env->GetDirectBufferAddress(uBuf);
    auto vSrcPtr = (jbyte const*)env->GetDirectBufferAddress(vBuf);
    // GetDirectBufferAddress returns null for non-direct buffers; bail out
    // instead of dereferencing null below.
    if (ySrcPtr == nullptr || uSrcPtr == nullptr || vSrcPtr == nullptr) {
        return;
    }
    // Y plane: one bulk copy per row skips any row padding.
    for (int row = 0; row < h; row++) {
        env->SetByteArrayRegion(yArr, row * w, w, ySrcPtr + row * yRowStride);
    }
    // Chroma: build each interleaved V,U row natively, then copy it in one call.
    jbyte *line = new jbyte[w];
    for (int row = 0; row < h / 2; row++) {
        int pos = 0;
        for (int col = 0; col < w / 2; col++) {
            // Shared offset works because U and V share row/pixel strides.
            int vuPos = col * vPixelStride + row * vRowStride;
            line[pos++] = vSrcPtr[vuPos]; // V first (NV21)
            line[pos++] = uSrcPtr[vuPos];
        }
        env->SetByteArrayRegion(uvArr, row * w, w, line);
    }
    delete[] line;
}然而,结果比我预期的更糟(每帧大约 107 毫秒),其中最耗时的部分是 UV 缓冲区的交错内存复制。
所以,我的问题是,是否有加速的方法,以及如何解决?
更新
我成功地加速了它(检查我的回答),当U,V平面的pixelStrides都是1或2时,我相信在大多数情况下都是这样的。
发布于 2022-12-03 04:45:16
正如@snachmsm所说,libyuv可能会有所帮助。我找到了一个可用的API I420ToNV21,但它不能接收pixelStride参数,因为YUV_420_888并不保证U,V平面中相邻像素之间不存在空白。
当 pixelStride 为 2 时(降到每帧约 2.7 ms),我成功地使用 ARM NEON 内在函数(intrinsics)加速了它:
/**
 * NEON-accelerated NV21 fill from a YUV_420_888 image's direct ByteBuffers.
 * Supports the two chroma pixel strides seen in practice (1 and 2); other
 * strides leave uvArr untouched.
 *
 * Review fixes vs. the posted version:
 *  - pixelStride == 1 path: the destination offset reused the SOURCE offset
 *    (colOff) although each group turns 8 V + 8 U bytes into 16 output bytes,
 *    so rows were half-filled and overlapping. The store now advances at
 *    2*colOff.
 *  - pixelStride == 1 path: vmovl_s8 sign-extends, so chroma bytes >= 0x80
 *    OR'd 0xFF into the partner byte. vzip_s8 interleaves bytes verbatim with
 *    no widening, avoiding the corruption entirely.
 *  - stride-2 mask is built with vdup (byte pattern FF,00,FF,00,...) instead
 *    of a heap allocation + memset.
 * NOTE(review): both paths assume width is a multiple of 16; trailing columns
 * beyond the last full group are not produced — confirm against callers.
 */
JNIEXPORT void JNICALL
Java_com_example_Utils_nFillYUVArray(JNIEnv *env, jclass clazz, jbyteArray yArr, jbyteArray uvArr,
                                     jobject yBuf, jobject uBuf, jobject vBuf,
                                     jint yRowStride, jint vRowStride, jint uRowStride, jint pixelStride,
                                     jint width, jint height) {
    auto ySrcPtr = (jbyte const*)env->GetDirectBufferAddress(yBuf);
    auto uSrcPtr = (jbyte const*)env->GetDirectBufferAddress(uBuf);
    auto vSrcPtr = (jbyte const*)env->GetDirectBufferAddress(vBuf);
    // Y plane: one bulk copy per row (Y pixel stride assumed to be 1).
    for (int row = 0; row < height; row++) {
        env->SetByteArrayRegion(yArr, row * width, width, ySrcPtr + row * yRowStride);
    }
    constexpr int kStride = 8;              // bytes per 64-bit NEON load
    const size_t nGroups = width / kStride;
    if (pixelStride == 2) {
        int8_t *line = (int8_t*)alignedAlloc(width, 64);
        // 0x00FF per 16-bit lane = little-endian bytes FF,00,FF,00,...:
        // keeps each even (sample) byte, clears the stride gap byte.
        const int8x8_t vm = vreinterpret_s8_u16(vdup_n_u16(0x00FFu));
        for (int row = 0; row < height / 2; row++) {
            size_t vrowOff = row * vRowStride;
            size_t urowOff = row * uRowStride;
            for (int g = 0; g < (int)nGroups; g++) {
                size_t colOff = g * kStride;
                int8x8_t v0 = vld1_s8(vSrcPtr + vrowOff + colOff);
                int8x8_t v1 = vld1_s8(uSrcPtr + urowOff + colOff);
                int8x8_t a0 = vand_s8(v0, vm);                        // V at even bytes
                int16x4_t b1 = vreinterpret_s16_s8(vand_s8(v1, vm));  // U in each lane's low byte
                int8x8_t a1 = vreinterpret_s8_s16(vshl_n_s16(b1, 8)); // U moved to odd bytes
                int8x8_t r = vorr_s8(a0, a1);                         // V,U,V,U,...
                vst1_s8(line + colOff, r);
            }
            env->SetByteArrayRegion(uvArr, row * width, width, line);
        }
        free(line);
    } else if (pixelStride == 1) {
        int8_t *line = (int8_t*)alignedAlloc(width, 64);
        for (int row = 0; row < height / 2; row++) {
            size_t vrowOff = row * vRowStride;
            size_t urowOff = row * uRowStride;
            for (int g = 0; g < (int)(nGroups / 2); g++) {
                size_t colOff = g * kStride;  // offset into the tightly packed V/U planes
                int8x8_t v = vld1_s8(vSrcPtr + vrowOff + colOff);
                int8x8_t u = vld1_s8(uSrcPtr + urowOff + colOff);
                // Byte-wise interleave: val[0] = v0,u0,...,v3,u3; val[1] = v4,u4,...,v7,u7.
                int8x8x2_t vu = vzip_s8(v, u);
                // 8 V + 8 U input bytes yield 16 output bytes, so the
                // destination advances at twice the source offset.
                vst1_s8(line + 2 * colOff, vu.val[0]);
                vst1_s8(line + 2 * colOff + kStride, vu.val[1]);
            }
            env->SetByteArrayRegion(uvArr, row * width, width, line);
        }
        free(line);
    }
}pixelStride == 1 的分支尚未经过充分测试,但我相信它会按预期工作。
https://stackoverflow.com/questions/74653288
复制相似问题