Don't use LUT for YUV conversion.

Current C version is just as fast, and doesn't thrash the cache.
~100us per 640x480 webcam frame.
This commit is contained in:
Themaister 2013-11-20 15:31:56 +01:00
parent 9d60b53630
commit c4176564c5
5 changed files with 96 additions and 136 deletions

View file

@ -19,9 +19,11 @@
#include <malloc.h>
#include <string.h>
#include <assert.h>
#include <stddef.h>
#include "../driver.h"
#include "../performance.h"
#include "../miscellaneous.h"
#include "../gfx/scaler/scaler.h"
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
@ -49,102 +51,21 @@ typedef struct video4linux
unsigned n_buffers;
size_t width;
size_t height;
size_t pitch;
uint32_t *YCbCr_to_RGB;
struct scaler_ctx scaler;
uint32_t *buffer_output;
bool ready;
char dev_name[PATH_MAX];
} video4linux_t;
// FIXME: Shouldn't use LUTs for this.
// The LUT is simply too big, and the conversion can be done efficiently with fixed-point SIMD anyways.
/*
* YCbCr to RGB lookup table
* Y, Cb, Cr range is 0-255
*
* Stored value bits:
* 24-16 Red
* 15-8 Green
* 7-0 Blue
*/
#define YUV_SHIFT(y, cb, cr) ((y << 16) | (cb << 8) | (cr << 0))
#define RGB_SHIFT(r, g, b) ((r << 16) | (g << 8) | (b << 0))
static uint32_t *generate_YCbCr_to_RGB_lookup(void)
static void process_image(video4linux_t *v4l, const uint8_t *buffer_yuv)
{
int y;
int cb;
int cr;
uint32_t *buffer = (uint32_t*)malloc(256 * 256 * 256 * sizeof(uint32_t));
if (!buffer)
return NULL;
for (y = 0; y < 256; y++)
{
for (cb = 0; cb < 256; cb++)
{
for (cr = 0; cr < 256; cr++)
{
double Y = (double)y;
double Cb = (double)cb;
double Cr = (double)cr;
int R = (int)(Y + 1.40200 * (Cr - 0x80));
int G = (int)(Y - 0.34414 * (Cb - 0x80) - 0.71414 * (Cr - 0x80));
int B = (int)(Y + 1.77200 * (Cb - 0x80));
R = max(0, min(255, R));
G = max(0, min(255, G));
B = max(0, min(255, B));
buffer[YUV_SHIFT(y, cb, cr)] = RGB_SHIFT(R, G, B);
}
}
}
return buffer;
}
/**
* Converts YUV422 to RGB
* Before first use call generate_YCbCr_to_RGB_lookup();
*
* input is pointer to YUV422 encoded data in following order: Y0, Cb, Y1, Cr.
* output is pointer to 24 bit RGB buffer.
* Output data is written in following order: R1, G1, B1, R2, G2, B2.
*/
// FIXME: Software CPU color conversion from YUV to RGB - we'll make two codepaths
// eventually - GL binding to texture and color conversion through shaders,
// and this approach
static inline void YUV422_to_RGB(uint32_t *output, const uint8_t *input, const uint32_t *lut)
{
uint8_t y0 = input[0];
uint8_t cb = input[1];
uint8_t y1 = input[2];
uint8_t cr = input[3];
output[0] = lut[YUV_SHIFT(y0, cb, cr)];
output[1] = lut[YUV_SHIFT(y1, cb, cr)];
}
static void process_image(void *data, const uint8_t *buffer_yuv)
{
RARCH_PERFORMANCE_INIT(yuv_convert);
RARCH_PERFORMANCE_START(yuv_convert);
video4linux_t *v4l = (video4linux_t*)data;
const uint32_t *lut = v4l->YCbCr_to_RGB;
uint32_t *dst = v4l->buffer_output;
size_t x, y;
for (y = 0; y < v4l->height; y++, dst += v4l->width, buffer_yuv += v4l->width * 2)
for (x = 0; x < v4l->width; x += 2)
YUV422_to_RGB(dst + x, buffer_yuv + x * 2, lut);
RARCH_PERFORMANCE_STOP(yuv_convert);
RARCH_PERFORMANCE_INIT(yuv_convert_direct);
RARCH_PERFORMANCE_START(yuv_convert_direct);
scaler_ctx_scale(&v4l->scaler, v4l->buffer_output, buffer_yuv);
RARCH_PERFORMANCE_STOP(yuv_convert_direct);
}
static int xioctl(int fd, int request, void *args)
@ -239,7 +160,7 @@ static bool init_device(void *data)
unsigned min;
video4linux_t *v4l = (video4linux_t*)data;
if (xioctl(v4l->fd, VIDIOC_QUERYCAP, &cap) == -1)
if (xioctl(v4l->fd, VIDIOC_QUERYCAP, &cap) < 0)
{
if (errno == EINVAL)
{
@ -265,29 +186,15 @@ static bool init_device(void *data)
return false;
}
/* Select video input, video standard and tune here. */
memset(&cropcap, 0, sizeof(cropcap));
cropcap.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
if (xioctl(v4l->fd, VIDIOC_CROPCAP, &cropcap) == 0)
{
crop.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
crop.c = cropcap.defrect; /* reset to default */
if (xioctl(v4l->fd, VIDIOC_S_CROP, &crop) == -1)
{
switch (errno)
{
case EINVAL:
/* Cropping not supported. */
break;
default:
/* Errors ignored. */
break;
}
}
crop.c = cropcap.defrect;
// Ignore errors here.
xioctl(v4l->fd, VIDIOC_S_CROP, &crop);
}
memset(&fmt, 0, sizeof(fmt));
@ -295,41 +202,42 @@ static bool init_device(void *data)
fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
fmt.fmt.pix.width = v4l->width;
fmt.fmt.pix.height = v4l->height;
// TODO: See if we can use a saner format here.
fmt.fmt.pix.pixelformat = V4L2_PIX_FMT_YUYV;
fmt.fmt.pix.field = V4L2_FIELD_INTERLACED;
fmt.fmt.pix.field = V4L2_FIELD_NONE;
if (xioctl(v4l->fd, VIDIOC_S_FMT, &fmt) == -1)
if (xioctl(v4l->fd, VIDIOC_S_FMT, &fmt) < 0)
{
RARCH_ERR("Error - VIDIOC_S_FMT\n");
return false;
}
/* Note VIDIOC_S_FMT may change width and height. */
// VIDIOC_S_FMT may change width, height and pitch.
v4l->width = fmt.fmt.pix.width;
v4l->height = fmt.fmt.pix.height;
v4l->pitch = max(fmt.fmt.pix.bytesperline, v4l->width * 2);
/* Buggy driver paranoia. */
min = fmt.fmt.pix.width * 2;
if (fmt.fmt.pix.bytesperline < min)
fmt.fmt.pix.bytesperline = min;
min = fmt.fmt.pix.bytesperline * fmt.fmt.pix.height;
if (fmt.fmt.pix.sizeimage < min)
fmt.fmt.pix.sizeimage = min;
// Sanity check to see if our assumptions are met.
// It is possible to support whatever the device gives us,
// but this dramatically increases complexity.
if (fmt.fmt.pix.pixelformat != V4L2_PIX_FMT_YUYV)
{
RARCH_ERR("The V4L2 device doesn't support YUYV.\n");
return false;
}
if (fmt.fmt.pix.width != v4l->width)
v4l->width = fmt.fmt.pix.width;
if (fmt.fmt.pix.height != v4l->height)
v4l->height = fmt.fmt.pix.height;
if (fmt.fmt.pix.field != V4L2_FIELD_NONE && fmt.fmt.pix.field != V4L2_FIELD_INTERLACED)
{
RARCH_ERR("The V4L2 device doesn't support progressive nor interlaced video.\n");
return false;
}
return init_mmap(v4l);
}
static void v4l_stop(void *data)
{
enum v4l2_buf_type type;
video4linux_t *v4l = (video4linux_t*)data;
type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
enum v4l2_buf_type type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
if (xioctl(v4l->fd, VIDIOC_STREAMOFF, &type) == -1)
RARCH_ERR("Error - VIDIOC_STREAMOFF.\n");
@ -385,9 +293,8 @@ static void v4l_free(void *data)
if (v4l->fd >= 0)
close(v4l->fd);
free(v4l->YCbCr_to_RGB);
free(v4l->buffer_output);
scaler_ctx_gen_reset(&v4l->scaler);
free(v4l);
}
@ -441,10 +348,16 @@ static void *v4l_init(const char *device, uint64_t caps, unsigned width, unsigne
goto error;
}
v4l->YCbCr_to_RGB = generate_YCbCr_to_RGB_lookup();
if (!v4l->YCbCr_to_RGB)
v4l->scaler.in_width = v4l->scaler.out_width = v4l->width;
v4l->scaler.in_height = v4l->scaler.out_height = v4l->height;
v4l->scaler.in_fmt = SCALER_FMT_YUYV;
v4l->scaler.out_fmt = SCALER_FMT_ARGB8888;
v4l->scaler.in_stride = v4l->pitch;
v4l->scaler.out_stride = v4l->width * 4;
if (!scaler_ctx_gen_filter(&v4l->scaler))
{
RARCH_ERR("Failed to create YUV->RGB LUT.\n");
RARCH_ERR("Failed to create scaler.\n");
goto error;
}
@ -473,11 +386,6 @@ static bool preprocess_image(void *data)
{
case EAGAIN:
return false;
case EIO:
/* Could ignore EIO, see spec. */
/* fall through */
default:
RARCH_ERR("VIDIOC_DQBUF.\n");
return false;
@ -521,3 +429,4 @@ const camera_driver_t camera_v4l2 = {
v4l_poll,
"video4linux2",
};

View file

@ -672,6 +672,50 @@ void conv_argb8888_abgr8888(void *output_, const void *input_,
}
}
/* Saturates a signed integer to the unsigned 8-bit range [0, 255]. */
static inline uint8_t clamp_8bit(int val)
{
   /* Negative values collapse to 0, values above 255 collapse to 255,
    * anything in between is passed through unchanged. */
   int bounded = (val < 0) ? 0 : val;

   if (bounded > 255)
      bounded = 255;

   return (uint8_t)bounded;
}
/**
 * Converts a packed YUYV (YUV 4:2:2) image to ARGB8888.
 *
 * Every 4-byte input group is read as Y0, U, Y1, V and produces up to two
 * output pixels that share the same U/V pair. The math is 8.8 fixed point:
 * luma is offset by 16, chroma by 128, each channel gets +128 for rounding
 * before the >> 8, and the result is clamped to [0, 255]. Alpha is always
 * written as 0xff.
 *
 * NOTE(review): the coefficients (298/409/100/208/516) appear to be the
 * common integer approximation of BT.601 limited-range YCbCr - confirm.
 *
 * If width is odd, the final group of each row still reads all four input
 * bytes (V lives in the fourth byte) but only the first pixel is stored, so
 * the row is never overrun.
 *
 * @param output_     Destination buffer, addressed as 32-bit pixels.
 * @param input_      Source buffer of YUYV bytes.
 * @param width       Width of the image in pixels.
 * @param height      Number of rows to convert.
 * @param out_stride  Bytes between the starts of consecutive output rows.
 * @param in_stride   Bytes between the starts of consecutive input rows.
 */
void conv_yuyv_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint8_t *input = (const uint8_t*)input_;
   uint32_t *output     = (uint32_t*)output_;

   /* out_stride is in bytes; output is stepped in 32-bit pixels. */
   for (h = 0; h < height; h++, output += out_stride >> 2, input += in_stride)
   {
      const uint8_t *src = input;
      uint32_t *dst      = output;

      for (w = 0; w < width; w += 2, src += 4, dst += 2)
      {
         int y0 = src[0] - 16;
         int u  = src[1] - 128;
         int y1 = src[2] - 16;
         int v  = src[3] - 128;

         /* Chroma contributions (including the rounding bias) are the
          * same for both pixels of the pair, so compute them once. */
         int r_chroma = 409 * v + 128;
         int g_chroma = -100 * u - 208 * v + 128;
         int b_chroma = 516 * u + 128;

         uint8_t r0 = clamp_8bit((298 * y0 + r_chroma) >> 8);
         uint8_t g0 = clamp_8bit((298 * y0 + g_chroma) >> 8);
         uint8_t b0 = clamp_8bit((298 * y0 + b_chroma) >> 8);

         dst[0] = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);

         /* For an odd width the last group only has one pixel inside the
          * row; storing dst[1] there would write past the end of the row. */
         if (w + 1 < width)
         {
            uint8_t r1 = clamp_8bit((298 * y1 + r_chroma) >> 8);
            uint8_t g1 = clamp_8bit((298 * y1 + g_chroma) >> 8);
            uint8_t b1 = clamp_8bit((298 * y1 + b_chroma) >> 8);

            dst[1] = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
         }
      }
   }
}
void conv_copy(void *output_, const void *input_,
int width, int height,
int out_stride, int in_stride)

View file

@ -60,6 +60,10 @@ void conv_rgb565_bgr24(void *output, const void *input,
int width, int height,
int out_stride, int in_stride);
void conv_yuyv_argb8888(void *output, const void *input,
int width, int height,
int out_stride, int in_stride);
void conv_copy(void *output, const void *input,
int width, int height,
int out_stride, int in_stride);

View file

@ -86,6 +86,8 @@ static bool set_direct_pix_conv(struct scaler_ctx *ctx)
ctx->direct_pixconv = conv_0rgb1555_bgr24;
else if (ctx->in_fmt == SCALER_FMT_ARGB8888 && ctx->out_fmt == SCALER_FMT_ABGR8888)
ctx->direct_pixconv = conv_argb8888_abgr8888;
else if (ctx->in_fmt == SCALER_FMT_YUYV && ctx->out_fmt == SCALER_FMT_ARGB8888)
ctx->direct_pixconv = conv_yuyv_argb8888;
else
return false;

View file

@ -28,7 +28,8 @@ enum scaler_pix_fmt
SCALER_FMT_ABGR8888,
SCALER_FMT_0RGB1555,
SCALER_FMT_RGB565,
SCALER_FMT_BGR24
SCALER_FMT_BGR24,
SCALER_FMT_YUYV
};
enum scaler_type