/* This file is autogenerated by u_format_access.py from u_format.csv. Do not edit directly. */

/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Pixel format accessor functions.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include "pipe/p_compiler.h"
#include "u_format.h"
#include "u_math.h"

/**
 * Limit a double to the range [lbound, ubound].
 *
 * The lower bound is tested first, so it wins when value is below it even
 * if the bounds are inverted.  A value that compares neither below lbound
 * nor above ubound is returned unchanged.
 */
static INLINE double
clamp(double value, double lbound, double ubound)
{
   return (value < lbound) ? lbound
        : (value > ubound) ? ubound
        : value;
}

/**
 * Limit a float to the range [lbound, ubound].
 *
 * The lower bound is checked before the upper bound; a value inside the
 * range is passed through untouched.
 */
static INLINE float
clampf(float value, float lbound, float ubound)
{
   float result = value;

   if (value < lbound) {
      result = lbound;
   } else if (value > ubound) {
      result = ubound;
   }
   return result;
}

/**
 * Limit an unsigned integer to the range [lbound, ubound].
 *
 * The lower bound is checked first, then the upper bound.
 */
static INLINE unsigned int
clampui(unsigned int value, unsigned int lbound, unsigned int ubound)
{
   return (value < lbound) ? lbound
        : (value > ubound) ? ubound
        : value;
}

/**
 * Limit a signed integer to the range [lbound, ubound].
 *
 * The lower bound is checked before the upper bound; in-range values are
 * returned as they are.
 */
static INLINE int
clampsi(int value, int lbound, int ubound)
{
   int result = value;

   if (value < lbound) {
      result = lbound;
   } else if (value > ubound) {
      result = ubound;
   }
   return result;
}

/**
 * Unpack a w x h rectangle of 32-bit A8R8G8B8 unorm pixels into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats (r, g, b, a); each 8-bit field is scaled by 1/255.
 * Blue sits in bits 0..7, green in 8..15, red in 16..23, alpha in 24..31.
 * src_stride and dst_stride are row pitches in bytes.
 */
static void
util_format_a8r8g8b8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*4);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const uint32_t texel = in[col];
         const uint32_t blue = texel & 0xff;
         const uint32_t green = (texel >> 8) & 0xff;
         const uint32_t red = (texel >> 16) & 0xff;
         const uint32_t alpha = texel >> 24;

         out[0] = (float)(red * (1.0f/0xff));
         out[1] = (float)(green * (1.0f/0xff));
         out[2] = (float)(blue * (1.0f/0xff));
         out[3] = (float)(alpha * (1.0f/0xff));
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of 32-bit X8R8G8B8 unorm pixels into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats (r, g, b, a).  Blue, green and red are the 8-bit
 * fields at bits 0..7, 8..15 and 16..23, scaled by 1/255; the top byte is
 * ignored and alpha is written as 1.  Strides are row pitches in bytes.
 */
static void
util_format_x8r8g8b8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*4);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const uint32_t texel = in[col];
         const uint32_t blue = texel & 0xff;
         const uint32_t green = (texel >> 8) & 0xff;
         const uint32_t red = (texel >> 16) & 0xff;

         out[0] = (float)(red * (1.0f/0xff));
         out[1] = (float)(green * (1.0f/0xff));
         out[2] = (float)(blue * (1.0f/0xff));
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of 32-bit B8G8R8A8 unorm pixels into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats (r, g, b, a); each 8-bit field is scaled by 1/255.
 * Alpha sits in bits 0..7, red in 8..15, green in 16..23, blue in 24..31.
 * src_stride and dst_stride are row pitches in bytes.
 */
static void
util_format_b8g8r8a8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*4);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const uint32_t texel = in[col];
         const uint32_t alpha = texel & 0xff;
         const uint32_t red = (texel >> 8) & 0xff;
         const uint32_t green = (texel >> 16) & 0xff;
         const uint32_t blue = texel >> 24;

         out[0] = (float)(red * (1.0f/0xff));
         out[1] = (float)(green * (1.0f/0xff));
         out[2] = (float)(blue * (1.0f/0xff));
         out[3] = (float)(alpha * (1.0f/0xff));
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of 32-bit B8G8R8X8 unorm pixels into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats (r, g, b, a).  Red, green and blue are the 8-bit
 * fields at bits 8..15, 16..23 and 24..31, scaled by 1/255; the low byte
 * is ignored and alpha is written as 1.  Strides are row pitches in bytes.
 */
static void
util_format_b8g8r8x8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*4);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const uint32_t texel = in[col];
         const uint32_t red = (texel >> 8) & 0xff;
         const uint32_t green = (texel >> 16) & 0xff;
         const uint32_t blue = texel >> 24;

         out[0] = (float)(red * (1.0f/0xff));
         out[1] = (float)(green * (1.0f/0xff));
         out[2] = (float)(blue * (1.0f/0xff));
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of 16-bit A1R5G5B5 unorm pixels into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats (r, g, b, a).  src_stride and dst_stride are row
 * pitches in bytes.
 *
 * Bit layout, following the most-significant-first naming that the other
 * packed formats in this file use (e.g. A4R4G4B4):
 *   blue  = bits  0..4   (scaled by 1/31)
 *   green = bits  5..9   (scaled by 1/31)
 *   red   = bits 10..14  (scaled by 1/31)
 *   alpha = bit  15      (0 or 1)
 *
 * The previous body decoded blue as a 1-bit field and alpha as a 5-bit
 * field, i.e. with the channel widths reversed.
 *
 * NOTE: this file is generated; the channel widths in the generator's
 * format table need the same correction or this fix is lost on
 * regeneration.
 */
static void
util_format_a1r5g5b5_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   float *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint16_t *src_pixel = (const uint16_t *)(src_row + x0*2);
      float *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint16_t pixel = *src_pixel++;
         float b = (float)((pixel & 0x1f) * (1.0f/0x1f));
         float g = (float)(((pixel >> 5) & 0x1f) * (1.0f/0x1f));
         float r = (float)(((pixel >> 10) & 0x1f) * (1.0f/0x1f));
         float a = (float)((pixel >> 15) * (1.0f/0x1)); /* single bit: 0 or 1 */
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = b; /* b */
         *dst_pixel++ = a; /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(float);
   }
}

/**
 * Unpack a w x h rectangle of 16-bit A4R4G4B4 unorm pixels into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats (r, g, b, a); each 4-bit field is scaled by 1/15.
 * Blue sits in bits 0..3, green in 4..7, red in 8..11, alpha in 12..15.
 * src_stride and dst_stride are row pitches in bytes.
 */
static void
util_format_a4r4g4b4_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint16_t *in = (const uint16_t *)(in_row + x0*2);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const uint16_t texel = in[col];
         const unsigned blue = texel & 0xf;
         const unsigned green = (texel >> 4) & 0xf;
         const unsigned red = (texel >> 8) & 0xf;
         const unsigned alpha = texel >> 12;

         out[0] = (float)(red * (1.0f/0xf));
         out[1] = (float)(green * (1.0f/0xf));
         out[2] = (float)(blue * (1.0f/0xf));
         out[3] = (float)(alpha * (1.0f/0xf));
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of 16-bit R5G6B5 unorm pixels into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats (r, g, b, a).  Blue is bits 0..4 (scaled by 1/31),
 * green is bits 5..10 (scaled by 1/63), red is bits 11..15 (scaled by
 * 1/31); alpha is written as 1.  Strides are row pitches in bytes.
 */
static void
util_format_r5g6b5_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint16_t *in = (const uint16_t *)(in_row + x0*2);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const uint16_t texel = in[col];
         const unsigned blue = texel & 0x1f;
         const unsigned green = (texel >> 5) & 0x3f;
         const unsigned red = texel >> 11;

         out[0] = (float)(red * (1.0f/0x1f));
         out[1] = (float)(green * (1.0f/0x3f));
         out[2] = (float)(blue * (1.0f/0x1f));
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of 32-bit A2B10G10R10 unorm pixels into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats (r, g, b, a).  Red is bits 0..9, green 10..19 and
 * blue 20..29, each scaled by 1/1023; alpha is bits 30..31 scaled by 1/3.
 * src_stride and dst_stride are row pitches in bytes.
 */
static void
util_format_a2b10g10r10_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*4);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const uint32_t texel = in[col];
         const uint32_t red = texel & 0x3ff;
         const uint32_t green = (texel >> 10) & 0x3ff;
         const uint32_t blue = (texel >> 20) & 0x3ff;
         const uint32_t alpha = texel >> 30;

         out[0] = (float)(red * (1.0f/0x3ff));
         out[1] = (float)(green * (1.0f/0x3ff));
         out[2] = (float)(blue * (1.0f/0x3ff));
         out[3] = (float)(alpha * (1.0f/0x3));
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of 8-bit L8 unorm pixels into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats: the byte scaled by 1/255 is replicated into r, g
 * and b, and alpha is written as 1.  Strides are row pitches in bytes.
 */
static void
util_format_l8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint8_t *in = (const uint8_t *)(in_row + x0*1);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const float luminance = (float)(in[col] * (1.0f/0xff));

         out[0] = luminance;
         out[1] = luminance;
         out[2] = luminance;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of 8-bit A8 unorm pixels into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats: r, g and b are written as 0 and alpha is the byte
 * scaled by 1/255.  Strides are row pitches in bytes.
 */
static void
util_format_a8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint8_t *in = (const uint8_t *)(in_row + x0*1);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const float alpha = (float)(in[col] * (1.0f/0xff));

         out[0] = 0.0f;
         out[1] = 0.0f;
         out[2] = 0.0f;
         out[3] = alpha;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of 8-bit I8 unorm pixels into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats: the byte scaled by 1/255 is replicated into all of
 * r, g, b and a.  Strides are row pitches in bytes.
 */
static void
util_format_i8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint8_t *in = (const uint8_t *)(in_row + x0*1);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const float intensity = (float)(in[col] * (1.0f/0xff));

         out[0] = intensity;
         out[1] = intensity;
         out[2] = intensity;
         out[3] = intensity;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of 16-bit A8L8 unorm pixels into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats: the low byte scaled by 1/255 is replicated into r,
 * g and b, and the high byte scaled by 1/255 becomes alpha.  Strides are
 * row pitches in bytes.
 */
static void
util_format_a8l8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint16_t *in = (const uint16_t *)(in_row + x0*2);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const uint16_t texel = in[col];
         const float luminance = (float)((texel & 0xff) * (1.0f/0xff));
         const float alpha = (float)((texel >> 8) * (1.0f/0xff));

         out[0] = luminance;
         out[1] = luminance;
         out[2] = luminance;
         out[3] = alpha;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of 16-bit L16 unorm pixels into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats: the 16-bit value scaled by 1/65535 is replicated
 * into r, g and b, and alpha is written as 1.  Strides are row pitches in
 * bytes.
 */
static void
util_format_l16_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint16_t *in = (const uint16_t *)(in_row + x0*2);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const float luminance = (float)(in[col] * (1.0f/0xffff));

         out[0] = luminance;
         out[1] = luminance;
         out[2] = luminance;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of 16-bit Z16 unorm values into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every value produces four
 * consecutive floats: the 16-bit depth scaled by 1/65535 is written to
 * the first three slots and the fourth is written as 1.  Strides are row
 * pitches in bytes.
 */
static void
util_format_z16_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint16_t *in = (const uint16_t *)(in_row + x0*2);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const float depth = (float)(in[col] * (1.0f/0xffff));

         out[0] = depth;
         out[1] = depth;
         out[2] = depth;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of 32-bit Z32 unorm values into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every value produces four
 * consecutive floats: the 32-bit depth is scaled by 1/0xffffffff in
 * double precision, narrowed to float and written to the first three
 * slots; the fourth is written as 1.  Strides are row pitches in bytes.
 */
static void
util_format_z32_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*4);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const float depth = (float)(in[col] * (1.0/0xffffffff));

         out[0] = depth;
         out[1] = depth;
         out[2] = depth;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of 32-bit float depth values into floats.
 *
 * Reading begins at column x0, row y0 of src.  Every value produces four
 * consecutive floats: the source float is copied unchanged to the first
 * three slots and the fourth is written as 1.  Strides are row pitches in
 * bytes.
 */
static void
util_format_z32_float_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const float *in = (const float *)(in_row + x0*4);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const float depth = in[col];

         out[0] = depth;
         out[1] = depth;
         out[2] = depth;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack the depth part of a w x h rectangle of 32-bit S8Z24 pixels.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats: the upper 24 bits, scaled by 1/0xffffff in double
 * precision and narrowed to float, go to the first three slots; the
 * fourth is written as 1.  The low 8 bits are not read.  Strides are row
 * pitches in bytes.
 *
 * NOTE(review): depth is taken from the upper 24 bits, whereas the packed
 * colour formats in this file name their most significant field first --
 * confirm this layout against the format table.
 */
static void
util_format_s8z24_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*4);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const uint32_t depth_bits = in[col] >> 8;
         const float depth = (float)(depth_bits * (1.0/0xffffff));

         out[0] = depth;
         out[1] = depth;
         out[2] = depth;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack the depth part of a w x h rectangle of 32-bit Z24S8 pixels.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats: the lower 24 bits, scaled by 1/0xffffff in double
 * precision and narrowed to float, go to the first three slots; the
 * fourth is written as 1.  The top 8 bits are masked off.  Strides are
 * row pitches in bytes.
 *
 * NOTE(review): depth is taken from the lower 24 bits, whereas the packed
 * colour formats in this file name their most significant field first --
 * confirm this layout against the format table.
 */
static void
util_format_z24s8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*4);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const uint32_t depth_bits = in[col] & 0xffffff;
         const float depth = (float)(depth_bits * (1.0/0xffffff));

         out[0] = depth;
         out[1] = depth;
         out[2] = depth;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack the depth part of a w x h rectangle of 32-bit X8Z24 pixels.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats: the upper 24 bits, scaled by 1/0xffffff in double
 * precision and narrowed to float, go to the first three slots; the
 * fourth is written as 1.  The low 8 bits are not read.  Strides are row
 * pitches in bytes.
 *
 * NOTE(review): depth is taken from the upper 24 bits, whereas the packed
 * colour formats in this file name their most significant field first --
 * confirm this layout against the format table.
 */
static void
util_format_x8z24_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*4);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const uint32_t depth_bits = in[col] >> 8;
         const float depth = (float)(depth_bits * (1.0/0xffffff));

         out[0] = depth;
         out[1] = depth;
         out[2] = depth;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack the depth part of a w x h rectangle of 32-bit Z24X8 pixels.
 *
 * Reading begins at column x0, row y0 of src.  Every pixel produces four
 * consecutive floats: the lower 24 bits, scaled by 1/0xffffff in double
 * precision and narrowed to float, go to the first three slots; the
 * fourth is written as 1.  The top 8 bits are masked off.  Strides are
 * row pitches in bytes.
 *
 * NOTE(review): depth is taken from the lower 24 bits, whereas the packed
 * colour formats in this file name their most significant field first --
 * confirm this layout against the format table.
 */
static void
util_format_z24x8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*4);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         const uint32_t depth_bits = in[col] & 0xffffff;
         const float depth = (float)(depth_bits * (1.0/0xffffff));

         out[0] = depth;
         out[1] = depth;
         out[2] = depth;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of single-channel 64-bit float pixels.
 *
 * Reading begins at column x0, row y0 of src (8 bytes per pixel).  Every
 * pixel produces four consecutive floats: the double narrowed to float as
 * r, then g = 0, b = 0, a = 1.  Strides are row pitches in bytes.
 */
static void
util_format_r64_float_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const double *in = (const double *)(in_row + x0*8);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         out[0] = (float)in[col];
         out[1] = 0.0f;
         out[2] = 0.0f;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of two-channel 64-bit float pixels.
 *
 * Reading begins at column x0, row y0 of src (16 bytes per pixel).  Every
 * pixel produces four consecutive floats: the two doubles narrowed to
 * float as r and g, then b = 0, a = 1.  Strides are row pitches in bytes.
 */
static void
util_format_r64g64_float_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const double *in = (const double *)(in_row + x0*16);
      float *out = out_row;

      for (col = 0; col < w; ++col, in += 2, out += 4) {
         out[0] = (float)in[0];
         out[1] = (float)in[1];
         out[2] = 0.0f;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of three-channel 64-bit float pixels.
 *
 * Reading begins at column x0, row y0 of src (24 bytes per pixel).  Every
 * pixel produces four consecutive floats: the three doubles narrowed to
 * float as r, g and b, then a = 1.  Strides are row pitches in bytes.
 */
static void
util_format_r64g64b64_float_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const double *in = (const double *)(in_row + x0*24);
      float *out = out_row;

      for (col = 0; col < w; ++col, in += 3, out += 4) {
         out[0] = (float)in[0];
         out[1] = (float)in[1];
         out[2] = (float)in[2];
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of four-channel 64-bit float pixels.
 *
 * Reading begins at column x0, row y0 of src (32 bytes per pixel).  Every
 * pixel produces four consecutive floats: the four doubles narrowed to
 * float as r, g, b and a.  Strides are row pitches in bytes.
 */
static void
util_format_r64g64b64a64_float_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const double *in = (const double *)(in_row + x0*32);
      float *out = out_row;

      for (col = 0; col < w; ++col, in += 4, out += 4) {
         out[0] = (float)in[0];
         out[1] = (float)in[1];
         out[2] = (float)in[2];
         out[3] = (float)in[3];
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of single-channel 32-bit float pixels.
 *
 * Reading begins at column x0, row y0 of src (4 bytes per pixel).  Every
 * pixel produces four consecutive floats: the source float as r, then
 * g = 0, b = 0, a = 1.  Strides are row pitches in bytes.
 */
static void
util_format_r32_float_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const float *in = (const float *)(in_row + x0*4);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         out[0] = in[col];
         out[1] = 0.0f;
         out[2] = 0.0f;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of two-channel 32-bit float pixels.
 *
 * Reading begins at column x0, row y0 of src (8 bytes per pixel).  Every
 * pixel produces four consecutive floats: the two source floats as r and
 * g, then b = 0, a = 1.  Strides are row pitches in bytes.
 */
static void
util_format_r32g32_float_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const float *in = (const float *)(in_row + x0*8);
      float *out = out_row;

      for (col = 0; col < w; ++col, in += 2, out += 4) {
         out[0] = in[0];
         out[1] = in[1];
         out[2] = 0.0f;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of three-channel 32-bit float pixels.
 *
 * Reading begins at column x0, row y0 of src (12 bytes per pixel).  Every
 * pixel produces four consecutive floats: the three source floats as r,
 * g and b, then a = 1.  Strides are row pitches in bytes.
 */
static void
util_format_r32g32b32_float_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const float *in = (const float *)(in_row + x0*12);
      float *out = out_row;

      for (col = 0; col < w; ++col, in += 3, out += 4) {
         out[0] = in[0];
         out[1] = in[1];
         out[2] = in[2];
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of four-channel 32-bit float pixels.
 *
 * Reading begins at column x0, row y0 of src (16 bytes per pixel).  Every
 * pixel's four source floats are copied unchanged as r, g, b and a.
 * Strides are row pitches in bytes.
 */
static void
util_format_r32g32b32a32_float_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const float *in = (const float *)(in_row + x0*16);
      float *out = out_row;

      for (col = 0; col < w; ++col, in += 4, out += 4) {
         out[0] = in[0];
         out[1] = in[1];
         out[2] = in[2];
         out[3] = in[3];
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of single-channel 32-bit unorm pixels.
 *
 * Reading begins at column x0, row y0 of src (4 bytes per pixel).  The
 * 32-bit value is scaled by 1/0xffffffff in double precision and narrowed
 * to float for r; g = 0, b = 0, a = 1.  Strides are row pitches in bytes.
 */
static void
util_format_r32_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*4);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         out[0] = (float)(in[col] * (1.0/0xffffffff));
         out[1] = 0.0f;
         out[2] = 0.0f;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of two-channel 32-bit unorm pixels.
 *
 * Reading begins at column x0, row y0 of src (8 bytes per pixel).  Each
 * 32-bit value is scaled by 1/0xffffffff in double precision and narrowed
 * to float for r and g; b = 0, a = 1.  Strides are row pitches in bytes.
 */
static void
util_format_r32g32_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*8);
      float *out = out_row;

      for (col = 0; col < w; ++col, in += 2, out += 4) {
         out[0] = (float)(in[0] * (1.0/0xffffffff));
         out[1] = (float)(in[1] * (1.0/0xffffffff));
         out[2] = 0.0f;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of three-channel 32-bit unorm pixels.
 *
 * Reading begins at column x0, row y0 of src (12 bytes per pixel).  Each
 * 32-bit value is scaled by 1/0xffffffff in double precision and narrowed
 * to float for r, g and b; a = 1.  Strides are row pitches in bytes.
 */
static void
util_format_r32g32b32_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*12);
      float *out = out_row;

      for (col = 0; col < w; ++col, in += 3, out += 4) {
         out[0] = (float)(in[0] * (1.0/0xffffffff));
         out[1] = (float)(in[1] * (1.0/0xffffffff));
         out[2] = (float)(in[2] * (1.0/0xffffffff));
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of four-channel 32-bit unorm pixels.
 *
 * Reading begins at column x0, row y0 of src (16 bytes per pixel).  Each
 * 32-bit value is scaled by 1/0xffffffff in double precision and narrowed
 * to float for r, g, b and a.  Strides are row pitches in bytes.
 */
static void
util_format_r32g32b32a32_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*16);
      float *out = out_row;

      for (col = 0; col < w; ++col, in += 4, out += 4) {
         out[0] = (float)(in[0] * (1.0/0xffffffff));
         out[1] = (float)(in[1] * (1.0/0xffffffff));
         out[2] = (float)(in[2] * (1.0/0xffffffff));
         out[3] = (float)(in[3] * (1.0/0xffffffff));
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of single-channel 32-bit uscaled pixels.
 *
 * Reading begins at column x0, row y0 of src (4 bytes per pixel).  The
 * unsigned 32-bit value is converted directly to float (no normalisation)
 * for r; g = 0, b = 0, a = 1.  Strides are row pitches in bytes.
 */
static void
util_format_r32_uscaled_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*4);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         out[0] = (float)in[col];
         out[1] = 0.0f;
         out[2] = 0.0f;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of two-channel 32-bit uscaled pixels.
 *
 * Reading begins at column x0, row y0 of src (8 bytes per pixel).  Each
 * unsigned 32-bit value is converted directly to float (no normalisation)
 * for r and g; b = 0, a = 1.  Strides are row pitches in bytes.
 */
static void
util_format_r32g32_uscaled_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*8);
      float *out = out_row;

      for (col = 0; col < w; ++col, in += 2, out += 4) {
         out[0] = (float)in[0];
         out[1] = (float)in[1];
         out[2] = 0.0f;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of three-channel 32-bit uscaled pixels.
 *
 * Reading begins at column x0, row y0 of src (12 bytes per pixel).  Each
 * unsigned 32-bit value is converted directly to float (no normalisation)
 * for r, g and b; a = 1.  Strides are row pitches in bytes.
 */
static void
util_format_r32g32b32_uscaled_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*12);
      float *out = out_row;

      for (col = 0; col < w; ++col, in += 3, out += 4) {
         out[0] = (float)in[0];
         out[1] = (float)in[1];
         out[2] = (float)in[2];
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of four-channel 32-bit uscaled pixels.
 *
 * Reading begins at column x0, row y0 of src (16 bytes per pixel).  Each
 * unsigned 32-bit value is converted directly to float (no normalisation)
 * for r, g, b and a.  Strides are row pitches in bytes.
 */
static void
util_format_r32g32b32a32_uscaled_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*16);
      float *out = out_row;

      for (col = 0; col < w; ++col, in += 4, out += 4) {
         out[0] = (float)in[0];
         out[1] = (float)in[1];
         out[2] = (float)in[2];
         out[3] = (float)in[3];
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Unpack a w x h rectangle of single-channel 16-bit unorm pixels.
 *
 * Reading begins at column x0, row y0 of src (2 bytes per pixel).  The
 * 16-bit value scaled by 1/65535 becomes r; g = 0, b = 0, a = 1.  Strides
 * are row pitches in bytes.
 */
static void
util_format_r16_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const unsigned out_pitch = (unsigned)(dst_stride/sizeof(float)); /* floats per output row */
   const uint8_t *in_row = src + y0*src_stride;
   float *out_row = dst;
   unsigned rows_left, col;

   for (rows_left = h; rows_left != 0; --rows_left) {
      const uint16_t *in = (const uint16_t *)(in_row + x0*2);
      float *out = out_row;

      for (col = 0; col < w; ++col, out += 4) {
         out[0] = (float)(in[col] * (1.0f/0xffff));
         out[1] = 0.0f;
         out[2] = 0.0f;
         out[3] = 1.0f;
      }
      in_row += src_stride;
      out_row += out_pitch;
   }
}

/**
 * Read a w x h block of R16G16_UNORM pixels as RGBA floats.
 *
 * The two 16-bit channels are scaled by 1/0xffff into red and green; blue is
 * returned as 0 and alpha as 1.  Both strides are in bytes.  (x0, y0)
 * addresses the first source pixel, while output always begins at dst.
 */
static void
util_format_r16g16_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint16_t *texel = (const uint16_t *)(in_line + x0*4);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         const float red = (float)(texel[0] * (1.0f/0xffff));
         const float green = (float)(texel[1] * (1.0f/0xffff));

         rgba[0] = red;
         rgba[1] = green;
         rgba[2] = 0;
         rgba[3] = 1;
         texel += 2;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R16G16B16_UNORM pixels as RGBA floats.
 *
 * The three 16-bit channels are scaled by 1/0xffff into red, green and blue;
 * alpha is returned as 1.  Both strides are in bytes.  (x0, y0) addresses
 * the first source pixel, while output always begins at dst.
 */
static void
util_format_r16g16b16_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint16_t *texel = (const uint16_t *)(in_line + x0*6);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         const float red = (float)(texel[0] * (1.0f/0xffff));
         const float green = (float)(texel[1] * (1.0f/0xffff));
         const float blue = (float)(texel[2] * (1.0f/0xffff));

         rgba[0] = red;
         rgba[1] = green;
         rgba[2] = blue;
         rgba[3] = 1;
         texel += 3;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R16G16B16A16_UNORM pixels as RGBA floats.
 *
 * Each of the four 16-bit channels is scaled by 1/0xffff.  Both strides are
 * in bytes.  (x0, y0) addresses the first source pixel, while output always
 * begins at dst.
 */
static void
util_format_r16g16b16a16_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint16_t *texel = (const uint16_t *)(in_line + x0*8);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         /* Fetch the whole pixel before storing anything. */
         const float red = (float)(texel[0] * (1.0f/0xffff));
         const float green = (float)(texel[1] * (1.0f/0xffff));
         const float blue = (float)(texel[2] * (1.0f/0xffff));
         const float alpha = (float)(texel[3] * (1.0f/0xffff));

         rgba[0] = red;
         rgba[1] = green;
         rgba[2] = blue;
         rgba[3] = alpha;
         texel += 4;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R16_USCALED pixels as RGBA floats.
 *
 * The single 16-bit unsigned channel is converted to float without
 * normalization and stored in red; green and blue are returned as 0 and
 * alpha as 1.  Both strides are in bytes.  (x0, y0) addresses the first
 * source pixel, while output always begins at dst.
 */
static void
util_format_r16_uscaled_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint16_t *texel = (const uint16_t *)(in_line + x0*2);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         const float red = (float)texel[col];

         rgba[0] = red;
         rgba[1] = 0;
         rgba[2] = 0;
         rgba[3] = 1;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R16G16_USCALED pixels as RGBA floats.
 *
 * The two 16-bit unsigned channels are converted to float without
 * normalization into red and green; blue is returned as 0 and alpha as 1.
 * Both strides are in bytes.  (x0, y0) addresses the first source pixel,
 * while output always begins at dst.
 */
static void
util_format_r16g16_uscaled_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint16_t *texel = (const uint16_t *)(in_line + x0*4);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         const float red = (float)texel[0];
         const float green = (float)texel[1];

         rgba[0] = red;
         rgba[1] = green;
         rgba[2] = 0;
         rgba[3] = 1;
         texel += 2;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R16G16B16_USCALED pixels as RGBA floats.
 *
 * The three 16-bit unsigned channels are converted to float without
 * normalization; alpha is returned as 1.  Both strides are in bytes.
 * (x0, y0) addresses the first source pixel, while output always begins
 * at dst.
 */
static void
util_format_r16g16b16_uscaled_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint16_t *texel = (const uint16_t *)(in_line + x0*6);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         const float red = (float)texel[0];
         const float green = (float)texel[1];
         const float blue = (float)texel[2];

         rgba[0] = red;
         rgba[1] = green;
         rgba[2] = blue;
         rgba[3] = 1;
         texel += 3;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R16G16B16A16_USCALED pixels as RGBA floats.
 *
 * Each of the four 16-bit unsigned channels is converted to float without
 * normalization.  Both strides are in bytes.  (x0, y0) addresses the first
 * source pixel, while output always begins at dst.
 */
static void
util_format_r16g16b16a16_uscaled_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint16_t *texel = (const uint16_t *)(in_line + x0*8);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         /* Fetch the whole pixel before storing anything. */
         const float red = (float)texel[0];
         const float green = (float)texel[1];
         const float blue = (float)texel[2];
         const float alpha = (float)texel[3];

         rgba[0] = red;
         rgba[1] = green;
         rgba[2] = blue;
         rgba[3] = alpha;
         texel += 4;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R8_UNORM pixels as RGBA floats.
 *
 * The single byte is scaled by 1/0xff into red; green and blue are returned
 * as 0 and alpha as 1.  Both strides are in bytes.  (x0, y0) addresses the
 * first source pixel, while output always begins at dst.
 */
static void
util_format_r8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint8_t *texel = (const uint8_t *)(in_line + x0*1);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         const float red = (float)(texel[col] * (1.0f/0xff));

         rgba[0] = red;
         rgba[1] = 0;
         rgba[2] = 0;
         rgba[3] = 1;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R8G8_UNORM pixels as RGBA floats.
 *
 * The two bytes are scaled by 1/0xff into red and green; blue is returned
 * as 0 and alpha as 1.  Both strides are in bytes.  (x0, y0) addresses the
 * first source pixel, while output always begins at dst.
 */
static void
util_format_r8g8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint8_t *texel = (const uint8_t *)(in_line + x0*2);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         const float red = (float)(texel[0] * (1.0f/0xff));
         const float green = (float)(texel[1] * (1.0f/0xff));

         rgba[0] = red;
         rgba[1] = green;
         rgba[2] = 0;
         rgba[3] = 1;
         texel += 2;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R8G8B8_UNORM pixels as RGBA floats.
 *
 * The three bytes are scaled by 1/0xff into red, green and blue; alpha is
 * returned as 1.  Both strides are in bytes.  (x0, y0) addresses the first
 * source pixel, while output always begins at dst.
 */
static void
util_format_r8g8b8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint8_t *texel = (const uint8_t *)(in_line + x0*3);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         const float red = (float)(texel[0] * (1.0f/0xff));
         const float green = (float)(texel[1] * (1.0f/0xff));
         const float blue = (float)(texel[2] * (1.0f/0xff));

         rgba[0] = red;
         rgba[1] = green;
         rgba[2] = blue;
         rgba[3] = 1;
         texel += 3;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R8G8B8A8_UNORM pixels as RGBA floats.
 *
 * Each of the four bytes is scaled by 1/0xff.  Both strides are in bytes.
 * (x0, y0) addresses the first source pixel, while output always begins
 * at dst.
 */
static void
util_format_r8g8b8a8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint8_t *texel = (const uint8_t *)(in_line + x0*4);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         /* Fetch the whole pixel before storing anything. */
         const float red = (float)(texel[0] * (1.0f/0xff));
         const float green = (float)(texel[1] * (1.0f/0xff));
         const float blue = (float)(texel[2] * (1.0f/0xff));
         const float alpha = (float)(texel[3] * (1.0f/0xff));

         rgba[0] = red;
         rgba[1] = green;
         rgba[2] = blue;
         rgba[3] = alpha;
         texel += 4;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R8G8B8X8_UNORM pixels as RGBA floats.
 *
 * Each pixel occupies four bytes: red, green and blue (scaled by 1/0xff)
 * followed by one padding byte that is ignored; alpha is returned as 1.
 * Both strides are in bytes.  (x0, y0) addresses the first source pixel,
 * while output always begins at dst.
 */
static void
util_format_r8g8b8x8_unorm_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   float *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint8_t *src_pixel = (const uint8_t *)(src_row + x0*4);
      float *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         float r = (float)(src_pixel[0] * (1.0f/0xff));
         float g = (float)(src_pixel[1] * (1.0f/0xff));
         float b = (float)(src_pixel[2] * (1.0f/0xff));
         /* Step over all four bytes so the X padding byte is not mistaken
          * for the next pixel's red channel. */
         src_pixel += 4;
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = b; /* b */
         *dst_pixel++ = 1; /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R8_USCALED pixels as RGBA floats.
 *
 * The single byte is converted to float without normalization and stored in
 * red; green and blue are returned as 0 and alpha as 1.  Both strides are in
 * bytes.  (x0, y0) addresses the first source pixel, while output always
 * begins at dst.
 */
static void
util_format_r8_uscaled_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint8_t *texel = (const uint8_t *)(in_line + x0*1);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         const float red = (float)texel[col];

         rgba[0] = red;
         rgba[1] = 0;
         rgba[2] = 0;
         rgba[3] = 1;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R8G8_USCALED pixels as RGBA floats.
 *
 * The two bytes are converted to float without normalization into red and
 * green; blue is returned as 0 and alpha as 1.  Both strides are in bytes.
 * (x0, y0) addresses the first source pixel, while output always begins
 * at dst.
 */
static void
util_format_r8g8_uscaled_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint8_t *texel = (const uint8_t *)(in_line + x0*2);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         const float red = (float)texel[0];
         const float green = (float)texel[1];

         rgba[0] = red;
         rgba[1] = green;
         rgba[2] = 0;
         rgba[3] = 1;
         texel += 2;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R8G8B8_USCALED pixels as RGBA floats.
 *
 * The three bytes are converted to float without normalization; alpha is
 * returned as 1.  Both strides are in bytes.  (x0, y0) addresses the first
 * source pixel, while output always begins at dst.
 */
static void
util_format_r8g8b8_uscaled_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint8_t *texel = (const uint8_t *)(in_line + x0*3);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         const float red = (float)texel[0];
         const float green = (float)texel[1];
         const float blue = (float)texel[2];

         rgba[0] = red;
         rgba[1] = green;
         rgba[2] = blue;
         rgba[3] = 1;
         texel += 3;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R8G8B8A8_USCALED pixels as RGBA floats.
 *
 * Each of the four bytes is converted to float without normalization.  Both
 * strides are in bytes.  (x0, y0) addresses the first source pixel, while
 * output always begins at dst.
 */
static void
util_format_r8g8b8a8_uscaled_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   float *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint8_t *texel = (const uint8_t *)(in_line + x0*4);
      float *rgba = out_line;

      for (col = 0; col < w; col++) {
         /* Fetch the whole pixel before storing anything. */
         const float red = (float)texel[0];
         const float green = (float)texel[1];
         const float blue = (float)texel[2];
         const float alpha = (float)texel[3];

         rgba[0] = red;
         rgba[1] = green;
         rgba[2] = blue;
         rgba[3] = alpha;
         texel += 4;
         rgba += 4;
      }

      in_line += src_stride;
      out_line += dst_stride/sizeof(float);
   }
}

/**
 * Read a w x h block of R8G8B8X8_USCALED pixels as RGBA floats.
 *
 * Each pixel occupies four bytes: red, green and blue (converted to float
 * without normalization) followed by one padding byte that is ignored;
 * alpha is returned as 1.  Both strides are in bytes.  (x0, y0) addresses
 * the first source pixel, while output always begins at dst.
 */
static void
util_format_r8g8b8x8_uscaled_read_4f(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   float *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint8_t *src_pixel = (const uint8_t *)(src_row + x0*4);
      float *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         float r = (float)src_pixel[0];
         float g = (float)src_pixel[1];
         float b = (float)src_pixel[2];
         /* Step over all four bytes so the X padding byte is not mistaken
          * for the next pixel's red channel. */
         src_pixel += 4;
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = b; /* b */
         *dst_pixel++ = 1; /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(float);
   }
}

void
util_format_read_4f(enum pipe_format format, float *dst, unsigned dst_stride, const void *src, unsigned src_stride, unsigned x, unsigned y, unsigned w, unsigned h)
{
   void (*func)(float *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h);
   switch(format) {
   case PIPE_FORMAT_A8R8G8B8_UNORM:
      func = &util_format_a8r8g8b8_unorm_read_4f;
      break;
   case PIPE_FORMAT_X8R8G8B8_UNORM:
      func = &util_format_x8r8g8b8_unorm_read_4f;
      break;
   case PIPE_FORMAT_B8G8R8A8_UNORM:
      func = &util_format_b8g8r8a8_unorm_read_4f;
      break;
   case PIPE_FORMAT_B8G8R8X8_UNORM:
      func = &util_format_b8g8r8x8_unorm_read_4f;
      break;
   case PIPE_FORMAT_A1R5G5B5_UNORM:
      func = &util_format_a1r5g5b5_unorm_read_4f;
      break;
   case PIPE_FORMAT_A4R4G4B4_UNORM:
      func = &util_format_a4r4g4b4_unorm_read_4f;
      break;
   case PIPE_FORMAT_R5G6B5_UNORM:
      func = &util_format_r5g6b5_unorm_read_4f;
      break;
   case PIPE_FORMAT_A2B10G10R10_UNORM:
      func = &util_format_a2b10g10r10_unorm_read_4f;
      break;
   case PIPE_FORMAT_L8_UNORM:
      func = &util_format_l8_unorm_read_4f;
      break;
   case PIPE_FORMAT_A8_UNORM:
      func = &util_format_a8_unorm_read_4f;
      break;
   case PIPE_FORMAT_I8_UNORM:
      func = &util_format_i8_unorm_read_4f;
      break;
   case PIPE_FORMAT_A8L8_UNORM:
      func = &util_format_a8l8_unorm_read_4f;
      break;
   case PIPE_FORMAT_L16_UNORM:
      func = &util_format_l16_unorm_read_4f;
      break;
   case PIPE_FORMAT_Z16_UNORM:
      func = &util_format_z16_unorm_read_4f;
      break;
   case PIPE_FORMAT_Z32_UNORM:
      func = &util_format_z32_unorm_read_4f;
      break;
   case PIPE_FORMAT_Z32_FLOAT:
      func = &util_format_z32_float_read_4f;
      break;
   case PIPE_FORMAT_S8Z24_UNORM:
      func = &util_format_s8z24_unorm_read_4f;
      break;
   case PIPE_FORMAT_Z24S8_UNORM:
      func = &util_format_z24s8_unorm_read_4f;
      break;
   case PIPE_FORMAT_X8Z24_UNORM:
      func = &util_format_x8z24_unorm_read_4f;
      break;
   case PIPE_FORMAT_Z24X8_UNORM:
      func = &util_format_z24x8_unorm_read_4f;
      break;
   case PIPE_FORMAT_R64_FLOAT:
      func = &util_format_r64_float_read_4f;
      break;
   case PIPE_FORMAT_R64G64_FLOAT:
      func = &util_format_r64g64_float_read_4f;
      break;
   case PIPE_FORMAT_R64G64B64_FLOAT:
      func = &util_format_r64g64b64_float_read_4f;
      break;
   case PIPE_FORMAT_R64G64B64A64_FLOAT:
      func = &util_format_r64g64b64a64_float_read_4f;
      break;
   case PIPE_FORMAT_R32_FLOAT:
      func = &util_format_r32_float_read_4f;
      break;
   case PIPE_FORMAT_R32G32_FLOAT:
      func = &util_format_r32g32_float_read_4f;
      break;
   case PIPE_FORMAT_R32G32B32_FLOAT:
      func = &util_format_r32g32b32_float_read_4f;
      break;
   case PIPE_FORMAT_R32G32B32A32_FLOAT:
      func = &util_format_r32g32b32a32_float_read_4f;
      break;
   case PIPE_FORMAT_R32_UNORM:
      func = &util_format_r32_unorm_read_4f;
      break;
   case PIPE_FORMAT_R32G32_UNORM:
      func = &util_format_r32g32_unorm_read_4f;
      break;
   case PIPE_FORMAT_R32G32B32_UNORM:
      func = &util_format_r32g32b32_unorm_read_4f;
      break;
   case PIPE_FORMAT_R32G32B32A32_UNORM:
      func = &util_format_r32g32b32a32_unorm_read_4f;
      break;
   case PIPE_FORMAT_R32_USCALED:
      func = &util_format_r32_uscaled_read_4f;
      break;
   case PIPE_FORMAT_R32G32_USCALED:
      func = &util_format_r32g32_uscaled_read_4f;
      break;
   case PIPE_FORMAT_R32G32B32_USCALED:
      func = &util_format_r32g32b32_uscaled_read_4f;
      break;
   case PIPE_FORMAT_R32G32B32A32_USCALED:
      func = &util_format_r32g32b32a32_uscaled_read_4f;
      break;
   case PIPE_FORMAT_R16_UNORM:
      func = &util_format_r16_unorm_read_4f;
      break;
   case PIPE_FORMAT_R16G16_UNORM:
      func = &util_format_r16g16_unorm_read_4f;
      break;
   case PIPE_FORMAT_R16G16B16_UNORM:
      func = &util_format_r16g16b16_unorm_read_4f;
      break;
   case PIPE_FORMAT_R16G16B16A16_UNORM:
      func = &util_format_r16g16b16a16_unorm_read_4f;
      break;
   case PIPE_FORMAT_R16_USCALED:
      func = &util_format_r16_uscaled_read_4f;
      break;
   case PIPE_FORMAT_R16G16_USCALED:
      func = &util_format_r16g16_uscaled_read_4f;
      break;
   case PIPE_FORMAT_R16G16B16_USCALED:
      func = &util_format_r16g16b16_uscaled_read_4f;
      break;
   case PIPE_FORMAT_R16G16B16A16_USCALED:
      func = &util_format_r16g16b16a16_uscaled_read_4f;
      break;
   case PIPE_FORMAT_R8_UNORM:
      func = &util_format_r8_unorm_read_4f;
      break;
   case PIPE_FORMAT_R8G8_UNORM:
      func = &util_format_r8g8_unorm_read_4f;
      break;
   case PIPE_FORMAT_R8G8B8_UNORM:
      func = &util_format_r8g8b8_unorm_read_4f;
      break;
   case PIPE_FORMAT_R8G8B8A8_UNORM:
      func = &util_format_r8g8b8a8_unorm_read_4f;
      break;
   case PIPE_FORMAT_R8G8B8X8_UNORM:
      func = &util_format_r8g8b8x8_unorm_read_4f;
      break;
   case PIPE_FORMAT_R8_USCALED:
      func = &util_format_r8_uscaled_read_4f;
      break;
   case PIPE_FORMAT_R8G8_USCALED:
      func = &util_format_r8g8_uscaled_read_4f;
      break;
   case PIPE_FORMAT_R8G8B8_USCALED:
      func = &util_format_r8g8b8_uscaled_read_4f;
      break;
   case PIPE_FORMAT_R8G8B8A8_USCALED:
      func = &util_format_r8g8b8a8_uscaled_read_4f;
      break;
   case PIPE_FORMAT_R8G8B8X8_USCALED:
      func = &util_format_r8g8b8x8_uscaled_read_4f;
      break;
   default:
      debug_printf("unsupported format\n");
      return;
   }
   func(dst, dst_stride, (const uint8_t *)src, src_stride, x, y, w, h);
}

/**
 * Pack a w x h block of RGBA floats into A8R8G8B8_UNORM pixels.
 *
 * Each component is clamped to [0, 1], multiplied by 0xff and truncated.
 * Within the 32-bit word blue sits in bits 0-7, green in 8-15, red in 16-23
 * and alpha in 24-31.  Both strides are in bytes.  (x0, y0) addresses the
 * first destination pixel, while input is always read from src onwards.
 */
static void
util_format_a8r8g8b8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *out_line = dst + y0*dst_stride;
   const float *in_line = src;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      uint32_t *packed = (uint32_t *)(out_line + x0*4);
      const float *rgba = in_line;

      for (col = 0; col < w; col++) {
         const uint32_t blue = (uint32_t)(clampf(rgba[2], 0, 1) * 0xff);
         const uint32_t green = (uint32_t)(clampf(rgba[1], 0, 1) * 0xff);
         const uint32_t red = (uint32_t)(clampf(rgba[0], 0, 1) * 0xff);
         const uint32_t alpha = (uint32_t)(clampf(rgba[3], 0, 1) * 0xff);

         packed[col] = blue | (green << 8) | (red << 16) | (alpha << 24);
         rgba += 4;
      }

      out_line += dst_stride;
      in_line += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of RGBA floats into X8R8G8B8_UNORM pixels.
 *
 * Red, green and blue are clamped to [0, 1], multiplied by 0xff and
 * truncated; the source alpha is ignored.  Blue sits in bits 0-7, green in
 * 8-15 and red in 16-23; bits 24-31 are written as zero.  Both strides are
 * in bytes.  (x0, y0) addresses the first destination pixel, while input is
 * always read from src onwards.
 */
static void
util_format_x8r8g8b8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *out_line = dst + y0*dst_stride;
   const float *in_line = src;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      uint32_t *packed = (uint32_t *)(out_line + x0*4);
      const float *rgba = in_line;

      for (col = 0; col < w; col++) {
         const uint32_t blue = (uint32_t)(clampf(rgba[2], 0, 1) * 0xff);
         const uint32_t green = (uint32_t)(clampf(rgba[1], 0, 1) * 0xff);
         const uint32_t red = (uint32_t)(clampf(rgba[0], 0, 1) * 0xff);

         packed[col] = blue | (green << 8) | (red << 16);
         rgba += 4;
      }

      out_line += dst_stride;
      in_line += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of RGBA floats into B8G8R8A8_UNORM pixels.
 *
 * Each component is clamped to [0, 1], multiplied by 0xff and truncated.
 * Within the 32-bit word alpha sits in bits 0-7, red in 8-15, green in 16-23
 * and blue in 24-31.  Both strides are in bytes.  (x0, y0) addresses the
 * first destination pixel, while input is always read from src onwards.
 */
static void
util_format_b8g8r8a8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *out_line = dst + y0*dst_stride;
   const float *in_line = src;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      uint32_t *packed = (uint32_t *)(out_line + x0*4);
      const float *rgba = in_line;

      for (col = 0; col < w; col++) {
         const uint32_t alpha = (uint32_t)(clampf(rgba[3], 0, 1) * 0xff);
         const uint32_t red = (uint32_t)(clampf(rgba[0], 0, 1) * 0xff);
         const uint32_t green = (uint32_t)(clampf(rgba[1], 0, 1) * 0xff);
         const uint32_t blue = (uint32_t)(clampf(rgba[2], 0, 1) * 0xff);

         packed[col] = alpha | (red << 8) | (green << 16) | (blue << 24);
         rgba += 4;
      }

      out_line += dst_stride;
      in_line += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of RGBA floats into B8G8R8X8_UNORM pixels.
 *
 * Red, green and blue are clamped to [0, 1], multiplied by 0xff and
 * truncated; the source alpha is ignored.  Red sits in bits 8-15, green in
 * 16-23 and blue in 24-31; bits 0-7 are written as zero.  Both strides are
 * in bytes.  (x0, y0) addresses the first destination pixel, while input is
 * always read from src onwards.
 */
static void
util_format_b8g8r8x8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *out_line = dst + y0*dst_stride;
   const float *in_line = src;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      uint32_t *packed = (uint32_t *)(out_line + x0*4);
      const float *rgba = in_line;

      for (col = 0; col < w; col++) {
         const uint32_t red = (uint32_t)(clampf(rgba[0], 0, 1) * 0xff);
         const uint32_t green = (uint32_t)(clampf(rgba[1], 0, 1) * 0xff);
         const uint32_t blue = (uint32_t)(clampf(rgba[2], 0, 1) * 0xff);

         packed[col] = (red << 8) | (green << 16) | (blue << 24);
         rgba += 4;
      }

      out_line += dst_stride;
      in_line += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of RGBA floats into A1R5G5B5_UNORM pixels.
 *
 * Each component is clamped to [0, 1], multiplied by its field's maximum
 * value and truncated.  Within the 16-bit word blue sits in bits 0-4, green
 * in 5-9, red in 10-14 and the single alpha bit in bit 15, following the
 * same "last-named channel in the low bits" layout as the other packed
 * writers in this file.  Both strides are in bytes.  (x0, y0) addresses the
 * first destination pixel, while input is always read from src onwards.
 */
static void
util_format_a1r5g5b5_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint16_t *dst_pixel = (uint16_t *)(dst_row + x0*2);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         uint16_t pixel = 0;
         /* Field widths must follow the channel they belong to: 5 bits for
          * each colour, 1 bit for alpha. */
         pixel |= (uint16_t)(clampf(src_pixel[2], 0, 1) * 0x1f);
         pixel |= ((uint16_t)(clampf(src_pixel[1], 0, 1) * 0x1f) << 5);
         pixel |= ((uint16_t)(clampf(src_pixel[0], 0, 1) * 0x1f) << 10);
         pixel |= ((uint16_t)(clampf(src_pixel[3], 0, 1) * 0x1) << 15);
         *dst_pixel++ = pixel;
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of RGBA floats into A4R4G4B4_UNORM pixels.
 *
 * Each component is clamped to [0, 1], multiplied by 0xf and truncated.
 * Within the 16-bit word blue sits in bits 0-3, green in 4-7, red in 8-11
 * and alpha in 12-15.  Both strides are in bytes.  (x0, y0) addresses the
 * first destination pixel, while input is always read from src onwards.
 */
static void
util_format_a4r4g4b4_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *out_line = dst + y0*dst_stride;
   const float *in_line = src;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      uint16_t *packed = (uint16_t *)(out_line + x0*2);
      const float *rgba = in_line;

      for (col = 0; col < w; col++) {
         const uint16_t blue = (uint16_t)(clampf(rgba[2], 0, 1) * 0xf);
         const uint16_t green = (uint16_t)(clampf(rgba[1], 0, 1) * 0xf);
         const uint16_t red = (uint16_t)(clampf(rgba[0], 0, 1) * 0xf);
         const uint16_t alpha = (uint16_t)(clampf(rgba[3], 0, 1) * 0xf);

         packed[col] = (uint16_t)(blue | (green << 4) | (red << 8) | (alpha << 12));
         rgba += 4;
      }

      out_line += dst_stride;
      in_line += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of RGBA floats into R5G6B5_UNORM pixels.
 *
 * Red, green and blue are clamped to [0, 1], multiplied by their field's
 * maximum value (0x1f, 0x3f, 0x1f) and truncated; the source alpha is
 * ignored.  Blue sits in bits 0-4, green in 5-10 and red in 11-15.  Both
 * strides are in bytes.  (x0, y0) addresses the first destination pixel,
 * while input is always read from src onwards.
 */
static void
util_format_r5g6b5_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *out_line = dst + y0*dst_stride;
   const float *in_line = src;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      uint16_t *packed = (uint16_t *)(out_line + x0*2);
      const float *rgba = in_line;

      for (col = 0; col < w; col++) {
         const uint16_t blue = (uint16_t)(clampf(rgba[2], 0, 1) * 0x1f);
         const uint16_t green = (uint16_t)(clampf(rgba[1], 0, 1) * 0x3f);
         const uint16_t red = (uint16_t)(clampf(rgba[0], 0, 1) * 0x1f);

         packed[col] = (uint16_t)(blue | (green << 5) | (red << 11));
         rgba += 4;
      }

      out_line += dst_stride;
      in_line += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of RGBA floats into A2B10G10R10_UNORM pixels.
 *
 * Each component is clamped to [0, 1], multiplied by its field's maximum
 * value (0x3ff for colours, 0x3 for alpha) and truncated.  Red sits in bits
 * 0-9, green in 10-19, blue in 20-29 and alpha in 30-31.  Both strides are
 * in bytes.  (x0, y0) addresses the first destination pixel, while input is
 * always read from src onwards.
 */
static void
util_format_a2b10g10r10_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *out_line = dst + y0*dst_stride;
   const float *in_line = src;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      uint32_t *packed = (uint32_t *)(out_line + x0*4);
      const float *rgba = in_line;

      for (col = 0; col < w; col++) {
         const uint32_t red = (uint32_t)(clampf(rgba[0], 0, 1) * 0x3ff);
         const uint32_t green = (uint32_t)(clampf(rgba[1], 0, 1) * 0x3ff);
         const uint32_t blue = (uint32_t)(clampf(rgba[2], 0, 1) * 0x3ff);
         const uint32_t alpha = (uint32_t)(clampf(rgba[3], 0, 1) * 0x3);

         packed[col] = red | (green << 10) | (blue << 20) | (alpha << 30);
         rgba += 4;
      }

      out_line += dst_stride;
      in_line += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of RGBA floats into L8_UNORM pixels.
 *
 * Luminance is taken from the red component (the first of the channels that
 * luminance expands to on read), clamped to [0, 1], multiplied by 0xff and
 * truncated; green, blue and alpha are ignored.  Both strides are in bytes.
 * (x0, y0) addresses the first destination pixel, while input is always
 * read from src onwards.
 */
static void
util_format_l8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint8_t *dst_pixel = (uint8_t *)(dst_row + x0*1);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         uint8_t pixel = 0;
         /* Luminance comes from red, not blue. */
         pixel |= (uint8_t)(clampf(src_pixel[0], 0, 1) * 0xff);
         *dst_pixel++ = pixel;
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of RGBA floats into A8_UNORM pixels.
 *
 * Only the alpha component is stored: it is clamped to [0, 1], multiplied by
 * 0xff and truncated.  Both strides are in bytes.  (x0, y0) addresses the
 * first destination pixel, while input is always read from src onwards.
 */
static void
util_format_a8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *out_line = dst + y0*dst_stride;
   const float *in_line = src;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      uint8_t *packed = (uint8_t *)(out_line + x0*1);
      const float *rgba = in_line;

      for (col = 0; col < w; col++, rgba += 4) {
         packed[col] = (uint8_t)(clampf(rgba[3], 0, 1) * 0xff);
      }

      out_line += dst_stride;
      in_line += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of RGBA floats into I8_UNORM pixels.
 *
 * Intensity is taken from the red component (the first of the channels that
 * intensity expands to on read), clamped to [0, 1], multiplied by 0xff and
 * truncated; green, blue and alpha are ignored.  Both strides are in bytes.
 * (x0, y0) addresses the first destination pixel, while input is always
 * read from src onwards.
 */
static void
util_format_i8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint8_t *dst_pixel = (uint8_t *)(dst_row + x0*1);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         uint8_t pixel = 0;
         /* Intensity comes from red, not alpha. */
         pixel |= (uint8_t)(clampf(src_pixel[0], 0, 1) * 0xff);
         *dst_pixel++ = pixel;
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of RGBA floats into A8L8_UNORM pixels.
 *
 * Luminance is taken from the red component and stored in bits 0-7; alpha
 * is stored in bits 8-15.  Both are clamped to [0, 1], multiplied by 0xff
 * and truncated; green and blue are ignored.  Both strides are in bytes.
 * (x0, y0) addresses the first destination pixel, while input is always
 * read from src onwards.
 */
static void
util_format_a8l8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint16_t *dst_pixel = (uint16_t *)(dst_row + x0*2);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         uint16_t pixel = 0;
         /* Luminance comes from red, not blue. */
         pixel |= (uint16_t)(clampf(src_pixel[0], 0, 1) * 0xff);
         pixel |= ((uint16_t)(clampf(src_pixel[3], 0, 1) * 0xff) << 8);
         *dst_pixel++ = pixel;
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of RGBA floats into L16_UNORM pixels.
 *
 * Luminance is taken from the red component, clamped to [0, 1], multiplied
 * by 0xffff and truncated; green, blue and alpha are ignored.  Both strides
 * are in bytes.  (x0, y0) addresses the first destination pixel, while
 * input is always read from src onwards.
 */
static void
util_format_l16_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint16_t *dst_pixel = (uint16_t *)(dst_row + x0*2);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         uint16_t pixel = 0;
         /* Luminance comes from red, not blue. */
         pixel |= (uint16_t)(clampf(src_pixel[0], 0, 1) * 0xffff);
         *dst_pixel++ = pixel;
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of 4-float pixels into Z16_UNORM values.
 *
 * Only the first float of each source pixel is used: it is clamped to
 * [0, 1], multiplied by 0xffff and truncated.  Both strides are in bytes.
 * (x0, y0) addresses the first destination pixel, while input is always
 * read from src onwards.
 */
static void
util_format_z16_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *out_line = dst + y0*dst_stride;
   const float *in_line = src;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      uint16_t *packed = (uint16_t *)(out_line + x0*2);
      const float *texel = in_line;

      for (col = 0; col < w; col++, texel += 4) {
         packed[col] = (uint16_t)(clampf(texel[0], 0, 1) * 0xffff);
      }

      out_line += dst_stride;
      in_line += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of 4-float pixels into Z32_UNORM values.
 *
 * Only the first float of each source pixel is used: it is clamped to
 * [0, 1], multiplied by 0xffffffff in double precision and truncated.  Both
 * strides are in bytes.  (x0, y0) addresses the first destination pixel,
 * while input is always read from src onwards.
 */
static void
util_format_z32_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *out_line = dst + y0*dst_stride;
   const float *in_line = src;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      uint32_t *packed = (uint32_t *)(out_line + x0*4);
      const float *texel = in_line;

      for (col = 0; col < w; col++, texel += 4) {
         packed[col] = (uint32_t)(clampf(texel[0], 0, 1) * (double)0xffffffff);
      }

      out_line += dst_stride;
      in_line += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of 4-float pixels into Z32_FLOAT values.
 *
 * The first float of each source pixel is copied unchanged (no clamping);
 * the other three are ignored.  Both strides are in bytes.  (x0, y0)
 * addresses the first destination pixel, while input is always read from
 * src onwards.
 */
static void
util_format_z32_float_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *out_line = dst + y0*dst_stride;
   const float *in_line = src;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      float *packed = (float *)(out_line + x0*4);
      const float *texel = in_line;

      for (col = 0; col < w; col++, texel += 4) {
         packed[col] = texel[0];
      }

      out_line += dst_stride;
      in_line += src_stride/sizeof(float);
   }
}

/**
 * Pack a w x h block of 4-float pixels into S8Z24_UNORM values.
 *
 * Only the first float of each source pixel is used: it is clamped to
 * [0, 1], multiplied by 0xffffff in double precision, truncated and placed
 * in bits 8-31.  Bits 0-7 are written as zero.  Both strides are in bytes.
 * (x0, y0) addresses the first destination pixel, while input is always
 * read from src onwards.
 */
static void
util_format_s8z24_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *out_line = dst + y0*dst_stride;
   const float *in_line = src;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      uint32_t *packed = (uint32_t *)(out_line + x0*4);
      const float *texel = in_line;

      for (col = 0; col < w; col++, texel += 4) {
         const uint32_t depth = (uint32_t)(clampf(texel[0], 0, 1) * (double)0xffffff);

         packed[col] = depth << 8;
      }

      out_line += dst_stride;
      in_line += src_stride/sizeof(float);
   }
}

/**
 * Write a w x h rectangle of 32-bit pixels holding a 24-bit unsigned
 * normalized value in the lower bits.
 *
 * The first float of each 4-float source pixel is clamped to [0, 1], scaled
 * by 0xffffff and truncated; the upper 8 bits of every written pixel are
 * zero.  x0/y0 offset the destination only; src_stride is the source row
 * pitch in bytes.
 */
static void
util_format_z24s8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint32_t *out = (uint32_t *)(out_row + x0*4);

      for (col = 0; col != w; ++col, in += 4) {
         const uint32_t z24 = (uint32_t)(clampf(in[0], 0.0f, 1.0f) * (double)0xffffff);
         *out++ = z24;
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of 32-bit pixels holding a 24-bit unsigned
 * normalized value in the upper bits.
 *
 * The first float of each 4-float source pixel is clamped to [0, 1], scaled
 * by 0xffffff, truncated and stored shifted left by 8; the low 8 bits of
 * every written pixel are zero.  x0/y0 offset the destination only;
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_x8z24_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint32_t *out = (uint32_t *)(out_row + x0*4);

      for (col = 0; col != w; ++col, in += 4) {
         const uint32_t z24 = (uint32_t)(clampf(in[0], 0.0f, 1.0f) * (double)0xffffff);
         *out++ = z24 << 8;
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of 32-bit pixels holding a 24-bit unsigned
 * normalized value in the lower bits.
 *
 * The first float of each 4-float source pixel is clamped to [0, 1], scaled
 * by 0xffffff and truncated; the upper 8 bits of every written pixel are
 * zero.  x0/y0 offset the destination only; src_stride is the source row
 * pitch in bytes.
 */
static void
util_format_z24x8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint32_t *out = (uint32_t *)(out_row + x0*4);

      for (col = 0; col != w; ++col, in += 4) {
         const uint32_t z24 = (uint32_t)(clampf(in[0], 0.0f, 1.0f) * (double)0xffffff);
         *out++ = z24;
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of one-component 64-bit float pixels.
 *
 * The first float of each 4-float source pixel is widened to double and
 * stored.  x0/y0 offset the destination only (8 bytes per pixel, dst_stride
 * bytes per row); src_stride is the source row pitch in bytes.
 */
static void
util_format_r64_float_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      double *out = (double *)(out_row + x0*8);

      for (col = 0; col != w; ++col, in += 4) {
         *out++ = (double)in[0];
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of two-component 64-bit float pixels.
 *
 * The first two floats of each 4-float source pixel are widened to double
 * and stored in order.  x0/y0 offset the destination only (16 bytes per
 * pixel, dst_stride bytes per row); src_stride is the source row pitch in
 * bytes.
 */
static void
util_format_r64g64_float_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      double *out = (double *)(out_row + x0*16);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 2; ++chan) {
            *out++ = (double)in[chan];
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of three-component 64-bit float pixels.
 *
 * The first three floats of each 4-float source pixel are widened to double
 * and stored in order.  x0/y0 offset the destination only (24 bytes per
 * pixel, dst_stride bytes per row); src_stride is the source row pitch in
 * bytes.
 */
static void
util_format_r64g64b64_float_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      double *out = (double *)(out_row + x0*24);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 3; ++chan) {
            *out++ = (double)in[chan];
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of four-component 64-bit float pixels.
 *
 * All four floats of each source pixel are widened to double and stored in
 * order.  x0/y0 offset the destination only (32 bytes per pixel, dst_stride
 * bytes per row); src_stride is the source row pitch in bytes.
 */
static void
util_format_r64g64b64a64_float_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      double *out = (double *)(out_row + x0*32);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 4; ++chan) {
            *out++ = (double)in[chan];
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of one-component 32-bit float pixels.
 *
 * The first float of each 4-float source pixel is copied unmodified.  x0/y0
 * offset the destination only (4 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r32_float_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      float *out = (float *)(out_row + x0*4);

      for (col = 0; col != w; ++col, in += 4) {
         *out++ = in[0];
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of two-component 32-bit float pixels.
 *
 * The first two floats of each 4-float source pixel are copied in order.
 * x0/y0 offset the destination only (8 bytes per pixel, dst_stride bytes per
 * row); src_stride is the source row pitch in bytes.
 */
static void
util_format_r32g32_float_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      float *out = (float *)(out_row + x0*8);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 2; ++chan) {
            *out++ = in[chan];
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of three-component 32-bit float pixels.
 *
 * The first three floats of each 4-float source pixel are copied in order.
 * x0/y0 offset the destination only (12 bytes per pixel, dst_stride bytes
 * per row); src_stride is the source row pitch in bytes.
 */
static void
util_format_r32g32b32_float_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      float *out = (float *)(out_row + x0*12);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 3; ++chan) {
            *out++ = in[chan];
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of four-component 32-bit float pixels.
 *
 * All four floats of each source pixel are copied in order.  x0/y0 offset
 * the destination only (16 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r32g32b32a32_float_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      float *out = (float *)(out_row + x0*16);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 4; ++chan) {
            *out++ = in[chan];
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of one-component 32-bit unsigned normalized pixels.
 *
 * The first float of each 4-float source pixel is clamped to [0, 1], scaled
 * by 0xffffffff in double precision and truncated to uint32_t.  x0/y0 offset
 * the destination only (4 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r32_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint32_t *out = (uint32_t *)(out_row + x0*4);

      for (col = 0; col != w; ++col, in += 4) {
         const float r = clampf(in[0], 0.0f, 1.0f);
         *out++ = (uint32_t)(r * (double)0xffffffff);
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of two-component 32-bit unsigned normalized pixels.
 *
 * The first two floats of each 4-float source pixel are each clamped to
 * [0, 1], scaled by 0xffffffff in double precision and truncated to
 * uint32_t.  x0/y0 offset the destination only (8 bytes per pixel,
 * dst_stride bytes per row); src_stride is the source row pitch in bytes.
 */
static void
util_format_r32g32_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint32_t *out = (uint32_t *)(out_row + x0*8);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 2; ++chan) {
            *out++ = (uint32_t)(clampf(in[chan], 0.0f, 1.0f) * (double)0xffffffff);
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of three-component 32-bit unsigned normalized
 * pixels.
 *
 * The first three floats of each 4-float source pixel are each clamped to
 * [0, 1], scaled by 0xffffffff in double precision and truncated to
 * uint32_t.  x0/y0 offset the destination only (12 bytes per pixel,
 * dst_stride bytes per row); src_stride is the source row pitch in bytes.
 */
static void
util_format_r32g32b32_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint32_t *out = (uint32_t *)(out_row + x0*12);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 3; ++chan) {
            *out++ = (uint32_t)(clampf(in[chan], 0.0f, 1.0f) * (double)0xffffffff);
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of four-component 32-bit unsigned normalized
 * pixels.
 *
 * All four floats of each source pixel are each clamped to [0, 1], scaled by
 * 0xffffffff in double precision and truncated to uint32_t.  x0/y0 offset
 * the destination only (16 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r32g32b32a32_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint32_t *out = (uint32_t *)(out_row + x0*16);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 4; ++chan) {
            *out++ = (uint32_t)(clampf(in[chan], 0.0f, 1.0f) * (double)0xffffffff);
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of one-component 32-bit unsigned scaled pixels.
 *
 * The first float of each 4-float source pixel is truncated to an integer
 * and saturated to [0, 0xffffffff]; NaN is stored as 0.  x0/y0 offset the
 * destination only (4 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r32_uscaled_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint32_t *dst_pixel = (uint32_t *)(dst_row + x0*4);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         const float v = src_pixel[0];
         /* Converting an out-of-range or NaN float to an unsigned integer is
          * undefined behaviour in C, so saturate explicitly.  !(v > 0) also
          * catches NaN. */
         if (!(v > 0.0f)) {
            *dst_pixel++ = 0;
         } else if (v >= 4294967296.0f) {
            *dst_pixel++ = 0xffffffff;
         } else {
            *dst_pixel++ = (uint32_t)v;
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Write a w x h rectangle of two-component 32-bit unsigned scaled pixels.
 *
 * The first two floats of each 4-float source pixel are each truncated to an
 * integer and saturated to [0, 0xffffffff]; NaN is stored as 0.  x0/y0
 * offset the destination only (8 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r32g32_uscaled_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y, chan;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint32_t *dst_pixel = (uint32_t *)(dst_row + x0*8);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         for (chan = 0; chan < 2; ++chan) {
            const float v = src_pixel[chan];
            /* Converting an out-of-range or NaN float to an unsigned integer
             * is undefined behaviour in C, so saturate explicitly.
             * !(v > 0) also catches NaN. */
            if (!(v > 0.0f)) {
               *dst_pixel++ = 0;
            } else if (v >= 4294967296.0f) {
               *dst_pixel++ = 0xffffffff;
            } else {
               *dst_pixel++ = (uint32_t)v;
            }
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Write a w x h rectangle of three-component 32-bit unsigned scaled pixels.
 *
 * The first three floats of each 4-float source pixel are each truncated to
 * an integer and saturated to [0, 0xffffffff]; NaN is stored as 0.  x0/y0
 * offset the destination only (12 bytes per pixel, dst_stride bytes per
 * row); src_stride is the source row pitch in bytes.
 */
static void
util_format_r32g32b32_uscaled_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y, chan;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint32_t *dst_pixel = (uint32_t *)(dst_row + x0*12);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         for (chan = 0; chan < 3; ++chan) {
            const float v = src_pixel[chan];
            /* Converting an out-of-range or NaN float to an unsigned integer
             * is undefined behaviour in C, so saturate explicitly.
             * !(v > 0) also catches NaN. */
            if (!(v > 0.0f)) {
               *dst_pixel++ = 0;
            } else if (v >= 4294967296.0f) {
               *dst_pixel++ = 0xffffffff;
            } else {
               *dst_pixel++ = (uint32_t)v;
            }
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Write a w x h rectangle of four-component 32-bit unsigned scaled pixels.
 *
 * All four floats of each source pixel are each truncated to an integer and
 * saturated to [0, 0xffffffff]; NaN is stored as 0.  x0/y0 offset the
 * destination only (16 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r32g32b32a32_uscaled_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y, chan;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint32_t *dst_pixel = (uint32_t *)(dst_row + x0*16);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         for (chan = 0; chan < 4; ++chan) {
            const float v = src_pixel[chan];
            /* Converting an out-of-range or NaN float to an unsigned integer
             * is undefined behaviour in C, so saturate explicitly.
             * !(v > 0) also catches NaN. */
            if (!(v > 0.0f)) {
               *dst_pixel++ = 0;
            } else if (v >= 4294967296.0f) {
               *dst_pixel++ = 0xffffffff;
            } else {
               *dst_pixel++ = (uint32_t)v;
            }
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Write a w x h rectangle of one-component 16-bit unsigned normalized pixels.
 *
 * The first float of each 4-float source pixel is clamped to [0, 1], scaled
 * by 0xffff and truncated to uint16_t.  x0/y0 offset the destination only
 * (2 bytes per pixel, dst_stride bytes per row); src_stride is the source
 * row pitch in bytes.
 */
static void
util_format_r16_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint16_t *out = (uint16_t *)(out_row + x0*2);

      for (col = 0; col != w; ++col, in += 4) {
         *out++ = (uint16_t)(clampf(in[0], 0.0f, 1.0f) * 0xffff);
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of two-component 16-bit unsigned normalized pixels.
 *
 * The first two floats of each 4-float source pixel are each clamped to
 * [0, 1], scaled by 0xffff and truncated to uint16_t.  x0/y0 offset the
 * destination only (4 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r16g16_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint16_t *out = (uint16_t *)(out_row + x0*4);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 2; ++chan) {
            *out++ = (uint16_t)(clampf(in[chan], 0.0f, 1.0f) * 0xffff);
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of three-component 16-bit unsigned normalized
 * pixels.
 *
 * The first three floats of each 4-float source pixel are each clamped to
 * [0, 1], scaled by 0xffff and truncated to uint16_t.  x0/y0 offset the
 * destination only (6 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r16g16b16_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint16_t *out = (uint16_t *)(out_row + x0*6);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 3; ++chan) {
            *out++ = (uint16_t)(clampf(in[chan], 0.0f, 1.0f) * 0xffff);
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of four-component 16-bit unsigned normalized
 * pixels.
 *
 * All four floats of each source pixel are each clamped to [0, 1], scaled by
 * 0xffff and truncated to uint16_t.  x0/y0 offset the destination only
 * (8 bytes per pixel, dst_stride bytes per row); src_stride is the source
 * row pitch in bytes.
 */
static void
util_format_r16g16b16a16_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint16_t *out = (uint16_t *)(out_row + x0*8);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 4; ++chan) {
            *out++ = (uint16_t)(clampf(in[chan], 0.0f, 1.0f) * 0xffff);
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of one-component 16-bit unsigned scaled pixels.
 *
 * The first float of each 4-float source pixel is truncated to an integer
 * and saturated to [0, 0xffff]; NaN is stored as 0.  x0/y0 offset the
 * destination only (2 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r16_uscaled_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint16_t *dst_pixel = (uint16_t *)(dst_row + x0*2);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         const float v = src_pixel[0];
         /* Converting an out-of-range or NaN float to an unsigned integer is
          * undefined behaviour in C, so saturate explicitly.  !(v > 0) also
          * catches NaN. */
         if (!(v > 0.0f)) {
            *dst_pixel++ = 0;
         } else if (v >= 65535.0f) {
            *dst_pixel++ = 0xffff;
         } else {
            *dst_pixel++ = (uint16_t)v;
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Write a w x h rectangle of two-component 16-bit unsigned scaled pixels.
 *
 * The first two floats of each 4-float source pixel are each truncated to an
 * integer and saturated to [0, 0xffff]; NaN is stored as 0.  x0/y0 offset
 * the destination only (4 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r16g16_uscaled_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y, chan;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint16_t *dst_pixel = (uint16_t *)(dst_row + x0*4);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         for (chan = 0; chan < 2; ++chan) {
            const float v = src_pixel[chan];
            /* Converting an out-of-range or NaN float to an unsigned integer
             * is undefined behaviour in C, so saturate explicitly.
             * !(v > 0) also catches NaN. */
            if (!(v > 0.0f)) {
               *dst_pixel++ = 0;
            } else if (v >= 65535.0f) {
               *dst_pixel++ = 0xffff;
            } else {
               *dst_pixel++ = (uint16_t)v;
            }
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Write a w x h rectangle of three-component 16-bit unsigned scaled pixels.
 *
 * The first three floats of each 4-float source pixel are each truncated to
 * an integer and saturated to [0, 0xffff]; NaN is stored as 0.  x0/y0 offset
 * the destination only (6 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r16g16b16_uscaled_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y, chan;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint16_t *dst_pixel = (uint16_t *)(dst_row + x0*6);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         for (chan = 0; chan < 3; ++chan) {
            const float v = src_pixel[chan];
            /* Converting an out-of-range or NaN float to an unsigned integer
             * is undefined behaviour in C, so saturate explicitly.
             * !(v > 0) also catches NaN. */
            if (!(v > 0.0f)) {
               *dst_pixel++ = 0;
            } else if (v >= 65535.0f) {
               *dst_pixel++ = 0xffff;
            } else {
               *dst_pixel++ = (uint16_t)v;
            }
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Write a w x h rectangle of four-component 16-bit unsigned scaled pixels.
 *
 * All four floats of each source pixel are each truncated to an integer and
 * saturated to [0, 0xffff]; NaN is stored as 0.  x0/y0 offset the
 * destination only (8 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r16g16b16a16_uscaled_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y, chan;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint16_t *dst_pixel = (uint16_t *)(dst_row + x0*8);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         for (chan = 0; chan < 4; ++chan) {
            const float v = src_pixel[chan];
            /* Converting an out-of-range or NaN float to an unsigned integer
             * is undefined behaviour in C, so saturate explicitly.
             * !(v > 0) also catches NaN. */
            if (!(v > 0.0f)) {
               *dst_pixel++ = 0;
            } else if (v >= 65535.0f) {
               *dst_pixel++ = 0xffff;
            } else {
               *dst_pixel++ = (uint16_t)v;
            }
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Write a w x h rectangle of one-component 8-bit unsigned normalized pixels.
 *
 * The first float of each 4-float source pixel is clamped to [0, 1], scaled
 * by 0xff and truncated to uint8_t.  x0/y0 offset the destination only
 * (1 byte per pixel, dst_stride bytes per row); src_stride is the source row
 * pitch in bytes.
 */
static void
util_format_r8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint8_t *out = (uint8_t *)(out_row + x0*1);

      for (col = 0; col != w; ++col, in += 4) {
         *out++ = (uint8_t)(clampf(in[0], 0.0f, 1.0f) * 0xff);
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of two-component 8-bit unsigned normalized pixels.
 *
 * The first two floats of each 4-float source pixel are each clamped to
 * [0, 1], scaled by 0xff and truncated to uint8_t.  x0/y0 offset the
 * destination only (2 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r8g8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint8_t *out = (uint8_t *)(out_row + x0*2);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 2; ++chan) {
            *out++ = (uint8_t)(clampf(in[chan], 0.0f, 1.0f) * 0xff);
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of three-component 8-bit unsigned normalized
 * pixels.
 *
 * The first three floats of each 4-float source pixel are each clamped to
 * [0, 1], scaled by 0xff and truncated to uint8_t.  x0/y0 offset the
 * destination only (3 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r8g8b8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint8_t *out = (uint8_t *)(out_row + x0*3);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 3; ++chan) {
            *out++ = (uint8_t)(clampf(in[chan], 0.0f, 1.0f) * 0xff);
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of four-component 8-bit unsigned normalized pixels.
 *
 * All four floats of each source pixel are each clamped to [0, 1], scaled by
 * 0xff and truncated to uint8_t.  x0/y0 offset the destination only
 * (4 bytes per pixel, dst_stride bytes per row); src_stride is the source
 * row pitch in bytes.
 */
static void
util_format_r8g8b8a8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float *in_row = src;
   uint8_t *out_row = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row != h; ++row) {
      const float *in = in_row;
      uint8_t *out = (uint8_t *)(out_row + x0*4);

      for (col = 0; col != w; ++col, in += 4) {
         for (chan = 0; chan != 4; ++chan) {
            *out++ = (uint8_t)(clampf(in[chan], 0.0f, 1.0f) * 0xff);
         }
      }

      in_row += src_stride/sizeof(float);
      out_row += dst_stride;
   }
}

/**
 * Write a w x h rectangle of 8-bit unsigned normalized pixels that occupy
 * four bytes each: three stored components followed by one padding byte.
 *
 * The first three floats of each 4-float source pixel are each clamped to
 * [0, 1], scaled by 0xff and truncated to uint8_t; the fourth destination
 * byte is written as 0.  x0/y0 offset the destination only (4 bytes per
 * pixel, dst_stride bytes per row); src_stride is the source row pitch in
 * bytes.
 */
static void
util_format_r8g8b8x8_unorm_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y, chan;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint8_t *dst_pixel = (uint8_t *)(dst_row + x0*4);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         for (chan = 0; chan < 3; ++chan) {
            dst_pixel[chan] = (uint8_t)(clampf(src_pixel[chan], 0, 1) * 0xff);
         }
         /* The pixel is 4 bytes wide (see the x0*4 row offset), so the
          * padding byte must be stepped over too; otherwise every pixel
          * after the first lands at the wrong offset. */
         dst_pixel[3] = 0;
         dst_pixel += 4;
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Write a w x h rectangle of one-component 8-bit unsigned scaled pixels.
 *
 * The first float of each 4-float source pixel is truncated to an integer
 * and saturated to [0, 0xff]; NaN is stored as 0.  x0/y0 offset the
 * destination only (1 byte per pixel, dst_stride bytes per row); src_stride
 * is the source row pitch in bytes.
 */
static void
util_format_r8_uscaled_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint8_t *dst_pixel = (uint8_t *)(dst_row + x0*1);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         const float v = src_pixel[0];
         /* Converting an out-of-range or NaN float to an unsigned integer is
          * undefined behaviour in C, so saturate explicitly.  !(v > 0) also
          * catches NaN. */
         if (!(v > 0.0f)) {
            *dst_pixel++ = 0;
         } else if (v >= 255.0f) {
            *dst_pixel++ = 0xff;
         } else {
            *dst_pixel++ = (uint8_t)v;
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Write a w x h rectangle of two-component 8-bit unsigned scaled pixels.
 *
 * The first two floats of each 4-float source pixel are each truncated to an
 * integer and saturated to [0, 0xff]; NaN is stored as 0.  x0/y0 offset the
 * destination only (2 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r8g8_uscaled_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y, chan;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint8_t *dst_pixel = (uint8_t *)(dst_row + x0*2);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         for (chan = 0; chan < 2; ++chan) {
            const float v = src_pixel[chan];
            /* Converting an out-of-range or NaN float to an unsigned integer
             * is undefined behaviour in C, so saturate explicitly.
             * !(v > 0) also catches NaN. */
            if (!(v > 0.0f)) {
               *dst_pixel++ = 0;
            } else if (v >= 255.0f) {
               *dst_pixel++ = 0xff;
            } else {
               *dst_pixel++ = (uint8_t)v;
            }
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Write a w x h rectangle of three-component 8-bit unsigned scaled pixels.
 *
 * The first three floats of each 4-float source pixel are each truncated to
 * an integer and saturated to [0, 0xff]; NaN is stored as 0.  x0/y0 offset
 * the destination only (3 bytes per pixel, dst_stride bytes per row);
 * src_stride is the source row pitch in bytes.
 */
static void
util_format_r8g8b8_uscaled_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y, chan;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint8_t *dst_pixel = (uint8_t *)(dst_row + x0*3);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         for (chan = 0; chan < 3; ++chan) {
            const float v = src_pixel[chan];
            /* Converting an out-of-range or NaN float to an unsigned integer
             * is undefined behaviour in C, so saturate explicitly.
             * !(v > 0) also catches NaN. */
            if (!(v > 0.0f)) {
               *dst_pixel++ = 0;
            } else if (v >= 255.0f) {
               *dst_pixel++ = 0xff;
            } else {
               *dst_pixel++ = (uint8_t)v;
            }
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Write a w x h rectangle of four-component 8-bit unsigned scaled pixels.
 *
 * All four floats of each source pixel are each truncated to an integer and
 * saturated to [0, 0xff]; NaN is stored as 0.  x0/y0 offset the destination
 * only (4 bytes per pixel, dst_stride bytes per row); src_stride is the
 * source row pitch in bytes.
 */
static void
util_format_r8g8b8a8_uscaled_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y, chan;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint8_t *dst_pixel = (uint8_t *)(dst_row + x0*4);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         for (chan = 0; chan < 4; ++chan) {
            const float v = src_pixel[chan];
            /* Converting an out-of-range or NaN float to an unsigned integer
             * is undefined behaviour in C, so saturate explicitly.
             * !(v > 0) also catches NaN. */
            if (!(v > 0.0f)) {
               *dst_pixel++ = 0;
            } else if (v >= 255.0f) {
               *dst_pixel++ = 0xff;
            } else {
               *dst_pixel++ = (uint8_t)v;
            }
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Write a w x h rectangle of 8-bit unsigned scaled pixels that occupy four
 * bytes each: three stored components followed by one padding byte.
 *
 * The first three floats of each 4-float source pixel are each truncated to
 * an integer and saturated to [0, 0xff] (NaN is stored as 0); the fourth
 * destination byte is written as 0.  x0/y0 offset the destination only
 * (4 bytes per pixel, dst_stride bytes per row); src_stride is the source
 * row pitch in bytes.
 */
static void
util_format_r8g8b8x8_uscaled_write_4f(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y, chan;
   uint8_t *dst_row = dst + y0*dst_stride;
   const float *src_row = src;
   for (y = 0; y < h; ++y) {
      uint8_t *dst_pixel = (uint8_t *)(dst_row + x0*4);
      const float *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         for (chan = 0; chan < 3; ++chan) {
            const float v = src_pixel[chan];
            /* Converting an out-of-range or NaN float to an unsigned integer
             * is undefined behaviour in C, so saturate explicitly.
             * !(v > 0) also catches NaN. */
            if (!(v > 0.0f)) {
               dst_pixel[chan] = 0;
            } else if (v >= 255.0f) {
               dst_pixel[chan] = 0xff;
            } else {
               dst_pixel[chan] = (uint8_t)v;
            }
         }
         /* The pixel is 4 bytes wide (see the x0*4 row offset), so the
          * padding byte must be stepped over too; otherwise every pixel
          * after the first lands at the wrong offset. */
         dst_pixel[3] = 0;
         dst_pixel += 4;
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(float);
   }
}

/**
 * Write a rectangle of RGBA float pixels into a surface of the given format.
 *
 * Selects the per-format writer matching `format` and forwards every
 * argument to it, with dst viewed as a byte pointer.  For a format that has
 * no writer listed here, "unsupported format" is reported through
 * debug_printf() and nothing is written.
 */
void
util_format_write_4f(enum pipe_format format, const float *src, unsigned src_stride, void *dst, unsigned dst_stride, unsigned x, unsigned y, unsigned w, unsigned h)
{
   void (*writer)(const float *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h);

/* One switch arm: PIPE_FORMAT_<FORMAT> selects util_format_<name>_write_4f. */
#define WRITE_4F_CASE(FORMAT, name) \
   case PIPE_FORMAT_##FORMAT: \
      writer = &util_format_##name##_write_4f; \
      break

   switch(format) {
   WRITE_4F_CASE(A8R8G8B8_UNORM, a8r8g8b8_unorm);
   WRITE_4F_CASE(X8R8G8B8_UNORM, x8r8g8b8_unorm);
   WRITE_4F_CASE(B8G8R8A8_UNORM, b8g8r8a8_unorm);
   WRITE_4F_CASE(B8G8R8X8_UNORM, b8g8r8x8_unorm);
   WRITE_4F_CASE(A1R5G5B5_UNORM, a1r5g5b5_unorm);
   WRITE_4F_CASE(A4R4G4B4_UNORM, a4r4g4b4_unorm);
   WRITE_4F_CASE(R5G6B5_UNORM, r5g6b5_unorm);
   WRITE_4F_CASE(A2B10G10R10_UNORM, a2b10g10r10_unorm);
   WRITE_4F_CASE(L8_UNORM, l8_unorm);
   WRITE_4F_CASE(A8_UNORM, a8_unorm);
   WRITE_4F_CASE(I8_UNORM, i8_unorm);
   WRITE_4F_CASE(A8L8_UNORM, a8l8_unorm);
   WRITE_4F_CASE(L16_UNORM, l16_unorm);
   WRITE_4F_CASE(Z16_UNORM, z16_unorm);
   WRITE_4F_CASE(Z32_UNORM, z32_unorm);
   WRITE_4F_CASE(Z32_FLOAT, z32_float);
   WRITE_4F_CASE(S8Z24_UNORM, s8z24_unorm);
   WRITE_4F_CASE(Z24S8_UNORM, z24s8_unorm);
   WRITE_4F_CASE(X8Z24_UNORM, x8z24_unorm);
   WRITE_4F_CASE(Z24X8_UNORM, z24x8_unorm);
   WRITE_4F_CASE(R64_FLOAT, r64_float);
   WRITE_4F_CASE(R64G64_FLOAT, r64g64_float);
   WRITE_4F_CASE(R64G64B64_FLOAT, r64g64b64_float);
   WRITE_4F_CASE(R64G64B64A64_FLOAT, r64g64b64a64_float);
   WRITE_4F_CASE(R32_FLOAT, r32_float);
   WRITE_4F_CASE(R32G32_FLOAT, r32g32_float);
   WRITE_4F_CASE(R32G32B32_FLOAT, r32g32b32_float);
   WRITE_4F_CASE(R32G32B32A32_FLOAT, r32g32b32a32_float);
   WRITE_4F_CASE(R32_UNORM, r32_unorm);
   WRITE_4F_CASE(R32G32_UNORM, r32g32_unorm);
   WRITE_4F_CASE(R32G32B32_UNORM, r32g32b32_unorm);
   WRITE_4F_CASE(R32G32B32A32_UNORM, r32g32b32a32_unorm);
   WRITE_4F_CASE(R32_USCALED, r32_uscaled);
   WRITE_4F_CASE(R32G32_USCALED, r32g32_uscaled);
   WRITE_4F_CASE(R32G32B32_USCALED, r32g32b32_uscaled);
   WRITE_4F_CASE(R32G32B32A32_USCALED, r32g32b32a32_uscaled);
   WRITE_4F_CASE(R16_UNORM, r16_unorm);
   WRITE_4F_CASE(R16G16_UNORM, r16g16_unorm);
   WRITE_4F_CASE(R16G16B16_UNORM, r16g16b16_unorm);
   WRITE_4F_CASE(R16G16B16A16_UNORM, r16g16b16a16_unorm);
   WRITE_4F_CASE(R16_USCALED, r16_uscaled);
   WRITE_4F_CASE(R16G16_USCALED, r16g16_uscaled);
   WRITE_4F_CASE(R16G16B16_USCALED, r16g16b16_uscaled);
   WRITE_4F_CASE(R16G16B16A16_USCALED, r16g16b16a16_uscaled);
   WRITE_4F_CASE(R8_UNORM, r8_unorm);
   WRITE_4F_CASE(R8G8_UNORM, r8g8_unorm);
   WRITE_4F_CASE(R8G8B8_UNORM, r8g8b8_unorm);
   WRITE_4F_CASE(R8G8B8A8_UNORM, r8g8b8a8_unorm);
   WRITE_4F_CASE(R8G8B8X8_UNORM, r8g8b8x8_unorm);
   WRITE_4F_CASE(R8_USCALED, r8_uscaled);
   WRITE_4F_CASE(R8G8_USCALED, r8g8_uscaled);
   WRITE_4F_CASE(R8G8B8_USCALED, r8g8b8_uscaled);
   WRITE_4F_CASE(R8G8B8A8_USCALED, r8g8b8a8_uscaled);
   WRITE_4F_CASE(R8G8B8X8_USCALED, r8g8b8x8_uscaled);
   default:
      debug_printf("unsupported format\n");
      return;
   }

#undef WRITE_4F_CASE

   writer(src, src_stride, (uint8_t *)dst, dst_stride, x, y, w, h);
}

/**
 * Read a w x h rectangle of 32-bit pixels into 4-byte-per-pixel output.
 *
 * Each source pixel is loaded as a uint32_t and split into bytes: bits 16-23
 * go to output byte 0 (r), bits 8-15 to byte 1 (g), bits 0-7 to byte 2 (b)
 * and bits 24-31 to byte 3 (a).  x0/y0 offset the source only (4 bytes per
 * pixel, src_stride bytes per row); dst_stride is the output row pitch in
 * bytes.
 */
static void
util_format_a8r8g8b8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_row = src + y0*src_stride;
   uint8_t *out_row = dst;
   unsigned row, col;

   for (row = 0; row != h; ++row) {
      const uint32_t *in = (const uint32_t *)(in_row + x0*4);
      uint8_t *out = out_row;

      for (col = 0; col != w; ++col, out += 4) {
         const uint32_t texel = in[col];
         out[0] = (uint8_t)(texel >> 16); /* r */
         out[1] = (uint8_t)(texel >> 8);  /* g */
         out[2] = (uint8_t)texel;         /* b */
         out[3] = (uint8_t)(texel >> 24); /* a */
      }

      in_row += src_stride;
      out_row += dst_stride/sizeof(uint8_t);
   }
}

/**
 * Read a rectangle of X8R8G8B8_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (4 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  The top byte of each pixel is ignored; alpha is written as
 * 255, the 8-bit unorm encoding of 1.0.
 */
static void
util_format_x8r8g8b8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*4);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint32_t pixel = *src_pixel++;
         uint8_t b = (pixel & 0xff);
         uint8_t g = ((pixel >> 8) & 0xff);
         uint8_t r = ((pixel >> 16) & 0xff);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = b; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of B8G8R8A8_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (4 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Each 32-bit pixel holds alpha in bits 0..7, red in 8..15,
 * green in 16..23 and blue in 24..31.
 */
static void
util_format_b8g8r8a8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   uint8_t *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint32_t *in = (const uint32_t *)(in_line + x0*4);
      uint8_t *out = out_line;

      for (col = 0; col < w; col++) {
         const uint32_t value = in[col];

         out[0] = (uint8_t)(value >> 8);  /* r */
         out[1] = (uint8_t)(value >> 16); /* g */
         out[2] = (uint8_t)(value >> 24); /* b */
         out[3] = (uint8_t)value;         /* a */
         out += 4;
      }
      in_line += src_stride;
      out_line += dst_stride;
   }
}

/**
 * Read a rectangle of B8G8R8X8_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (4 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  The low byte of each pixel is ignored; alpha is written as
 * 255, the 8-bit unorm encoding of 1.0.
 */
static void
util_format_b8g8r8x8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*4);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint32_t pixel = *src_pixel++;
         uint8_t r = ((pixel >> 8) & 0xff);
         uint8_t g = ((pixel >> 16) & 0xff);
         uint8_t b = (pixel >> 24);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = b; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of A1R5G5B5_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (2 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.
 *
 * Each 16-bit pixel is decoded with blue in bits 0..4, green in 5..9, red
 * in 10..14 and the single alpha bit in bit 15.  The previous code decoded
 * a 1-bit blue and a 5-bit alpha, which does not match the channel widths
 * in the format name.  Channels are rescaled to 0..255 by v * 255 / max.
 */
static void
util_format_a1r5g5b5_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint16_t *src_pixel = (const uint16_t *)(src_row + x0*2);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint32_t pixel = *src_pixel++;
         uint8_t b = (uint8_t)((pixel & 0x1f) * 0xff / 0x1f);
         uint8_t g = (uint8_t)(((pixel >> 5) & 0x1f) * 0xff / 0x1f);
         uint8_t r = (uint8_t)(((pixel >> 10) & 0x1f) * 0xff / 0x1f);
         uint8_t a = (uint8_t)((pixel >> 15) * 0xff);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = b; /* b */
         *dst_pixel++ = a; /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of A4R4G4B4_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (2 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Each 16-bit pixel holds blue in bits 0..3, green in 4..7,
 * red in 8..11 and alpha in 12..15; every nibble is rescaled to 0..255 by
 * v * 255 / 15.
 */
static void
util_format_a4r4g4b4_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   uint8_t *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint16_t *in = (const uint16_t *)(in_line + x0*2);
      uint8_t *out = out_line;

      for (col = 0; col < w; col++) {
         const unsigned value = in[col];

         out[0] = (uint8_t)(((value >> 8) & 0xf) * 0xff / 0xf); /* r */
         out[1] = (uint8_t)(((value >> 4) & 0xf) * 0xff / 0xf); /* g */
         out[2] = (uint8_t)((value & 0xf) * 0xff / 0xf);        /* b */
         out[3] = (uint8_t)((value >> 12) * 0xff / 0xf);        /* a */
         out += 4;
      }
      in_line += src_stride;
      out_line += dst_stride;
   }
}

/**
 * Read a rectangle of R5G6B5_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (2 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Each 16-bit pixel holds blue in bits 0..4, green in 5..10
 * and red in 11..15; channels are rescaled to 0..255 by v * 255 / max.
 * The format has no alpha, so alpha is written as 255 (1.0 as 8-bit unorm).
 */
static void
util_format_r5g6b5_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint16_t *src_pixel = (const uint16_t *)(src_row + x0*2);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint32_t pixel = *src_pixel++;
         uint8_t b = (uint8_t)((pixel & 0x1f) * 0xff / 0x1f);
         uint8_t g = (uint8_t)(((pixel >> 5) & 0x3f) * 0xff / 0x3f);
         uint8_t r = (uint8_t)((pixel >> 11) * 0xff / 0x1f);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = b; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of A2B10G10R10_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (4 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Each 32-bit pixel holds red in bits 0..9, green in 10..19,
 * blue in 20..29 and alpha in 30..31.  The 10-bit channels are reduced to
 * 8 bits by dropping their two low bits; the 2-bit alpha is rescaled by
 * v * 255 / 3.
 */
static void
util_format_a2b10g10r10_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   uint8_t *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint32_t *in = (const uint32_t *)(in_line + x0*4);
      uint8_t *out = out_line;

      for (col = 0; col < w; col++) {
         const uint32_t value = in[col];

         out[0] = (uint8_t)((value & 0x3ff) >> 2);         /* r */
         out[1] = (uint8_t)(((value >> 10) & 0x3ff) >> 2); /* g */
         out[2] = (uint8_t)(((value >> 20) & 0x3ff) >> 2); /* b */
         out[3] = (uint8_t)((value >> 30) * 0xff / 0x3);   /* a */
         out += 4;
      }
      in_line += src_stride;
      out_line += dst_stride;
   }
}

/**
 * Read a rectangle of L8_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (1 byte per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  The luminance byte is replicated to R, G and B; alpha is
 * written as 255 (1.0 as 8-bit unorm).
 */
static void
util_format_l8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint8_t *src_pixel = (const uint8_t *)(src_row + x0*1);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t pixel = *src_pixel++;
         uint8_t rgb = pixel;
         *dst_pixel++ = rgb; /* r */
         *dst_pixel++ = rgb; /* g */
         *dst_pixel++ = rgb; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of A8_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (1 byte per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  R, G and B are written as 0 and the source byte becomes
 * alpha.
 */
static void
util_format_a8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   uint8_t *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint8_t *in = in_line + x0*1;
      uint8_t *out = out_line;

      for (col = 0; col < w; col++) {
         out[0] = 0;       /* r */
         out[1] = 0;       /* g */
         out[2] = 0;       /* b */
         out[3] = in[col]; /* a */
         out += 4;
      }
      in_line += src_stride;
      out_line += dst_stride;
   }
}

/**
 * Read a rectangle of I8_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (1 byte per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  The intensity byte is replicated to all four channels.
 */
static void
util_format_i8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   uint8_t *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint8_t *in = in_line + x0*1;
      uint8_t *out = out_line;

      for (col = 0; col < w; col++) {
         const uint8_t intensity = in[col];

         out[0] = intensity; /* r */
         out[1] = intensity; /* g */
         out[2] = intensity; /* b */
         out[3] = intensity; /* a */
         out += 4;
      }
      in_line += src_stride;
      out_line += dst_stride;
   }
}

/**
 * Read a rectangle of A8L8_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (2 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  The low byte of each 16-bit pixel is luminance, replicated
 * to R, G and B; the high byte is alpha.
 */
static void
util_format_a8l8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   uint8_t *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint16_t *in = (const uint16_t *)(in_line + x0*2);
      uint8_t *out = out_line;

      for (col = 0; col < w; col++) {
         const uint16_t value = in[col];
         const uint8_t luminance = (uint8_t)value;

         out[0] = luminance;             /* r */
         out[1] = luminance;             /* g */
         out[2] = luminance;             /* b */
         out[3] = (uint8_t)(value >> 8); /* a */
         out += 4;
      }
      in_line += src_stride;
      out_line += dst_stride;
   }
}

/**
 * Read a rectangle of L16_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (2 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  The high byte of the 16-bit luminance is replicated to
 * R, G and B; alpha is written as 255 (1.0 as 8-bit unorm).
 */
static void
util_format_l16_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint16_t *src_pixel = (const uint16_t *)(src_row + x0*2);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint16_t pixel = *src_pixel++;
         uint8_t rgb = (uint8_t)(pixel >> 8);
         *dst_pixel++ = rgb; /* r */
         *dst_pixel++ = rgb; /* g */
         *dst_pixel++ = rgb; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of Z16_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (2 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  The high byte of the 16-bit depth value is replicated to
 * R, G and B; alpha is written as 255 (1.0 as 8-bit unorm).
 */
static void
util_format_z16_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint16_t *src_pixel = (const uint16_t *)(src_row + x0*2);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t z = (uint8_t)((*src_pixel++) >> 8);
         *dst_pixel++ = z; /* r */
         *dst_pixel++ = z; /* g */
         *dst_pixel++ = z; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of Z32_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (4 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  The most significant byte of the 32-bit depth value is
 * replicated to R, G and B; alpha is written as 255 (1.0 as 8-bit unorm).
 */
static void
util_format_z32_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*4);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t z = (uint8_t)((*src_pixel++) >> 24);
         *dst_pixel++ = z; /* r */
         *dst_pixel++ = z; /* g */
         *dst_pixel++ = z; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of Z32_FLOAT pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (4 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  The float depth is clamped to [0, 1], scaled by 255 and
 * truncated to a byte that is replicated to R, G and B; alpha is written
 * as 255 (1.0 as 8-bit unorm).
 */
static void
util_format_z32_float_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const float *src_pixel = (const float *)(src_row + x0*4);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t z = (uint8_t)(clampf((*src_pixel++), 0, 1) * 0xff);
         *dst_pixel++ = z; /* r */
         *dst_pixel++ = z; /* g */
         *dst_pixel++ = z; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of S8Z24_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (4 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Depth is taken as the 24-bit field in bits 8..31 of each
 * pixel; its top 8 bits are replicated to R, G and B.  Alpha is written as
 * 255 (1.0 as 8-bit unorm).
 *
 * NOTE(review): confirm that depth really occupies bits 8..31 for this
 * format; other formats in this file name the most significant channel
 * first, which would put the stencil byte on top instead.
 */
static void
util_format_s8z24_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*4);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint32_t pixel = *src_pixel++;
         uint8_t z = (uint8_t)((pixel >> 8) >> 16);
         *dst_pixel++ = z; /* r */
         *dst_pixel++ = z; /* g */
         *dst_pixel++ = z; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of Z24S8_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (4 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Depth is taken as the 24-bit field in bits 0..23 of each
 * pixel; its top 8 bits are replicated to R, G and B.  Alpha is written as
 * 255 (1.0 as 8-bit unorm).
 *
 * NOTE(review): confirm that depth really occupies bits 0..23 for this
 * format; other formats in this file name the most significant channel
 * first, which would put depth in the upper 24 bits instead.
 */
static void
util_format_z24s8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*4);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint32_t pixel = *src_pixel++;
         uint8_t z = (uint8_t)((pixel & 0xffffff) >> 16);
         *dst_pixel++ = z; /* r */
         *dst_pixel++ = z; /* g */
         *dst_pixel++ = z; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of X8Z24_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (4 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Depth is taken as the 24-bit field in bits 8..31 of each
 * pixel; its top 8 bits are replicated to R, G and B.  Alpha is written as
 * 255 (1.0 as 8-bit unorm).
 *
 * NOTE(review): confirm that depth really occupies bits 8..31 for this
 * format; other formats in this file name the most significant channel
 * first, which would put the unused byte on top instead.
 */
static void
util_format_x8z24_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*4);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint32_t pixel = *src_pixel++;
         uint8_t z = (uint8_t)((pixel >> 8) >> 16);
         *dst_pixel++ = z; /* r */
         *dst_pixel++ = z; /* g */
         *dst_pixel++ = z; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of Z24X8_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (4 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Depth is taken as the 24-bit field in bits 0..23 of each
 * pixel; its top 8 bits are replicated to R, G and B.  Alpha is written as
 * 255 (1.0 as 8-bit unorm).
 *
 * NOTE(review): confirm that depth really occupies bits 0..23 for this
 * format; other formats in this file name the most significant channel
 * first, which would put depth in the upper 24 bits instead.
 */
static void
util_format_z24x8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*4);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint32_t pixel = *src_pixel++;
         uint8_t z = (uint8_t)((pixel & 0xffffff) >> 16);
         *dst_pixel++ = z; /* r */
         *dst_pixel++ = z; /* g */
         *dst_pixel++ = z; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R64_FLOAT pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (8 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Red is the double clamped to [0, 1], scaled by 255 and
 * truncated; green and blue are 0; alpha is written as 255 (1.0 as 8-bit
 * unorm).
 */
static void
util_format_r64_float_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const double *src_pixel = (const double *)(src_row + x0*8);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t r = (uint8_t)(clamp((*src_pixel++), 0, 1) * 0xff);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = 0; /* g */
         *dst_pixel++ = 0; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R64G64_FLOAT pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (16 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Red and green are the doubles clamped to [0, 1], scaled by
 * 255 and truncated; blue is 0; alpha is written as 255 (1.0 as 8-bit
 * unorm).
 */
static void
util_format_r64g64_float_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const double *src_pixel = (const double *)(src_row + x0*16);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t r = (uint8_t)(clamp((*src_pixel++), 0, 1) * 0xff);
         uint8_t g = (uint8_t)(clamp((*src_pixel++), 0, 1) * 0xff);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = 0; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R64G64B64_FLOAT pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (24 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Red, green and blue are the doubles clamped to [0, 1],
 * scaled by 255 and truncated; alpha is written as 255 (1.0 as 8-bit
 * unorm).
 */
static void
util_format_r64g64b64_float_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const double *src_pixel = (const double *)(src_row + x0*24);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t r = (uint8_t)(clamp((*src_pixel++), 0, 1) * 0xff);
         uint8_t g = (uint8_t)(clamp((*src_pixel++), 0, 1) * 0xff);
         uint8_t b = (uint8_t)(clamp((*src_pixel++), 0, 1) * 0xff);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = b; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R64G64B64A64_FLOAT pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (32 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Each channel is the corresponding double clamped to
 * [0, 1], scaled by 255 and truncated to a byte.
 */
static void
util_format_r64g64b64a64_float_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   uint8_t *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const double *in = (const double *)(in_line + x0*32);
      uint8_t *out = out_line;

      for (col = 0; col < w; col++) {
         out[0] = (uint8_t)(clamp(in[0], 0, 1) * 0xff); /* r */
         out[1] = (uint8_t)(clamp(in[1], 0, 1) * 0xff); /* g */
         out[2] = (uint8_t)(clamp(in[2], 0, 1) * 0xff); /* b */
         out[3] = (uint8_t)(clamp(in[3], 0, 1) * 0xff); /* a */
         in += 4;
         out += 4;
      }
      in_line += src_stride;
      out_line += dst_stride;
   }
}

/**
 * Read a rectangle of R32_FLOAT pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (4 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Red is the float clamped to [0, 1], scaled by 255 and
 * truncated; green and blue are 0; alpha is written as 255 (1.0 as 8-bit
 * unorm).
 */
static void
util_format_r32_float_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const float *src_pixel = (const float *)(src_row + x0*4);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t r = (uint8_t)(clampf((*src_pixel++), 0, 1) * 0xff);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = 0; /* g */
         *dst_pixel++ = 0; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R32G32_FLOAT pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (8 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Red and green are the floats clamped to [0, 1], scaled by
 * 255 and truncated; blue is 0; alpha is written as 255 (1.0 as 8-bit
 * unorm).
 */
static void
util_format_r32g32_float_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const float *src_pixel = (const float *)(src_row + x0*8);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t r = (uint8_t)(clampf((*src_pixel++), 0, 1) * 0xff);
         uint8_t g = (uint8_t)(clampf((*src_pixel++), 0, 1) * 0xff);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = 0; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R32G32B32_FLOAT pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (12 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Red, green and blue are the floats clamped to [0, 1],
 * scaled by 255 and truncated; alpha is written as 255 (1.0 as 8-bit
 * unorm).
 */
static void
util_format_r32g32b32_float_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const float *src_pixel = (const float *)(src_row + x0*12);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t r = (uint8_t)(clampf((*src_pixel++), 0, 1) * 0xff);
         uint8_t g = (uint8_t)(clampf((*src_pixel++), 0, 1) * 0xff);
         uint8_t b = (uint8_t)(clampf((*src_pixel++), 0, 1) * 0xff);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = b; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R32G32B32A32_FLOAT pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (16 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Each channel is the corresponding float clamped to
 * [0, 1], scaled by 255 and truncated to a byte.
 */
static void
util_format_r32g32b32a32_float_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   uint8_t *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const float *in = (const float *)(in_line + x0*16);
      uint8_t *out = out_line;

      for (col = 0; col < w; col++) {
         out[0] = (uint8_t)(clampf(in[0], 0, 1) * 0xff); /* r */
         out[1] = (uint8_t)(clampf(in[1], 0, 1) * 0xff); /* g */
         out[2] = (uint8_t)(clampf(in[2], 0, 1) * 0xff); /* b */
         out[3] = (uint8_t)(clampf(in[3], 0, 1) * 0xff); /* a */
         in += 4;
         out += 4;
      }
      in_line += src_stride;
      out_line += dst_stride;
   }
}

/**
 * Read a rectangle of R32_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (4 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Red is the most significant byte of the 32-bit channel;
 * green and blue are 0; alpha is written as 255 (1.0 as 8-bit unorm).
 */
static void
util_format_r32_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*4);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t r = (uint8_t)((*src_pixel++) >> 24);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = 0; /* g */
         *dst_pixel++ = 0; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R32G32_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (8 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Red and green are the most significant bytes of their
 * 32-bit channels; blue is 0; alpha is written as 255 (1.0 as 8-bit
 * unorm).
 */
static void
util_format_r32g32_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*8);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t r = (uint8_t)((*src_pixel++) >> 24);
         uint8_t g = (uint8_t)((*src_pixel++) >> 24);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = 0; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R32G32B32_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (12 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Red, green and blue are the most significant bytes of
 * their 32-bit channels; alpha is written as 255 (1.0 as 8-bit unorm).
 */
static void
util_format_r32g32b32_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*12);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t r = (uint8_t)((*src_pixel++) >> 24);
         uint8_t g = (uint8_t)((*src_pixel++) >> 24);
         uint8_t b = (uint8_t)((*src_pixel++) >> 24);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = b; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R32G32B32A32_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (16 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Each output byte is the most significant byte of the
 * corresponding 32-bit channel.
 */
static void
util_format_r32g32b32a32_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   uint8_t *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint32_t *in = (const uint32_t *)(in_line + x0*16);
      uint8_t *out = out_line;

      for (col = 0; col < w; col++) {
         out[0] = (uint8_t)(in[0] >> 24); /* r */
         out[1] = (uint8_t)(in[1] >> 24); /* g */
         out[2] = (uint8_t)(in[2] >> 24); /* b */
         out[3] = (uint8_t)(in[3] >> 24); /* a */
         in += 4;
         out += 4;
      }
      in_line += src_stride;
      out_line += dst_stride;
   }
}

/**
 * Read a rectangle of R32_USCALED pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (4 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.
 *
 * A scaled channel holds an unnormalized integer, so any value >= 1 is at
 * or above the top of the 8-bit unorm range.  Red therefore saturates: 0
 * maps to 0 and every non-zero value maps to 255.  (The previous code
 * truncated the input to 8 bits before scaling and again afterwards, so
 * e.g. 2 became 254 and 256 became 0.)  Green and blue are 0; alpha is
 * written as 255 (1.0 as 8-bit unorm).
 */
static void
util_format_r32_uscaled_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*4);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t r = (*src_pixel++) ? 0xff : 0; /* min(v, 1) * 255 */
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = 0; /* g */
         *dst_pixel++ = 0; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R32G32_USCALED pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (8 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.
 *
 * A scaled channel holds an unnormalized integer, so any value >= 1 is at
 * or above the top of the 8-bit unorm range.  Red and green therefore
 * saturate: 0 maps to 0 and every non-zero value maps to 255.  (The
 * previous code truncated the input to 8 bits before scaling and again
 * afterwards, so e.g. 2 became 254 and 256 became 0.)  Blue is 0; alpha is
 * written as 255 (1.0 as 8-bit unorm).
 */
static void
util_format_r32g32_uscaled_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*8);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         /* min(v, 1) * 255 for each channel */
         uint8_t r = (*src_pixel++) ? 0xff : 0;
         uint8_t g = (*src_pixel++) ? 0xff : 0;
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = 0; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R32G32B32_USCALED pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (12 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.
 *
 * A scaled channel holds an unnormalized integer, so any value >= 1 is at
 * or above the top of the 8-bit unorm range.  Red, green and blue
 * therefore saturate: 0 maps to 0 and every non-zero value maps to 255.
 * (The previous code truncated the input to 8 bits before scaling and
 * again afterwards, so e.g. 2 became 254 and 256 became 0.)  Alpha is
 * written as 255 (1.0 as 8-bit unorm).
 */
static void
util_format_r32g32b32_uscaled_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*12);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         /* min(v, 1) * 255 for each channel */
         uint8_t r = (*src_pixel++) ? 0xff : 0;
         uint8_t g = (*src_pixel++) ? 0xff : 0;
         uint8_t b = (*src_pixel++) ? 0xff : 0;
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = b; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R32G32B32A32_USCALED pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (16 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.
 *
 * A scaled channel holds an unnormalized integer, so any value >= 1 is at
 * or above the top of the 8-bit unorm range.  Every channel therefore
 * saturates: 0 maps to 0 and every non-zero value maps to 255.  (The
 * previous code truncated the input to 8 bits before scaling and again
 * afterwards, so e.g. 2 became 254 and 256 became 0.)
 */
static void
util_format_r32g32b32a32_uscaled_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*16);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         /* min(v, 1) * 255 for each channel */
         uint8_t r = (*src_pixel++) ? 0xff : 0;
         uint8_t g = (*src_pixel++) ? 0xff : 0;
         uint8_t b = (*src_pixel++) ? 0xff : 0;
         uint8_t a = (*src_pixel++) ? 0xff : 0;
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = b; /* b */
         *dst_pixel++ = a; /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R16_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (2 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Red is the high byte of the 16-bit channel; green and blue
 * are 0; alpha is written as 255 (1.0 as 8-bit unorm).
 */
static void
util_format_r16_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint16_t *src_pixel = (const uint16_t *)(src_row + x0*2);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t r = (uint8_t)((*src_pixel++) >> 8);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = 0; /* g */
         *dst_pixel++ = 0; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R16G16_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (4 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Red and green are the high bytes of their 16-bit channels;
 * blue is 0; alpha is written as 255 (1.0 as 8-bit unorm).
 */
static void
util_format_r16g16_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint16_t *src_pixel = (const uint16_t *)(src_row + x0*4);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t r = (uint8_t)((*src_pixel++) >> 8);
         uint8_t g = (uint8_t)((*src_pixel++) >> 8);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = 0; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R16G16B16_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (6 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Red, green and blue are the high bytes of their 16-bit
 * channels; alpha is written as 255 (1.0 as 8-bit unorm).
 */
static void
util_format_r16g16b16_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   const uint8_t *src_row = src + y0*src_stride;
   uint8_t *dst_row = dst;
   for (y = 0; y < h; ++y) {
      const uint16_t *src_pixel = (const uint16_t *)(src_row + x0*6);
      uint8_t *dst_pixel = dst_row;
      for (x = 0; x < w; ++x) {
         uint8_t r = (uint8_t)((*src_pixel++) >> 8);
         uint8_t g = (uint8_t)((*src_pixel++) >> 8);
         uint8_t b = (uint8_t)((*src_pixel++) >> 8);
         *dst_pixel++ = r; /* r */
         *dst_pixel++ = g; /* g */
         *dst_pixel++ = b; /* b */
         *dst_pixel++ = 0xff; /* a: opaque (1.0 as 8-bit unorm), not 1 */
      }
      src_row += src_stride;
      dst_row += dst_stride;
   }
}

/**
 * Read a rectangle of R16G16B16A16_UNORM pixels as 8-bit RGBA.
 *
 * The rectangle is w x h pixels starting at pixel (x0, y0) of src, whose
 * rows are src_stride bytes apart (8 bytes per pixel).  Four bytes
 * (R, G, B, A) are written per pixel to dst, whose rows are dst_stride
 * bytes apart.  Each output byte is the high byte of the corresponding
 * 16-bit channel.
 */
static void
util_format_r16g16b16a16_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src + y0*src_stride;
   uint8_t *out_line = dst;
   unsigned row, col;

   for (row = 0; row < h; row++) {
      const uint16_t *in = (const uint16_t *)(in_line + x0*8);
      uint8_t *out = out_line;

      for (col = 0; col < w; col++) {
         out[0] = (uint8_t)(in[0] >> 8); /* r */
         out[1] = (uint8_t)(in[1] >> 8); /* g */
         out[2] = (uint8_t)(in[2] >> 8); /* b */
         out[3] = (uint8_t)(in[3] >> 8); /* a */
         in += 4;
         out += 4;
      }
      in_line += src_stride;
      out_line += dst_stride;
   }
}

/**
 * Unpack a w x h block of R16_USCALED pixels, starting at pixel (x0, y0) of
 * src, into 4 x uint8_t RGBA pixels written from the start of dst.
 *
 * Both strides are in bytes.  The destination bytes are normalized (0xff is
 * 1.0), so the unsigned source value is clamped to [0, 1] before scaling:
 * zero stays 0 and every non-zero value saturates to 0xff.  Green and blue
 * are written as 0 and alpha as 0xff.
 */
static void
util_format_r16_uscaled_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *line = src + y0*src_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      const uint16_t *in = (const uint16_t *)(line + x0*2);
      uint8_t *out = dst;
      for (col = 0; col < w; ++col) {
         out[0] = in[col] ? 0xff : 0; /* r: saturate instead of wrapping */
         out[1] = 0;                  /* g */
         out[2] = 0;                  /* b */
         out[3] = 0xff;               /* a */
         out += 4;
      }
      line += src_stride;
      dst += dst_stride;
   }
}

/**
 * Unpack a w x h block of R16G16_USCALED pixels, starting at pixel (x0, y0)
 * of src, into 4 x uint8_t RGBA pixels written from the start of dst.
 *
 * Both strides are in bytes.  The destination bytes are normalized (0xff is
 * 1.0), so each unsigned source value is clamped to [0, 1] before scaling:
 * zero stays 0 and every non-zero value saturates to 0xff.  Blue is written
 * as 0 and alpha as 0xff.
 */
static void
util_format_r16g16_uscaled_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *line = src + y0*src_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      const uint16_t *in = (const uint16_t *)(line + x0*4);
      uint8_t *out = dst;
      for (col = 0; col < w; ++col) {
         out[0] = in[0] ? 0xff : 0; /* r: saturate instead of wrapping */
         out[1] = in[1] ? 0xff : 0; /* g */
         out[2] = 0;                /* b */
         out[3] = 0xff;             /* a */
         in += 2;
         out += 4;
      }
      line += src_stride;
      dst += dst_stride;
   }
}

/**
 * Unpack a w x h block of R16G16B16_USCALED pixels, starting at pixel
 * (x0, y0) of src, into 4 x uint8_t RGBA pixels written from the start of dst.
 *
 * Both strides are in bytes.  The destination bytes are normalized (0xff is
 * 1.0), so each unsigned source value is clamped to [0, 1] before scaling:
 * zero stays 0 and every non-zero value saturates to 0xff.  Alpha is written
 * as 0xff.
 */
static void
util_format_r16g16b16_uscaled_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *line = src + y0*src_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      const uint16_t *in = (const uint16_t *)(line + x0*6);
      uint8_t *out = dst;
      for (col = 0; col < w; ++col) {
         out[0] = in[0] ? 0xff : 0; /* r: saturate instead of wrapping */
         out[1] = in[1] ? 0xff : 0; /* g */
         out[2] = in[2] ? 0xff : 0; /* b */
         out[3] = 0xff;             /* a */
         in += 3;
         out += 4;
      }
      line += src_stride;
      dst += dst_stride;
   }
}

/**
 * Unpack a w x h block of R16G16B16A16_USCALED pixels, starting at pixel
 * (x0, y0) of src, into 4 x uint8_t RGBA pixels written from the start of dst.
 *
 * Both strides are in bytes.  The destination bytes are normalized (0xff is
 * 1.0), so each unsigned source value is clamped to [0, 1] before scaling:
 * zero stays 0 and every non-zero value saturates to 0xff.
 */
static void
util_format_r16g16b16a16_uscaled_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *line = src + y0*src_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; ++row) {
      const uint16_t *in = (const uint16_t *)(line + x0*8);
      uint8_t *out = dst;
      for (col = 0; col < w; ++col) {
         /* r, g, b, a in order; saturate instead of wrapping modulo 256 */
         for (chan = 0; chan < 4; ++chan)
            out[chan] = in[chan] ? 0xff : 0;
         in += 4;
         out += 4;
      }
      line += src_stride;
      dst += dst_stride;
   }
}

/**
 * Unpack a w x h block of R8_UNORM pixels, starting at pixel (x0, y0) of src,
 * into 4 x uint8_t RGBA pixels written from the start of dst.
 *
 * Both strides are in bytes.  Red is copied unchanged, green and blue are
 * written as 0, and alpha as 0xff, the 8-bit representation of 1.0.
 */
static void
util_format_r8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *line = src + y0*src_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      const uint8_t *in = line + x0*1;
      uint8_t *out = dst;
      for (col = 0; col < w; ++col) {
         out[0] = in[col]; /* r */
         out[1] = 0;       /* g */
         out[2] = 0;       /* b */
         out[3] = 0xff;    /* a: opaque, not 1/255 */
         out += 4;
      }
      line += src_stride;
      dst += dst_stride;
   }
}

/**
 * Unpack a w x h block of R8G8_UNORM pixels, starting at pixel (x0, y0) of
 * src, into 4 x uint8_t RGBA pixels written from the start of dst.
 *
 * Both strides are in bytes.  Red and green are copied unchanged, blue is
 * written as 0, and alpha as 0xff, the 8-bit representation of 1.0.
 */
static void
util_format_r8g8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *line = src + y0*src_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      const uint8_t *in = line + x0*2;
      uint8_t *out = dst;
      for (col = 0; col < w; ++col) {
         out[0] = in[0]; /* r */
         out[1] = in[1]; /* g */
         out[2] = 0;     /* b */
         out[3] = 0xff;  /* a: opaque, not 1/255 */
         in += 2;
         out += 4;
      }
      line += src_stride;
      dst += dst_stride;
   }
}

/**
 * Unpack a w x h block of R8G8B8_UNORM pixels, starting at pixel (x0, y0) of
 * src, into 4 x uint8_t RGBA pixels written from the start of dst.
 *
 * Both strides are in bytes.  Red, green and blue are copied unchanged and
 * alpha is written as 0xff, the 8-bit representation of 1.0.
 */
static void
util_format_r8g8b8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *line = src + y0*src_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      const uint8_t *in = line + x0*3;
      uint8_t *out = dst;
      for (col = 0; col < w; ++col) {
         out[0] = in[0]; /* r */
         out[1] = in[1]; /* g */
         out[2] = in[2]; /* b */
         out[3] = 0xff;  /* a: opaque, not 1/255 */
         in += 3;
         out += 4;
      }
      line += src_stride;
      dst += dst_stride;
   }
}

/**
 * Unpack a w x h block of R8G8B8A8_UNORM pixels, starting at pixel (x0, y0)
 * of src, into 4 x uint8_t RGBA pixels written from the start of dst.
 *
 * Both strides are in bytes.  Source and destination share the same byte
 * layout, so each row is a plain copy of 4*w bytes.
 */
static void
util_format_r8g8b8a8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *line = src + y0*src_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      const uint8_t *in = line + x0*4;
      uint8_t *out = dst;
      for (col = 0; col < w; ++col) {
         out[0] = in[0]; /* r */
         out[1] = in[1]; /* g */
         out[2] = in[2]; /* b */
         out[3] = in[3]; /* a */
         in += 4;
         out += 4;
      }
      line += src_stride;
      dst += dst_stride;
   }
}

/**
 * Unpack a w x h block of R8G8B8X8_UNORM pixels, starting at pixel (x0, y0)
 * of src, into 4 x uint8_t RGBA pixels written from the start of dst.
 *
 * Both strides are in bytes.  Red, green and blue are copied unchanged; the
 * fourth source byte is padding and is ignored, and alpha is written as 0xff,
 * the 8-bit representation of 1.0.
 */
static void
util_format_r8g8b8x8_unorm_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *line = src + y0*src_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      const uint8_t *in = line + x0*4;
      uint8_t *out = dst;
      for (col = 0; col < w; ++col) {
         out[0] = in[0]; /* r */
         out[1] = in[1]; /* g */
         out[2] = in[2]; /* b */
         out[3] = 0xff;  /* a: opaque, not 1/255 */
         /* Pixels are 4 bytes wide: step over the X byte as well. */
         in += 4;
         out += 4;
      }
      line += src_stride;
      dst += dst_stride;
   }
}

/**
 * Unpack a w x h block of R8_USCALED pixels, starting at pixel (x0, y0) of
 * src, into 4 x uint8_t RGBA pixels written from the start of dst.
 *
 * Both strides are in bytes.  The destination bytes are normalized (0xff is
 * 1.0), so the unsigned source value is clamped to [0, 1] before scaling:
 * zero stays 0 and every non-zero value saturates to 0xff.  Green and blue
 * are written as 0 and alpha as 0xff.
 */
static void
util_format_r8_uscaled_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *line = src + y0*src_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      const uint8_t *in = line + x0*1;
      uint8_t *out = dst;
      for (col = 0; col < w; ++col) {
         out[0] = in[col] ? 0xff : 0; /* r: saturate instead of wrapping */
         out[1] = 0;                  /* g */
         out[2] = 0;                  /* b */
         out[3] = 0xff;               /* a */
         out += 4;
      }
      line += src_stride;
      dst += dst_stride;
   }
}

/**
 * Unpack a w x h block of R8G8_USCALED pixels, starting at pixel (x0, y0) of
 * src, into 4 x uint8_t RGBA pixels written from the start of dst.
 *
 * Both strides are in bytes.  The destination bytes are normalized (0xff is
 * 1.0), so each unsigned source value is clamped to [0, 1] before scaling:
 * zero stays 0 and every non-zero value saturates to 0xff.  Blue is written
 * as 0 and alpha as 0xff.
 */
static void
util_format_r8g8_uscaled_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *line = src + y0*src_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      const uint8_t *in = line + x0*2;
      uint8_t *out = dst;
      for (col = 0; col < w; ++col) {
         out[0] = in[0] ? 0xff : 0; /* r: saturate instead of wrapping */
         out[1] = in[1] ? 0xff : 0; /* g */
         out[2] = 0;                /* b */
         out[3] = 0xff;             /* a */
         in += 2;
         out += 4;
      }
      line += src_stride;
      dst += dst_stride;
   }
}

/**
 * Unpack a w x h block of R8G8B8_USCALED pixels, starting at pixel (x0, y0)
 * of src, into 4 x uint8_t RGBA pixels written from the start of dst.
 *
 * Both strides are in bytes.  The destination bytes are normalized (0xff is
 * 1.0), so each unsigned source value is clamped to [0, 1] before scaling:
 * zero stays 0 and every non-zero value saturates to 0xff.  Alpha is written
 * as 0xff.
 */
static void
util_format_r8g8b8_uscaled_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *line = src + y0*src_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      const uint8_t *in = line + x0*3;
      uint8_t *out = dst;
      for (col = 0; col < w; ++col) {
         out[0] = in[0] ? 0xff : 0; /* r: saturate instead of wrapping */
         out[1] = in[1] ? 0xff : 0; /* g */
         out[2] = in[2] ? 0xff : 0; /* b */
         out[3] = 0xff;             /* a */
         in += 3;
         out += 4;
      }
      line += src_stride;
      dst += dst_stride;
   }
}

/**
 * Unpack a w x h block of R8G8B8A8_USCALED pixels, starting at pixel (x0, y0)
 * of src, into 4 x uint8_t RGBA pixels written from the start of dst.
 *
 * Both strides are in bytes.  The destination bytes are normalized (0xff is
 * 1.0), so each unsigned source value is clamped to [0, 1] before scaling:
 * zero stays 0 and every non-zero value saturates to 0xff.
 */
static void
util_format_r8g8b8a8_uscaled_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *line = src + y0*src_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; ++row) {
      const uint8_t *in = line + x0*4;
      uint8_t *out = dst;
      for (col = 0; col < w; ++col) {
         /* r, g, b, a in order; saturate instead of wrapping modulo 256 */
         for (chan = 0; chan < 4; ++chan)
            out[chan] = in[chan] ? 0xff : 0;
         in += 4;
         out += 4;
      }
      line += src_stride;
      dst += dst_stride;
   }
}

/**
 * Unpack a w x h block of R8G8B8X8_USCALED pixels, starting at pixel (x0, y0)
 * of src, into 4 x uint8_t RGBA pixels written from the start of dst.
 *
 * Both strides are in bytes.  The destination bytes are normalized (0xff is
 * 1.0), so each unsigned source value is clamped to [0, 1] before scaling:
 * zero stays 0 and every non-zero value saturates to 0xff.  The fourth source
 * byte is padding and is ignored; alpha is written as 0xff.
 */
static void
util_format_r8g8b8x8_uscaled_read_4ub(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *line = src + y0*src_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      const uint8_t *in = line + x0*4;
      uint8_t *out = dst;
      for (col = 0; col < w; ++col) {
         out[0] = in[0] ? 0xff : 0; /* r: saturate instead of wrapping */
         out[1] = in[1] ? 0xff : 0; /* g */
         out[2] = in[2] ? 0xff : 0; /* b */
         out[3] = 0xff;             /* a */
         /* Pixels are 4 bytes wide: step over the X byte as well. */
         in += 4;
         out += 4;
      }
      line += src_stride;
      dst += dst_stride;
   }
}

void
util_format_read_4ub(enum pipe_format format, uint8_t *dst, unsigned dst_stride, const void *src, unsigned src_stride, unsigned x, unsigned y, unsigned w, unsigned h)
{
   void (*func)(uint8_t *dst, unsigned dst_stride, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h);
   switch(format) {
   case PIPE_FORMAT_A8R8G8B8_UNORM:
      func = &util_format_a8r8g8b8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_X8R8G8B8_UNORM:
      func = &util_format_x8r8g8b8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_B8G8R8A8_UNORM:
      func = &util_format_b8g8r8a8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_B8G8R8X8_UNORM:
      func = &util_format_b8g8r8x8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_A1R5G5B5_UNORM:
      func = &util_format_a1r5g5b5_unorm_read_4ub;
      break;
   case PIPE_FORMAT_A4R4G4B4_UNORM:
      func = &util_format_a4r4g4b4_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R5G6B5_UNORM:
      func = &util_format_r5g6b5_unorm_read_4ub;
      break;
   case PIPE_FORMAT_A2B10G10R10_UNORM:
      func = &util_format_a2b10g10r10_unorm_read_4ub;
      break;
   case PIPE_FORMAT_L8_UNORM:
      func = &util_format_l8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_A8_UNORM:
      func = &util_format_a8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_I8_UNORM:
      func = &util_format_i8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_A8L8_UNORM:
      func = &util_format_a8l8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_L16_UNORM:
      func = &util_format_l16_unorm_read_4ub;
      break;
   case PIPE_FORMAT_Z16_UNORM:
      func = &util_format_z16_unorm_read_4ub;
      break;
   case PIPE_FORMAT_Z32_UNORM:
      func = &util_format_z32_unorm_read_4ub;
      break;
   case PIPE_FORMAT_Z32_FLOAT:
      func = &util_format_z32_float_read_4ub;
      break;
   case PIPE_FORMAT_S8Z24_UNORM:
      func = &util_format_s8z24_unorm_read_4ub;
      break;
   case PIPE_FORMAT_Z24S8_UNORM:
      func = &util_format_z24s8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_X8Z24_UNORM:
      func = &util_format_x8z24_unorm_read_4ub;
      break;
   case PIPE_FORMAT_Z24X8_UNORM:
      func = &util_format_z24x8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R64_FLOAT:
      func = &util_format_r64_float_read_4ub;
      break;
   case PIPE_FORMAT_R64G64_FLOAT:
      func = &util_format_r64g64_float_read_4ub;
      break;
   case PIPE_FORMAT_R64G64B64_FLOAT:
      func = &util_format_r64g64b64_float_read_4ub;
      break;
   case PIPE_FORMAT_R64G64B64A64_FLOAT:
      func = &util_format_r64g64b64a64_float_read_4ub;
      break;
   case PIPE_FORMAT_R32_FLOAT:
      func = &util_format_r32_float_read_4ub;
      break;
   case PIPE_FORMAT_R32G32_FLOAT:
      func = &util_format_r32g32_float_read_4ub;
      break;
   case PIPE_FORMAT_R32G32B32_FLOAT:
      func = &util_format_r32g32b32_float_read_4ub;
      break;
   case PIPE_FORMAT_R32G32B32A32_FLOAT:
      func = &util_format_r32g32b32a32_float_read_4ub;
      break;
   case PIPE_FORMAT_R32_UNORM:
      func = &util_format_r32_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R32G32_UNORM:
      func = &util_format_r32g32_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R32G32B32_UNORM:
      func = &util_format_r32g32b32_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R32G32B32A32_UNORM:
      func = &util_format_r32g32b32a32_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R32_USCALED:
      func = &util_format_r32_uscaled_read_4ub;
      break;
   case PIPE_FORMAT_R32G32_USCALED:
      func = &util_format_r32g32_uscaled_read_4ub;
      break;
   case PIPE_FORMAT_R32G32B32_USCALED:
      func = &util_format_r32g32b32_uscaled_read_4ub;
      break;
   case PIPE_FORMAT_R32G32B32A32_USCALED:
      func = &util_format_r32g32b32a32_uscaled_read_4ub;
      break;
   case PIPE_FORMAT_R16_UNORM:
      func = &util_format_r16_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R16G16_UNORM:
      func = &util_format_r16g16_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R16G16B16_UNORM:
      func = &util_format_r16g16b16_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R16G16B16A16_UNORM:
      func = &util_format_r16g16b16a16_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R16_USCALED:
      func = &util_format_r16_uscaled_read_4ub;
      break;
   case PIPE_FORMAT_R16G16_USCALED:
      func = &util_format_r16g16_uscaled_read_4ub;
      break;
   case PIPE_FORMAT_R16G16B16_USCALED:
      func = &util_format_r16g16b16_uscaled_read_4ub;
      break;
   case PIPE_FORMAT_R16G16B16A16_USCALED:
      func = &util_format_r16g16b16a16_uscaled_read_4ub;
      break;
   case PIPE_FORMAT_R8_UNORM:
      func = &util_format_r8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R8G8_UNORM:
      func = &util_format_r8g8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R8G8B8_UNORM:
      func = &util_format_r8g8b8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R8G8B8A8_UNORM:
      func = &util_format_r8g8b8a8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R8G8B8X8_UNORM:
      func = &util_format_r8g8b8x8_unorm_read_4ub;
      break;
   case PIPE_FORMAT_R8_USCALED:
      func = &util_format_r8_uscaled_read_4ub;
      break;
   case PIPE_FORMAT_R8G8_USCALED:
      func = &util_format_r8g8_uscaled_read_4ub;
      break;
   case PIPE_FORMAT_R8G8B8_USCALED:
      func = &util_format_r8g8b8_uscaled_read_4ub;
      break;
   case PIPE_FORMAT_R8G8B8A8_USCALED:
      func = &util_format_r8g8b8a8_uscaled_read_4ub;
      break;
   case PIPE_FORMAT_R8G8B8X8_USCALED:
      func = &util_format_r8g8b8x8_uscaled_read_4ub;
      break;
   default:
      debug_printf("unsupported format\n");
      return;
   }
   func(dst, dst_stride, (const uint8_t *)src, src_stride, x, y, w, h);
}

/**
 * Pack a w x h block of 4 x uint8_t RGBA pixels, read from the start of src,
 * into A8R8G8B8_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Each destination pixel is one 32-bit word with
 * A in bits 31..24, R in 23..16, G in 15..8 and B in 7..0.
 */
static void
util_format_a8r8g8b8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint32_t *out = (uint32_t *)(line + x0*4);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         /* Widen before shifting: a uint8_t promotes to signed int, and
          * shifting a value >= 0x80 left by 24 overflows it. */
         out[col] = (uint32_t)in[2]
                  | ((uint32_t)in[1] << 8)
                  | ((uint32_t)in[0] << 16)
                  | ((uint32_t)in[3] << 24);
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t RGBA pixels, read from the start of src,
 * into X8R8G8B8_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Each destination pixel is one 32-bit word with
 * R in bits 23..16, G in 15..8 and B in 7..0; the top byte is written as 0
 * and source alpha is ignored.
 */
static void
util_format_x8r8g8b8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint32_t *out = (uint32_t *)(line + x0*4);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         const uint32_t b = in[2];
         const uint32_t g = in[1];
         const uint32_t r = in[0];
         out[col] = b | (g << 8) | (r << 16);
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t RGBA pixels, read from the start of src,
 * into B8G8R8A8_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Each destination pixel is one 32-bit word with
 * B in bits 31..24, G in 23..16, R in 15..8 and A in 7..0.
 */
static void
util_format_b8g8r8a8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint32_t *out = (uint32_t *)(line + x0*4);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         /* Widen before shifting: a uint8_t promotes to signed int, and
          * shifting a value >= 0x80 left by 24 overflows it. */
         out[col] = (uint32_t)in[3]
                  | ((uint32_t)in[0] << 8)
                  | ((uint32_t)in[1] << 16)
                  | ((uint32_t)in[2] << 24);
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t RGBA pixels, read from the start of src,
 * into B8G8R8X8_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Each destination pixel is one 32-bit word with
 * B in bits 31..24, G in 23..16 and R in 15..8; the low byte is written as 0
 * and source alpha is ignored.
 */
static void
util_format_b8g8r8x8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint32_t *out = (uint32_t *)(line + x0*4);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         /* Widen before shifting: a uint8_t promotes to signed int, and
          * shifting a value >= 0x80 left by 24 overflows it. */
         out[col] = ((uint32_t)in[0] << 8)
                  | ((uint32_t)in[1] << 16)
                  | ((uint32_t)in[2] << 24);
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t RGBA pixels, read from the start of src,
 * into A1R5G5B5_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Each destination pixel is one 16-bit word with
 * A in bit 15, R in bits 14..10, G in 9..5 and B in 4..0.  Channels are
 * narrowed by dropping their low bits.
 *
 * NOTE(review): the previous code gave blue 1 bit and alpha 5 bits, which
 * does not match the format name.  The matching read_4ub decoder is not
 * visible here; confirm it uses this same layout.
 */
static void
util_format_a1r5g5b5_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint16_t *out = (uint16_t *)(line + x0*2);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         const unsigned b = in[2] >> 3; /* 5 bits */
         const unsigned g = in[1] >> 3; /* 5 bits */
         const unsigned r = in[0] >> 3; /* 5 bits */
         const unsigned a = in[3] >> 7; /* 1 bit */
         out[col] = (uint16_t)(b | (g << 5) | (r << 10) | (a << 15));
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t RGBA pixels, read from the start of src,
 * into A4R4G4B4_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Each destination pixel is one 16-bit word with
 * A in bits 15..12, R in 11..8, G in 7..4 and B in 3..0.  Each channel keeps
 * its high nibble.
 */
static void
util_format_a4r4g4b4_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint16_t *out = (uint16_t *)(line + x0*2);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         const unsigned b = in[2] >> 4;
         const unsigned g = in[1] >> 4;
         const unsigned r = in[0] >> 4;
         const unsigned a = in[3] >> 4;
         out[col] = (uint16_t)(b | (g << 4) | (r << 8) | (a << 12));
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t RGBA pixels, read from the start of src,
 * into R5G6B5_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Each destination pixel is one 16-bit word with
 * R in bits 15..11, G in 10..5 and B in 4..0.  Channels are narrowed by
 * dropping their low bits and source alpha is ignored.
 */
static void
util_format_r5g6b5_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint16_t *out = (uint16_t *)(line + x0*2);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         const unsigned b = in[2] >> 3; /* 5 bits */
         const unsigned g = in[1] >> 2; /* 6 bits */
         const unsigned r = in[0] >> 3; /* 5 bits */
         out[col] = (uint16_t)(b | (g << 5) | (r << 11));
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t RGBA pixels, read from the start of src,
 * into A2B10G10R10_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Each destination pixel is one 32-bit word with
 * A in bits 31..30, B in 29..20, G in 19..10 and R in 9..0.  Colour channels
 * are rescaled from 0..0xff to 0..0x3ff; alpha keeps its top two bits.
 */
static void
util_format_a2b10g10r10_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint32_t *out = (uint32_t *)(line + x0*4);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         const uint32_t r = (uint32_t)in[0] * 0x3ff / 0xff;
         const uint32_t g = (uint32_t)in[1] * 0x3ff / 0xff;
         const uint32_t b = (uint32_t)in[2] * 0x3ff / 0xff;
         const uint32_t a = (uint32_t)(in[3] >> 6);
         out[col] = r | (g << 10) | (b << 20) | (a << 30);
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t RGBA pixels, read from the start of src,
 * into L8_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Luminance is taken from the red channel (the
 * usual RGBA-to-luminance convention) rather than from blue; green, blue and
 * alpha are ignored.
 */
static void
util_format_l8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint8_t *out = line + x0*1;
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         out[col] = in[0]; /* l = r */
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t RGBA pixels, read from the start of src,
 * into A8_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Only the alpha byte of each source pixel is
 * stored; the colour channels are ignored.
 */
static void
util_format_a8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint8_t *out = line + x0*1;
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         out[col] = in[3]; /* a */
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t RGBA pixels, read from the start of src,
 * into I8_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Intensity is taken from the red channel (the
 * usual RGBA-to-intensity convention) rather than from alpha; the other
 * channels are ignored.
 */
static void
util_format_i8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint8_t *out = line + x0*1;
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         out[col] = in[0]; /* i = r */
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t RGBA pixels, read from the start of src,
 * into A8L8_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Each destination pixel is one 16-bit word with
 * A in bits 15..8 and L in 7..0.  Luminance is taken from the red channel
 * (the usual RGBA-to-luminance convention) rather than from blue.
 */
static void
util_format_a8l8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint16_t *out = (uint16_t *)(line + x0*2);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         const unsigned l = in[0]; /* l = r */
         const unsigned a = in[3];
         out[col] = (uint16_t)(l | (a << 8));
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t RGBA pixels, read from the start of src,
 * into L16_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Luminance is taken from the red channel (the
 * usual RGBA-to-luminance convention) rather than from blue, and rescaled
 * from 0..0xff to 0..0xffff.
 */
static void
util_format_l16_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint16_t *out = (uint16_t *)(line + x0*2);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         out[col] = (uint16_t)((uint32_t)in[0] * 0xffff / 0xff); /* l = r */
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t pixels, read from the start of src, into
 * Z16_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Depth is taken from the first byte of each
 * source pixel and rescaled from 0..0xff to 0..0xffff; the other three bytes
 * are ignored.
 */
static void
util_format_z16_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint16_t *out = (uint16_t *)(line + x0*2);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         out[col] = (uint16_t)((uint32_t)in[0] * 0xffff / 0xff);
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t pixels, read from the start of src, into
 * Z32_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Depth is taken from the first byte of each
 * source pixel and rescaled from 0..0xff to 0..0xffffffff; the other three
 * bytes are ignored.
 */
static void
util_format_z32_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint32_t *out = (uint32_t *)(line + x0*4);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         /* The product needs up to 40 bits, so multiply and divide in 64-bit
          * and narrow only the final quotient. */
         out[col] = (uint32_t)((uint64_t)in[0] * 0xffffffff / 0xff);
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t pixels, read from the start of src, into
 * Z32_FLOAT pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Depth is taken from the first byte of each
 * source pixel and multiplied by 1/255 in single precision; the other three
 * bytes are ignored.
 */
static void
util_format_z32_float_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      float *out = (float *)(line + x0*4);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         out[col] = (float)(in[0] * (1.0f/0xff));
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t pixels, read from the start of src, into
 * S8Z24_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Depth is taken from the first byte of each
 * source pixel, rescaled from 0..0xff to 0..0xffffff and stored in bits
 * 23..0 of a 32-bit word; the top byte (stencil) is written as 0.
 *
 * NOTE(review): the previous code stored depth in bits 31..8, which does not
 * match the first-named-component-is-most-significant convention of the
 * format name.  The matching read_4ub decoder is not visible here; confirm it
 * uses this same layout.
 */
static void
util_format_s8z24_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint32_t *out = (uint32_t *)(line + x0*4);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         out[col] = (uint32_t)in[0] * 0xffffff / 0xff; /* z in bits 23..0 */
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t pixels, read from the start of src, into
 * Z24S8_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Depth is taken from the first byte of each
 * source pixel, rescaled from 0..0xff to 0..0xffffff and stored in bits
 * 31..8 of a 32-bit word; the low byte (stencil) is written as 0.
 *
 * NOTE(review): the previous code stored depth in bits 23..0, which does not
 * match the first-named-component-is-most-significant convention of the
 * format name.  The matching read_4ub decoder is not visible here; confirm it
 * uses this same layout.
 */
static void
util_format_z24s8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint32_t *out = (uint32_t *)(line + x0*4);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         const uint32_t z = (uint32_t)in[0] * 0xffffff / 0xff;
         out[col] = z << 8; /* z in bits 31..8 */
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t pixels, read from the start of src, into
 * X8Z24_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Depth is taken from the first byte of each
 * source pixel, rescaled from 0..0xff to 0..0xffffff and stored in bits
 * 23..0 of a 32-bit word; the top (padding) byte is written as 0.
 *
 * NOTE(review): the previous code stored depth in bits 31..8, which does not
 * match the first-named-component-is-most-significant convention of the
 * format name.  The matching read_4ub decoder is not visible here; confirm it
 * uses this same layout.
 */
static void
util_format_x8z24_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint32_t *out = (uint32_t *)(line + x0*4);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         out[col] = (uint32_t)in[0] * 0xffffff / 0xff; /* z in bits 23..0 */
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t pixels, read from the start of src, into
 * Z24X8_UNORM pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Depth is taken from the first byte of each
 * source pixel, rescaled from 0..0xff to 0..0xffffff and stored in bits
 * 31..8 of a 32-bit word; the low (padding) byte is written as 0.
 *
 * NOTE(review): the previous code stored depth in bits 23..0, which does not
 * match the first-named-component-is-most-significant convention of the
 * format name.  The matching read_4ub decoder is not visible here; confirm it
 * uses this same layout.
 */
static void
util_format_z24x8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      uint32_t *out = (uint32_t *)(line + x0*4);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         const uint32_t z = (uint32_t)in[0] * 0xffffff / 0xff;
         out[col] = z << 8; /* z in bits 31..8 */
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Pack a w x h block of 4 x uint8_t RGBA pixels, read from the start of src,
 * into R64_FLOAT pixels starting at pixel (x0, y0) of dst.
 *
 * Both strides are in bytes.  Red is multiplied by 1/255 in single precision
 * and then widened to double; green, blue and alpha are ignored.
 */
static void
util_format_r64_float_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   uint8_t *line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; ++row) {
      double *out = (double *)(line + x0*8);
      const uint8_t *in = src;
      for (col = 0; col < w; ++col) {
         out[col] = (double)(in[0] * (1.0f/0xff)); /* r */
         in += 4;
      }
      line += dst_stride;
      src += src_stride;
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as two doubles per
 * pixel, starting at pixel (x0, y0) of dst (16 bytes per pixel).
 *
 * Bytes 0 and 1 of each 4-byte source pixel are read.  Each byte is
 * scaled by 1.0f/0xff in single precision and the result is widened to
 * double.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r64g64_float_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float unit = 1.0f/0xff;
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      double *out = (double *)(out_line + x0*16);

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 2; chan++) {
            *out++ = (double)(in[chan] * unit);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as three doubles per
 * pixel, starting at pixel (x0, y0) of dst (24 bytes per pixel).
 *
 * Bytes 0..2 of each 4-byte source pixel are read.  Each byte is scaled
 * by 1.0f/0xff in single precision and the result is widened to double.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r64g64b64_float_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float unit = 1.0f/0xff;
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      double *out = (double *)(out_line + x0*24);

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 3; chan++) {
            *out++ = (double)(in[chan] * unit);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as four doubles per
 * pixel, starting at pixel (x0, y0) of dst (32 bytes per pixel).
 *
 * All four bytes of each source pixel are read.  Each byte is scaled by
 * 1.0f/0xff in single precision and the result is widened to double.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r64g64b64a64_float_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float unit = 1.0f/0xff;
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      double *out = (double *)(out_line + x0*32);

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 4; chan++) {
            *out++ = (double)(in[chan] * unit);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as one float per
 * pixel, starting at pixel (x0, y0) of dst (4 bytes per pixel).
 *
 * Only byte 0 of each 4-byte source pixel is read; it is scaled by
 * 1.0f/0xff.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r32_float_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float unit = 1.0f/0xff;
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      float *out = (float *)(out_line + x0*4);

      for (col = 0; col < w; col++, in += 4) {
         out[col] = (float)(in[0] * unit);
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as two floats per
 * pixel, starting at pixel (x0, y0) of dst (8 bytes per pixel).
 *
 * Bytes 0 and 1 of each 4-byte source pixel are read; each is scaled by
 * 1.0f/0xff.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r32g32_float_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float unit = 1.0f/0xff;
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      float *out = (float *)(out_line + x0*8);

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 2; chan++) {
            *out++ = (float)(in[chan] * unit);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as three floats per
 * pixel, starting at pixel (x0, y0) of dst (12 bytes per pixel).
 *
 * Bytes 0..2 of each 4-byte source pixel are read; each is scaled by
 * 1.0f/0xff.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r32g32b32_float_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float unit = 1.0f/0xff;
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      float *out = (float *)(out_line + x0*12);

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 3; chan++) {
            *out++ = (float)(in[chan] * unit);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as four floats per
 * pixel, starting at pixel (x0, y0) of dst (16 bytes per pixel).
 *
 * All four bytes of each source pixel are read; each is scaled by
 * 1.0f/0xff.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r32g32b32a32_float_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const float unit = 1.0f/0xff;
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      float *out = (float *)(out_line + x0*16);

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 4; chan++) {
            *out++ = (float)(in[chan] * unit);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as one 32-bit
 * unsigned normalized value per pixel, starting at pixel (x0, y0) of dst
 * (4 bytes per pixel).
 *
 * Only byte 0 of each 4-byte source pixel is read.  It is rescaled from
 * the 8-bit range to the 32-bit range, so 0x00 stores 0 and 0xff stores
 * 0xffffffff.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r32_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   uint8_t *dst_row = dst + y0*dst_stride;
   const uint8_t *src_row = src;
   for (y = 0; y < h; ++y) {
      uint32_t *dst_pixel = (uint32_t *)(dst_row + x0*4);
      const uint8_t *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         /* value * 0xffffffff / 0xff must not be evaluated in 32 bits: the
          * product wraps.  0xffffffff / 0xff == 0x01010101 exactly, and
          * 0xff * 0x01010101 == 0xffffffff, so this multiply is both exact
          * and overflow-free.
          */
         *dst_pixel++ = (uint32_t)src_pixel[0] * 0x01010101u;
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(uint8_t);
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as two 32-bit
 * unsigned normalized values per pixel, starting at pixel (x0, y0) of
 * dst (8 bytes per pixel).
 *
 * Bytes 0 and 1 of each 4-byte source pixel are read.  Each is rescaled
 * from the 8-bit range to the 32-bit range, so 0x00 stores 0 and 0xff
 * stores 0xffffffff.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r32g32_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y, c;
   uint8_t *dst_row = dst + y0*dst_stride;
   const uint8_t *src_row = src;
   for (y = 0; y < h; ++y) {
      uint32_t *dst_pixel = (uint32_t *)(dst_row + x0*8);
      const uint8_t *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         /* value * 0xffffffff / 0xff must not be evaluated in 32 bits: the
          * product wraps.  0xffffffff / 0xff == 0x01010101 exactly, and
          * 0xff * 0x01010101 == 0xffffffff, so this multiply is both exact
          * and overflow-free.
          */
         for (c = 0; c < 2; ++c) {
            *dst_pixel++ = (uint32_t)src_pixel[c] * 0x01010101u;
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(uint8_t);
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as three 32-bit
 * unsigned normalized values per pixel, starting at pixel (x0, y0) of
 * dst (12 bytes per pixel).
 *
 * Bytes 0..2 of each 4-byte source pixel are read.  Each is rescaled
 * from the 8-bit range to the 32-bit range, so 0x00 stores 0 and 0xff
 * stores 0xffffffff.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r32g32b32_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y, c;
   uint8_t *dst_row = dst + y0*dst_stride;
   const uint8_t *src_row = src;
   for (y = 0; y < h; ++y) {
      uint32_t *dst_pixel = (uint32_t *)(dst_row + x0*12);
      const uint8_t *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         /* value * 0xffffffff / 0xff must not be evaluated in 32 bits: the
          * product wraps.  0xffffffff / 0xff == 0x01010101 exactly, and
          * 0xff * 0x01010101 == 0xffffffff, so this multiply is both exact
          * and overflow-free.
          */
         for (c = 0; c < 3; ++c) {
            *dst_pixel++ = (uint32_t)src_pixel[c] * 0x01010101u;
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(uint8_t);
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as four 32-bit
 * unsigned normalized values per pixel, starting at pixel (x0, y0) of
 * dst (16 bytes per pixel).
 *
 * All four bytes of each source pixel are read.  Each is rescaled from
 * the 8-bit range to the 32-bit range, so 0x00 stores 0 and 0xff stores
 * 0xffffffff.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r32g32b32a32_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y, c;
   uint8_t *dst_row = dst + y0*dst_stride;
   const uint8_t *src_row = src;
   for (y = 0; y < h; ++y) {
      uint32_t *dst_pixel = (uint32_t *)(dst_row + x0*16);
      const uint8_t *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         /* value * 0xffffffff / 0xff must not be evaluated in 32 bits: the
          * product wraps.  0xffffffff / 0xff == 0x01010101 exactly, and
          * 0xff * 0x01010101 == 0xffffffff, so this multiply is both exact
          * and overflow-free.
          */
         for (c = 0; c < 4; ++c) {
            *dst_pixel++ = (uint32_t)src_pixel[c] * 0x01010101u;
         }
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(uint8_t);
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as one 32-bit
 * unsigned integer per pixel, starting at pixel (x0, y0) of dst
 * (4 bytes per pixel).
 *
 * Only byte 0 of each 4-byte source pixel is read.  It is divided by
 * 0xff with integer truncation, so 0xff stores 1 and every other byte
 * value stores 0.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r32_uscaled_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint32_t *out = (uint32_t *)(out_line + x0*4);

      for (col = 0; col < w; col++, in += 4) {
         out[col] = (uint32_t)in[0] / 0xff;
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as two 32-bit
 * unsigned integers per pixel, starting at pixel (x0, y0) of dst
 * (8 bytes per pixel).
 *
 * Bytes 0 and 1 of each 4-byte source pixel are read.  Each is divided
 * by 0xff with integer truncation, so 0xff stores 1 and every other byte
 * value stores 0.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r32g32_uscaled_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint32_t *out = (uint32_t *)(out_line + x0*8);

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 2; chan++) {
            *out++ = (uint32_t)in[chan] / 0xff;
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as three 32-bit
 * unsigned integers per pixel, starting at pixel (x0, y0) of dst
 * (12 bytes per pixel).
 *
 * Bytes 0..2 of each 4-byte source pixel are read.  Each is divided by
 * 0xff with integer truncation, so 0xff stores 1 and every other byte
 * value stores 0.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r32g32b32_uscaled_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint32_t *out = (uint32_t *)(out_line + x0*12);

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 3; chan++) {
            *out++ = (uint32_t)in[chan] / 0xff;
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as four 32-bit
 * unsigned integers per pixel, starting at pixel (x0, y0) of dst
 * (16 bytes per pixel).
 *
 * All four bytes of each source pixel are read.  Each is divided by 0xff
 * with integer truncation, so 0xff stores 1 and every other byte value
 * stores 0.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r32g32b32a32_uscaled_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint32_t *out = (uint32_t *)(out_line + x0*16);

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 4; chan++) {
            *out++ = (uint32_t)in[chan] / 0xff;
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as one 16-bit
 * unsigned normalized value per pixel, starting at pixel (x0, y0) of dst
 * (2 bytes per pixel).
 *
 * Only byte 0 of each 4-byte source pixel is read.  It is rescaled from
 * the 8-bit range to the 16-bit range, so 0xff stores 0xffff.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r16_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint16_t *out = (uint16_t *)(out_line + x0*2);

      for (col = 0; col < w; col++, in += 4) {
         /* 0xffff / 0xff == 0x101, so the rescale is one exact multiply. */
         out[col] = (uint16_t)(in[0] * 0x101);
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as two 16-bit
 * unsigned normalized values per pixel, starting at pixel (x0, y0) of
 * dst (4 bytes per pixel).
 *
 * Bytes 0 and 1 of each 4-byte source pixel are read.  Each is rescaled
 * from the 8-bit range to the 16-bit range, so 0xff stores 0xffff.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r16g16_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint16_t *out = (uint16_t *)(out_line + x0*4);

      for (col = 0; col < w; col++, in += 4) {
         /* 0xffff / 0xff == 0x101, so the rescale is one exact multiply. */
         for (chan = 0; chan < 2; chan++) {
            *out++ = (uint16_t)(in[chan] * 0x101);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as three 16-bit
 * unsigned normalized values per pixel, starting at pixel (x0, y0) of
 * dst (6 bytes per pixel).
 *
 * Bytes 0..2 of each 4-byte source pixel are read.  Each is rescaled
 * from the 8-bit range to the 16-bit range, so 0xff stores 0xffff.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r16g16b16_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint16_t *out = (uint16_t *)(out_line + x0*6);

      for (col = 0; col < w; col++, in += 4) {
         /* 0xffff / 0xff == 0x101, so the rescale is one exact multiply. */
         for (chan = 0; chan < 3; chan++) {
            *out++ = (uint16_t)(in[chan] * 0x101);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as four 16-bit
 * unsigned normalized values per pixel, starting at pixel (x0, y0) of
 * dst (8 bytes per pixel).
 *
 * All four bytes of each source pixel are read.  Each is rescaled from
 * the 8-bit range to the 16-bit range, so 0xff stores 0xffff.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r16g16b16a16_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint16_t *out = (uint16_t *)(out_line + x0*8);

      for (col = 0; col < w; col++, in += 4) {
         /* 0xffff / 0xff == 0x101, so the rescale is one exact multiply. */
         for (chan = 0; chan < 4; chan++) {
            *out++ = (uint16_t)(in[chan] * 0x101);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as one 16-bit
 * unsigned integer per pixel, starting at pixel (x0, y0) of dst
 * (2 bytes per pixel).
 *
 * Only byte 0 of each 4-byte source pixel is read.  It is divided by
 * 0xff with integer truncation, so 0xff stores 1 and every other byte
 * value stores 0.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r16_uscaled_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint16_t *out = (uint16_t *)(out_line + x0*2);

      for (col = 0; col < w; col++, in += 4) {
         out[col] = (uint16_t)(in[0] / 0xff);
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as two 16-bit
 * unsigned integers per pixel, starting at pixel (x0, y0) of dst
 * (4 bytes per pixel).
 *
 * Bytes 0 and 1 of each 4-byte source pixel are read.  Each is divided
 * by 0xff with integer truncation, so 0xff stores 1 and every other byte
 * value stores 0.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r16g16_uscaled_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint16_t *out = (uint16_t *)(out_line + x0*4);

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 2; chan++) {
            *out++ = (uint16_t)(in[chan] / 0xff);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as three 16-bit
 * unsigned integers per pixel, starting at pixel (x0, y0) of dst
 * (6 bytes per pixel).
 *
 * Bytes 0..2 of each 4-byte source pixel are read.  Each is divided by
 * 0xff with integer truncation, so 0xff stores 1 and every other byte
 * value stores 0.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r16g16b16_uscaled_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint16_t *out = (uint16_t *)(out_line + x0*6);

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 3; chan++) {
            *out++ = (uint16_t)(in[chan] / 0xff);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as four 16-bit
 * unsigned integers per pixel, starting at pixel (x0, y0) of dst
 * (8 bytes per pixel).
 *
 * All four bytes of each source pixel are read.  Each is divided by 0xff
 * with integer truncation, so 0xff stores 1 and every other byte value
 * stores 0.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r16g16b16a16_uscaled_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint16_t *out = (uint16_t *)(out_line + x0*8);

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 4; chan++) {
            *out++ = (uint16_t)(in[chan] / 0xff);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as one byte per
 * pixel, starting at pixel (x0, y0) of dst.
 *
 * Byte 0 of each 4-byte source pixel is copied unchanged; bytes 1..3 are
 * ignored.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint8_t *out = out_line + x0*1;

      for (col = 0; col < w; col++, in += 4) {
         out[col] = in[0];
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as two bytes per
 * pixel, starting at pixel (x0, y0) of dst.
 *
 * Bytes 0 and 1 of each 4-byte source pixel are copied unchanged; bytes
 * 2 and 3 are ignored.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r8g8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint8_t *out = out_line + x0*2;

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 2; chan++) {
            *out++ = in[chan];
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as three bytes per
 * pixel, starting at pixel (x0, y0) of dst.
 *
 * Bytes 0..2 of each 4-byte source pixel are copied unchanged; byte 3 is
 * ignored.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r8g8b8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint8_t *out = out_line + x0*3;

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 3; chan++) {
            *out++ = in[chan];
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as four bytes per
 * pixel, starting at pixel (x0, y0) of dst.
 *
 * All four bytes of each source pixel are copied unchanged and in the
 * same order.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r8g8b8a8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint8_t *out = out_line + x0*4;

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 4; chan++) {
            *out++ = in[chan];
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels into a destination
 * that uses four bytes per pixel, of which the last is padding, starting
 * at pixel (x0, y0) of dst.
 *
 * Bytes 0..2 of each 4-byte source pixel are copied unchanged into bytes
 * 0..2 of the destination pixel.  Source byte 3 is ignored and the
 * destination padding byte is written as zero.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r8g8b8x8_unorm_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   uint8_t *dst_row = dst + y0*dst_stride;
   const uint8_t *src_row = src;
   for (y = 0; y < h; ++y) {
      uint8_t *dst_pixel = (uint8_t *)(dst_row + x0*4);
      const uint8_t *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         dst_pixel[0] = src_pixel[0];
         dst_pixel[1] = src_pixel[1];
         dst_pixel[2] = src_pixel[2];
         dst_pixel[3] = 0;
         /* The destination pixel is 4 bytes wide even though only three
          * channels carry data; step over the padding byte as well so the
          * next pixel lands on its own 4-byte slot.
          */
         dst_pixel += 4;
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(uint8_t);
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as one 8-bit unsigned
 * integer per pixel, starting at pixel (x0, y0) of dst.
 *
 * Only byte 0 of each 4-byte source pixel is read.  It is divided by
 * 0xff with integer truncation, so 0xff stores 1 and every other byte
 * value stores 0.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r8_uscaled_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint8_t *out = out_line + x0*1;

      for (col = 0; col < w; col++, in += 4) {
         out[col] = (uint8_t)(in[0] / 0xff);
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as two 8-bit unsigned
 * integers per pixel, starting at pixel (x0, y0) of dst.
 *
 * Bytes 0 and 1 of each 4-byte source pixel are read.  Each is divided
 * by 0xff with integer truncation, so 0xff stores 1 and every other byte
 * value stores 0.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r8g8_uscaled_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint8_t *out = out_line + x0*2;

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 2; chan++) {
            *out++ = (uint8_t)(in[chan] / 0xff);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as three 8-bit
 * unsigned integers per pixel, starting at pixel (x0, y0) of dst.
 *
 * Bytes 0..2 of each 4-byte source pixel are read.  Each is divided by
 * 0xff with integer truncation, so 0xff stores 1 and every other byte
 * value stores 0.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r8g8b8_uscaled_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint8_t *out = out_line + x0*3;

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 3; chan++) {
            *out++ = (uint8_t)(in[chan] / 0xff);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels as four 8-bit
 * unsigned integers per pixel, starting at pixel (x0, y0) of dst.
 *
 * All four bytes of each source pixel are read.  Each is divided by 0xff
 * with integer truncation, so 0xff stores 1 and every other byte value
 * stores 0.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r8g8b8a8_uscaled_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   const uint8_t *in_line = src;
   uint8_t *out_line = dst + y0*dst_stride;
   unsigned row, col, chan;

   for (row = 0; row < h; row++, in_line += src_stride, out_line += dst_stride) {
      const uint8_t *in = in_line;
      uint8_t *out = out_line + x0*4;

      for (col = 0; col < w; col++, in += 4) {
         for (chan = 0; chan < 4; chan++) {
            *out++ = (uint8_t)(in[chan] / 0xff);
         }
      }
   }
}

/**
 * Store a w x h rectangle of 4-ubyte source pixels into a destination
 * that uses four bytes per pixel, of which the last is padding, starting
 * at pixel (x0, y0) of dst.
 *
 * Bytes 0..2 of each 4-byte source pixel are read.  Each is divided by
 * 0xff with integer truncation (0xff stores 1, every other byte value
 * stores 0) and written to bytes 0..2 of the destination pixel.  Source
 * byte 3 is ignored and the destination padding byte is written as zero.
 *
 * src_stride and dst_stride are byte strides.  src is read from its very
 * first byte; x0/y0 only offset the destination.
 */
static void
util_format_r8g8b8x8_uscaled_write_4ub(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)
{
   unsigned x, y;
   uint8_t *dst_row = dst + y0*dst_stride;
   const uint8_t *src_row = src;
   for (y = 0; y < h; ++y) {
      uint8_t *dst_pixel = (uint8_t *)(dst_row + x0*4);
      const uint8_t *src_pixel = src_row;
      for (x = 0; x < w; ++x) {
         dst_pixel[0] = (uint8_t)(src_pixel[0] / 0xff);
         dst_pixel[1] = (uint8_t)(src_pixel[1] / 0xff);
         dst_pixel[2] = (uint8_t)(src_pixel[2] / 0xff);
         dst_pixel[3] = 0;
         /* The destination pixel is 4 bytes wide even though only three
          * channels carry data; step over the padding byte as well so the
          * next pixel lands on its own 4-byte slot.
          */
         dst_pixel += 4;
         src_pixel += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(uint8_t);
   }
}

/**
 * Write a w x h rectangle of 4-ubyte pixels into a surface of the given
 * format.
 *
 * The format selects one of the per-format writers in this file, which
 * is then called with the remaining arguments passed through unchanged
 * (dst is handed over as a byte pointer).  For a format that has no case
 * below, "unsupported format" is printed through debug_printf() and the
 * function returns without calling any writer.
 */
void
util_format_write_4ub(enum pipe_format format, const uint8_t *src, unsigned src_stride, void *dst, unsigned dst_stride, unsigned x, unsigned y, unsigned w, unsigned h)
{
   typedef void (*write_4ub_func)(const uint8_t *src, unsigned src_stride, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h);
   write_4ub_func writer;

   switch (format) {
   case PIPE_FORMAT_A8R8G8B8_UNORM: writer = util_format_a8r8g8b8_unorm_write_4ub; break;
   case PIPE_FORMAT_X8R8G8B8_UNORM: writer = util_format_x8r8g8b8_unorm_write_4ub; break;
   case PIPE_FORMAT_B8G8R8A8_UNORM: writer = util_format_b8g8r8a8_unorm_write_4ub; break;
   case PIPE_FORMAT_B8G8R8X8_UNORM: writer = util_format_b8g8r8x8_unorm_write_4ub; break;
   case PIPE_FORMAT_A1R5G5B5_UNORM: writer = util_format_a1r5g5b5_unorm_write_4ub; break;
   case PIPE_FORMAT_A4R4G4B4_UNORM: writer = util_format_a4r4g4b4_unorm_write_4ub; break;
   case PIPE_FORMAT_R5G6B5_UNORM: writer = util_format_r5g6b5_unorm_write_4ub; break;
   case PIPE_FORMAT_A2B10G10R10_UNORM: writer = util_format_a2b10g10r10_unorm_write_4ub; break;
   case PIPE_FORMAT_L8_UNORM: writer = util_format_l8_unorm_write_4ub; break;
   case PIPE_FORMAT_A8_UNORM: writer = util_format_a8_unorm_write_4ub; break;
   case PIPE_FORMAT_I8_UNORM: writer = util_format_i8_unorm_write_4ub; break;
   case PIPE_FORMAT_A8L8_UNORM: writer = util_format_a8l8_unorm_write_4ub; break;
   case PIPE_FORMAT_L16_UNORM: writer = util_format_l16_unorm_write_4ub; break;
   case PIPE_FORMAT_Z16_UNORM: writer = util_format_z16_unorm_write_4ub; break;
   case PIPE_FORMAT_Z32_UNORM: writer = util_format_z32_unorm_write_4ub; break;
   case PIPE_FORMAT_Z32_FLOAT: writer = util_format_z32_float_write_4ub; break;
   case PIPE_FORMAT_S8Z24_UNORM: writer = util_format_s8z24_unorm_write_4ub; break;
   case PIPE_FORMAT_Z24S8_UNORM: writer = util_format_z24s8_unorm_write_4ub; break;
   case PIPE_FORMAT_X8Z24_UNORM: writer = util_format_x8z24_unorm_write_4ub; break;
   case PIPE_FORMAT_Z24X8_UNORM: writer = util_format_z24x8_unorm_write_4ub; break;
   case PIPE_FORMAT_R64_FLOAT: writer = util_format_r64_float_write_4ub; break;
   case PIPE_FORMAT_R64G64_FLOAT: writer = util_format_r64g64_float_write_4ub; break;
   case PIPE_FORMAT_R64G64B64_FLOAT: writer = util_format_r64g64b64_float_write_4ub; break;
   case PIPE_FORMAT_R64G64B64A64_FLOAT: writer = util_format_r64g64b64a64_float_write_4ub; break;
   case PIPE_FORMAT_R32_FLOAT: writer = util_format_r32_float_write_4ub; break;
   case PIPE_FORMAT_R32G32_FLOAT: writer = util_format_r32g32_float_write_4ub; break;
   case PIPE_FORMAT_R32G32B32_FLOAT: writer = util_format_r32g32b32_float_write_4ub; break;
   case PIPE_FORMAT_R32G32B32A32_FLOAT: writer = util_format_r32g32b32a32_float_write_4ub; break;
   case PIPE_FORMAT_R32_UNORM: writer = util_format_r32_unorm_write_4ub; break;
   case PIPE_FORMAT_R32G32_UNORM: writer = util_format_r32g32_unorm_write_4ub; break;
   case PIPE_FORMAT_R32G32B32_UNORM: writer = util_format_r32g32b32_unorm_write_4ub; break;
   case PIPE_FORMAT_R32G32B32A32_UNORM: writer = util_format_r32g32b32a32_unorm_write_4ub; break;
   case PIPE_FORMAT_R32_USCALED: writer = util_format_r32_uscaled_write_4ub; break;
   case PIPE_FORMAT_R32G32_USCALED: writer = util_format_r32g32_uscaled_write_4ub; break;
   case PIPE_FORMAT_R32G32B32_USCALED: writer = util_format_r32g32b32_uscaled_write_4ub; break;
   case PIPE_FORMAT_R32G32B32A32_USCALED: writer = util_format_r32g32b32a32_uscaled_write_4ub; break;
   case PIPE_FORMAT_R16_UNORM: writer = util_format_r16_unorm_write_4ub; break;
   case PIPE_FORMAT_R16G16_UNORM: writer = util_format_r16g16_unorm_write_4ub; break;
   case PIPE_FORMAT_R16G16B16_UNORM: writer = util_format_r16g16b16_unorm_write_4ub; break;
   case PIPE_FORMAT_R16G16B16A16_UNORM: writer = util_format_r16g16b16a16_unorm_write_4ub; break;
   case PIPE_FORMAT_R16_USCALED: writer = util_format_r16_uscaled_write_4ub; break;
   case PIPE_FORMAT_R16G16_USCALED: writer = util_format_r16g16_uscaled_write_4ub; break;
   case PIPE_FORMAT_R16G16B16_USCALED: writer = util_format_r16g16b16_uscaled_write_4ub; break;
   case PIPE_FORMAT_R16G16B16A16_USCALED: writer = util_format_r16g16b16a16_uscaled_write_4ub; break;
   case PIPE_FORMAT_R8_UNORM: writer = util_format_r8_unorm_write_4ub; break;
   case PIPE_FORMAT_R8G8_UNORM: writer = util_format_r8g8_unorm_write_4ub; break;
   case PIPE_FORMAT_R8G8B8_UNORM: writer = util_format_r8g8b8_unorm_write_4ub; break;
   case PIPE_FORMAT_R8G8B8A8_UNORM: writer = util_format_r8g8b8a8_unorm_write_4ub; break;
   case PIPE_FORMAT_R8G8B8X8_UNORM: writer = util_format_r8g8b8x8_unorm_write_4ub; break;
   case PIPE_FORMAT_R8_USCALED: writer = util_format_r8_uscaled_write_4ub; break;
   case PIPE_FORMAT_R8G8_USCALED: writer = util_format_r8g8_uscaled_write_4ub; break;
   case PIPE_FORMAT_R8G8B8_USCALED: writer = util_format_r8g8b8_uscaled_write_4ub; break;
   case PIPE_FORMAT_R8G8B8A8_USCALED: writer = util_format_r8g8b8a8_uscaled_write_4ub; break;
   case PIPE_FORMAT_R8G8B8X8_USCALED: writer = util_format_r8g8b8x8_uscaled_write_4ub; break;
   default:
      debug_printf("unsupported format\n");
      return;
   }

   writer(src, src_stride, (uint8_t *)dst, dst_stride, x, y, w, h);
}

