/*
 * Copyright (c) 2011-2013 Luc Verhaegen
 * Copyright (c) 2018 Alyssa Rosenzweig
 * Copyright (c) 2018 Vasily Khoruzhick
 * Copyright (c) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include "pan_tiling.h"
#include <stdbool.h>
#include "util/macros.h"

/* This file implements software encode/decode of the tiling format used for
 * textures and framebuffers primarily on Utgard GPUs. Names for this format
 * include "Utgard-style tiling", "(Mali) swizzled textures", and
 * "U-interleaved" (the former two names being used in the community
 * Lima/Panfrost drivers; the latter name used internally at Arm).
 * Conceptually, like any tiling scheme, the pixel reordering aims to exploit
 * 2D spatial locality, improving cache behaviour in both the horizontal and
 * vertical directions.
 *
 * This format is tiled: first, the image dimensions must be aligned to 16
 * pixels in each axis. Once aligned, the image is divided into 16x16 tiles.
 * This size harmonizes with other properties of the GPU; on Midgard,
 * framebuffer tiles are logically 16x16 (this is the tile size used in
 * Transaction Elimination and the minimum tile size used in Hierarchical
 * Tiling). Likewise, for a standard 4 bytes-per-pixel format (like
 * RGBA8888), 16 pixels * 4 bytes/pixel = 64 bytes, equal to the cache line
 * size.
 *
 * Within each 16x16 block, the bits are reordered according to this pattern:
 *
 * | y3 | (x3 ^ y3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
 *
 * That is, the X and Y bits are interleaved, with an XOR thrown in for every
 * adjacent bit pair.
 *
 * Both encode and decode are cheap to implement, in hardware and in software.
 * In hardware, lines are simply rerouted to reorder, and some XOR gates are
 * thrown in. Software has to be a bit more clever.
 *
 * In software, the trick is to divide the pattern into two lines:
 *
 *   | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
 * ^ |  0 | x3 |  0 | x2 |  0 | x1 |  0 | x0 |
 *
 * That is, duplicate the bits of the Y and space out the bits of the X. The
 * top line is a function only of Y, so it can be calculated once per row and
 * stored in a register. The bottom line is simply X with the bits spaced out.
 * Spacing out the X is easy enough with a LUT, or by subtracting and ANDing
 * the mask pattern (abusing carry bits).
 *
 * This format is also supported on Midgard GPUs, where it *can* be used for
 * textures and framebuffers. That said, in practice it is usually used as a
 * fallback layout; Midgard introduces Arm FrameBuffer Compression (AFBC),
 * which is significantly more efficient than Utgard-style tiling and is
 * preferred for both textures and framebuffers where possible. For resources
 * AFBC cannot handle, for instance sRGB textures and framebuffers, this
 * tiling scheme is used instead, at a performance penalty.
 */
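/* The two helpers below are purely illustrative sketches of the bit
 * manipulations described above; they are not used by the driver code that
 * follows, and their names are invented for this explanation.
 *
 * The first computes the intra-tile index for a pixel at (x, y), with
 * 0 <= x, y <= 15, directly from the interleave-and-XOR pattern. The
 * table-driven code below produces the same values via
 * bit_duplication[y] ^ space_4[x]. */

static inline unsigned
example_tile_index(unsigned x, unsigned y)
{
   unsigned index = 0;

   for (unsigned bit = 0; bit < 4; ++bit) {
      unsigned xb = (x >> bit) & 1;
      unsigned yb = (y >> bit) & 1;

      /* Bit pair for this position: (y_bit ^ x_bit) in the even slot,
       * y_bit in the odd slot */
      index |= (xb ^ yb) << (2 * bit + 0);
      index |= yb        << (2 * bit + 1);
   }

   return index;
}

/* The second sketches the "subtract and AND the mask pattern" trick: given
 * x already spaced out onto the even bit positions, stepping to the next x
 * is a subtraction that exploits carries through the unused odd bit
 * positions, followed by a mask to clear them again. The result equals
 * space_4[(x + 1) & 0xF] when the input is space_4[x & 0xF]. */

static inline unsigned
example_spaced_x_increment(unsigned spaced_x)
{
   return (spaced_x - 0b01010101) & 0b01010101;
}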
/* Given the lower 4-bits of the Y coordinate, we would like to
 * duplicate every bit over. So instead of 0b1010, we would like
 * 0b11001100. The idea is that for the bits in the solely Y place, we
 * get a Y place, and the bits in the XOR place *also* get a Y. */

const uint32_t bit_duplication[16] = {
   0b00000000,
   0b00000011,
   0b00001100,
   0b00001111,
   0b00110000,
   0b00110011,
   0b00111100,
   0b00111111,
   0b11000000,
   0b11000011,
   0b11001100,
   0b11001111,
   0b11110000,
   0b11110011,
   0b11111100,
   0b11111111,
};

/* Space the bits out of a 4-bit nibble */

const unsigned space_4[16] = {
   0b0000000,
   0b0000001,
   0b0000100,
   0b0000101,
   0b0010000,
   0b0010001,
   0b0010100,
   0b0010101,
   0b1000000,
   0b1000001,
   0b1000100,
   0b1000101,
   0b1010000,
   0b1010001,
   0b1010100,
   0b1010101
};

/* The scheme uses 16x16 tiles */

#define TILE_WIDTH 16
#define TILE_HEIGHT 16
#define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)

/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type
 * need only support copies and sizeof, so emulating it with a packed
 * structure works well enough, but if there's a native 128-bit type we may
 * as well prefer that. */

#ifdef __SIZEOF_INT128__
typedef __uint128_t pan_uint128_t;
#else
typedef struct {
   uint64_t lo;
   uint64_t hi;
} __attribute__((packed)) pan_uint128_t;
#endif

typedef struct {
   uint16_t lo;
   uint8_t hi;
} __attribute__((packed)) pan_uint24_t;

/* Optimized routine to tile an aligned (w & 0xF == 0) texture. Explanation:
 *
 * dest_start precomputes the offset to the beginning of the first horizontal
 * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are
 * stored linearly, so we get the X tile number by shifting and then multiply
 * by the bytes per tile.
 *
 * We iterate across the pixels we're trying to store in source-order. For each
 * row in the destination image, we figure out which row of 16x16 blocks we're
 * in, by slicing off the lower 4-bits (block_y).
 *
 * dest then precomputes the location of the top-left corner of the block the
 * row starts in. In pixel coordinates (where the origin is the top-left),
 * (block_y, 0) is the top-left corner of the leftmost tile in this row. While
 * pixels are reordered within a block, the blocks themselves are stored
 * linearly, so multiplying block_y by the byte stride of the destination
 * image gives the byte offset of the top-left corner of the block this row
 * is in.
 *
 * On the other hand, the source is linear, so we compute the locations of the
 * start and end of the row in the source by simple linear addressing.
 *
 * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0
 * y0] value. Since this is constant across a row, we look it up per-row and
 * store it in expanded_y.
 *
 * Finally, we iterate each row in source order. In the outer loop, we iterate
 * each 16-pixel tile. Within each tile, we iterate the 16 pixels (this should
 * be unrolled), calculating the index within the tile and writing.
 */
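/* Worked example (32-bit pixels, shift = 2), under the alignment assumptions
 * above: storing to destination coordinates (21, 5) with sx = 16, sy = 0.
 * The pixel lies in the second tile of the first tile row, so dest_start
 * already points (16 >> 4) * 256 * 4 = 1024 bytes into the image, and
 * block_y = 5 & ~0xF = 0 adds no stride term for this row. Within the tile,
 * expanded_y = bit_duplication[5] << 2 = 0xCC and space_4[21 & 0xF] << 2 =
 * 0x44, so index = 0xCC ^ 0x44 = 0x88: the pixel lands at byte 136 (pixel
 * 34) of the tile, i.e. at dst + 1024 + 136. */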
#define TILED_ACCESS_TYPE(pixel_t, shift) \
static ALWAYS_INLINE void \
panfrost_access_tiled_image_##pixel_t \
                              (void *dst, void *src, \
                               uint16_t sx, uint16_t sy, \
                               uint16_t w, uint16_t h, \
                               uint32_t dst_stride, \
                               uint32_t src_stride, \
                               bool is_store) \
{ \
   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      uint16_t block_y = y & ~0x0f; \
      uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \
      pixel_t *source = src + (src_y * src_stride); \
      pixel_t *source_end = source + w; \
      unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
      for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
         for (uint8_t i = 0; i < 16; ++i) { \
            unsigned index = expanded_y ^ (space_4[i] << shift); \
            if (is_store) \
               *((pixel_t *) (dest + index)) = *(source++); \
            else \
               *(source++) = *((pixel_t *) (dest + index)); \
         } \
      } \
   } \
}

TILED_ACCESS_TYPE(uint8_t, 0);
TILED_ACCESS_TYPE(uint16_t, 1);
TILED_ACCESS_TYPE(uint32_t, 2);
TILED_ACCESS_TYPE(uint64_t, 3);
TILED_ACCESS_TYPE(pan_uint128_t, 4);

#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
   const unsigned mask = (1 << tile_shift) - 1; \
   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
      unsigned block_y = y & ~mask; \
      unsigned block_start_s = block_y * dst_stride; \
      unsigned source_start = src_y * src_stride; \
      unsigned expanded_y = bit_duplication[y & mask]; \
 \
      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
         unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \
         unsigned index = expanded_y ^ space_4[x & mask]; \
         uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \
         uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \
 \
         pixel_t *outp = (pixel_t *) (is_store ? dest : source); \
         pixel_t *inp = (pixel_t *) (is_store ? source : dest); \
         *outp = *inp; \
      } \
   } \
}

#define TILED_UNALIGNED_TYPES(store, shift) { \
   if (bpp == 8) \
      TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
   else if (bpp == 16) \
      TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
   else if (bpp == 24) \
      TILED_UNALIGNED_TYPE(pan_uint24_t, store, shift) \
   else if (bpp == 32) \
      TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
   else if (bpp == 64) \
      TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
   else if (bpp == 128) \
      TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
}

/*
 * Perform a generic access to a tiled image with a given format. This works
 * even for block-compressed images on entire blocks at a time. sx/sy/w/h are
 * specified in pixels, not blocks, but our internal routines work in blocks,
 * so we divide here. Alignment to the block size is assumed.
 */
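/* For example, for ETC2 RGB8 (4x4 pixel blocks, 64 bits per block), a 64x32
 * pixel region becomes a 16x8 block region copied with uint64_t accesses.
 * Because desc->block.width > 1, the blocks are grouped with tile_shift = 2,
 * i.e. into 4x4 groups of blocks, so each group again covers 16x16 pixels. */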
static void
panfrost_access_tiled_image_generic(void *dst, void *src,
                               unsigned sx, unsigned sy,
                               unsigned w, unsigned h,
                               uint32_t dst_stride,
                               uint32_t src_stride,
                               const struct util_format_description *desc,
                               bool _is_store)
{
   unsigned bpp = desc->block.bits;

   /* Convert units */
   sx /= desc->block.width;
   sy /= desc->block.height;
   w = DIV_ROUND_UP(w, desc->block.width);
   h = DIV_ROUND_UP(h, desc->block.height);

   if (desc->block.width > 1) {
      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 2)
      else
         TILED_UNALIGNED_TYPES(false, 2)
   } else {
      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 4)
      else
         TILED_UNALIGNED_TYPES(false, 4)
   }
}

#define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8)))

static ALWAYS_INLINE void
panfrost_access_tiled_image(void *dst, void *src,
                            unsigned x, unsigned y,
                            unsigned w, unsigned h,
                            uint32_t dst_stride,
                            uint32_t src_stride,
                            enum pipe_format format,
                            bool is_store)
{
   const struct util_format_description *desc = util_format_description(format);

   if (desc->block.width > 1 || desc->block.bits == 24) {
      panfrost_access_tiled_image_generic(dst, (void *) src,
            x, y, w, h,
            dst_stride, src_stride, desc, is_store);

      return;
   }

   unsigned bpp = desc->block.bits;

   unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
   unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
   unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
   unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;

   /* First, tile the top portion */

   unsigned orig_x = x, orig_y = y;

   if (first_full_tile_y != y) {
      unsigned dist = MIN2(first_full_tile_y - y, h);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
            x, y, w, dist,
            dst_stride, src_stride, desc, is_store);

      if (dist == h)
         return;

      y += dist;
      h -= dist;
   }

   /* Next, the bottom portion */
   if (last_full_tile_y != (y + h)) {
      unsigned dist = (y + h) - last_full_tile_y;

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y),
            x, last_full_tile_y, w, dist,
            dst_stride, src_stride, desc, is_store);

      h -= dist;
   }

   /* The left portion */
   if (first_full_tile_x != x) {
      unsigned dist = MIN2(first_full_tile_x - x, w);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
            x, y, dist, h,
            dst_stride, src_stride, desc, is_store);

      if (dist == w)
         return;

      x += dist;
      w -= dist;
   }

   /* Finally, the right portion */
   if (last_full_tile_x != (x + w)) {
      unsigned dist = (x + w) - last_full_tile_x;

      panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y),
            last_full_tile_x, y, dist, h,
            dst_stride, src_stride, desc, is_store);

      w -= dist;
   }

   if (bpp == 8)
      panfrost_access_tiled_image_uint8_t(dst, OFFSET(src, x, y),
            x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 16)
      panfrost_access_tiled_image_uint16_t(dst, OFFSET(src, x, y),
            x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 32)
      panfrost_access_tiled_image_uint32_t(dst, OFFSET(src, x, y),
            x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 64)
      panfrost_access_tiled_image_uint64_t(dst, OFFSET(src, x, y),
            x, y, w, h, dst_stride, src_stride, is_store);
   else if (bpp == 128)
      panfrost_access_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y),
            x, y, w, h, dst_stride, src_stride, is_store);
}
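/* As a concrete example of the decomposition above: for a format taking the
 * fast path and a region x = 5, y = 5, w = 40, h = 40, the generic routine
 * handles the top strip (rows 5-15, full width), the bottom strip (rows
 * 32-44, full width), then the left strip (columns 5-15, rows 16-31) and the
 * right strip (columns 32-44, rows 16-31). The remaining 16x16-aligned
 * interior, (16, 16) through (31, 31), is handled by the optimized per-type
 * routine. */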
/**
 * Access a tiled image (load or store). Note: the region of interest (x, y,
 * w, h) is specified in pixels, not blocks. It is expected that these
 * quantities are aligned to the block size.
 */
void
panfrost_store_tiled_image(void *dst, const void *src,
                           unsigned x, unsigned y,
                           unsigned w, unsigned h,
                           uint32_t dst_stride,
                           uint32_t src_stride,
                           enum pipe_format format)
{
   panfrost_access_tiled_image(dst, (void *) src,
         x, y, w, h,
         dst_stride, src_stride, format, true);
}

void
panfrost_load_tiled_image(void *dst, const void *src,
                          unsigned x, unsigned y,
                          unsigned w, unsigned h,
                          uint32_t dst_stride,
                          uint32_t src_stride,
                          enum pipe_format format)
{
   panfrost_access_tiled_image((void *) src, dst,
         x, y, w, h,
         src_stride, dst_stride, format, false);
}
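/* Illustrative usage sketch (not part of the driver; the helper name and its
 * parameters are invented for this example): upload a linear RGBA8 staging
 * buffer into a u-interleaved texture. Both strides are in bytes per pixel
 * row; for the tiled side, 16 such rows make up one row of 16x16 tiles. */

static inline void
example_upload_rgba8(void *tiled, const void *linear,
                     unsigned w, unsigned h,
                     uint32_t tiled_stride, uint32_t linear_stride)
{
   /* The (0, 0) origin need not be tile-aligned; edges that are not aligned
    * to 16x16 fall back to the generic path internally. */
   panfrost_store_tiled_image(tiled, linear, 0, 0, w, h,
                              tiled_stride, linear_stride,
                              PIPE_FORMAT_R8G8B8A8_UNORM);
}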