/*
 * Copyright (C) 2018 Alyssa Rosenzweig
 * Copyright (C) 2019-2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "pan_device.h"
#include "pan_shader.h"
#include "pan_format.h"

#if PAN_ARCH <= 5
#include "panfrost/midgard/midgard_compile.h"
#else
#include "panfrost/bifrost/bifrost_compile.h"
#endif

/*
 * Return the NIR compiler options of the backend selected at build time by
 * PAN_ARCH: the Midgard options for PAN_ARCH <= 5, the Bifrost options
 * otherwise.
 */
const nir_shader_compiler_options *
GENX(pan_shader_get_compiler_options)(void)
{
#if PAN_ARCH <= 5
        return &midgard_nir_options;
#else
        return &bifrost_nir_options;
#endif
}

/*
 * Map a sized NIR ALU type and a component count to the pipe format used for
 * a varying.
 *
 * t:      sized NIR ALU type to look up (matched exactly against the table)
 * ncomps: number of components, asserted to be between 1 and 4 inclusive
 *
 * Returns the matching format, or PIPE_FORMAT_NONE if the type has no row in
 * the table.
 */
static enum pipe_format
varying_format(nir_alu_type t, unsigned ncomps)
{
        /* Builds one table row: a NIR type followed by its R, RG, RGB and RGBA
         * formats, indexed by (component count - 1).
         */
#define VARYING_ROW(ntype, nsz, ptype, psz) \
        { \
                .type = nir_type_ ## ntype ## nsz, \
                .formats = { \
                        [0] = PIPE_FORMAT_R ## psz ## _ ## ptype, \
                        [1] = PIPE_FORMAT_R ## psz ## G ## psz ## _ ## ptype, \
                        [2] = PIPE_FORMAT_R ## psz ## G ## psz ## B ## psz ## _ ## ptype, \
                        [3] = PIPE_FORMAT_R ## psz ## G ## psz ## B ## psz ## A ## psz ## _ ## ptype, \
                } \
        }

        static const struct {
                nir_alu_type type;
                enum pipe_format formats[4];
        } conv[] = {
                /* 32-bit numeric types */
                VARYING_ROW(float, 32, FLOAT, 32),
                VARYING_ROW(int, 32, SINT, 32),
                VARYING_ROW(uint, 32, UINT, 32),

                /* 16-bit numeric types */
                VARYING_ROW(float, 16, FLOAT, 16),
                VARYING_ROW(int, 16, SINT, 16),
                VARYING_ROW(uint, 16, UINT, 16),

                /* 8-bit integer types */
                VARYING_ROW(int, 8, SINT, 8),
                VARYING_ROW(uint, 8, UINT, 8),

                /* Booleans use unsigned integer formats; 1-bit booleans use
                 * the 8-bit ones.
                 */
                VARYING_ROW(bool, 32, UINT, 32),
                VARYING_ROW(bool, 16, UINT, 16),
                VARYING_ROW(bool, 8, UINT, 8),
                VARYING_ROW(bool, 1, UINT, 8),
        };
#undef VARYING_ROW

        assert(ncomps > 0 && ncomps <= ARRAY_SIZE(conv[0].formats));

        /* Linear scan for the first row whose type matches exactly */
        const unsigned num_rows = ARRAY_SIZE(conv);
        unsigned row = 0;

        while (row < num_rows && conv[row].type != t)
                row++;

        if (row == num_rows)
                return PIPE_FORMAT_NONE;

        return conv[row].formats[ncomps - 1];
}

/*
 * Gather the varyings of a given variable mode from a shader.
 *
 * For every variable of mode varying_mode, the slots of varyings[] starting
 * at the variable's driver_location are filled with a location and a format,
 * and *varying_count is raised to cover the highest slot written.
 *
 * s:             shader whose variables are walked
 * varying_mode:  variable mode(s) to collect
 * varyings:      output array indexed by driver location; every index used is
 *                asserted to be below PAN_MAX_VARYINGS
 * varying_count: output; one past the highest slot written, 0 if no variable
 *                matched
 */
static void
collect_varyings(nir_shader *s, nir_variable_mode varying_mode,
                 struct pan_shader_varying *varyings,
                 unsigned *varying_count)
{
        *varying_count = 0;

        /* Largest component count requested at each driver location */
        unsigned comps[PAN_MAX_VARYINGS] = { 0 };

        /* First pass: several variables may share a driver location, so find
         * the widest component count needed at each one.
         */
        nir_foreach_variable_with_modes(var, s, varying_mode) {
                unsigned loc = var->data.driver_location;
                const struct glsl_type *column =
                        glsl_without_array_or_matrix(var->type);
                unsigned chan = glsl_get_components(column);

                /* If we have a fractional location added, we need to increase the size
                 * so it will fit, i.e. a vec3 in YZW requires us to allocate a vec4.
                 * We could do better but this is an edge case as it is, normally
                 * packed varyings will be aligned.
                 */
                chan += var->data.location_frac;

                /* comps[] is a fixed-size stack array: catch an out-of-range
                 * location before it is written, not only in the second pass.
                 */
                assert(loc < PAN_MAX_VARYINGS);
                comps[loc] = MAX2(comps[loc], chan);
        }

        /* Second pass: pick a format per variable and fill in its slots */
        nir_foreach_variable_with_modes(var, s, varying_mode) {
                unsigned loc = var->data.driver_location;
                unsigned sz = glsl_count_attribute_slots(var->type, false);
                const struct glsl_type *column =
                        glsl_without_array_or_matrix(var->type);
                enum glsl_base_type base_type = glsl_get_base_type(column);
                unsigned chan = comps[loc];

                /* Start from the unsized base type; the bit size is OR'd in below */
                nir_alu_type type = nir_get_nir_type_for_glsl_base_type(base_type);
                type = nir_alu_type_get_base_type(type);

                /* Can't do type conversion since GLSL IR packs in funny ways */
                if (PAN_ARCH >= 6 && var->data.interpolation == INTERP_MODE_FLAT)
                        type = nir_type_uint;

                /* Demote to fp16 where possible. int16 varyings are TODO as the hw
                 * will saturate instead of wrap which is not conformant, so we need to
                 * insert i2i16/u2u16 instructions before the st_vary_32i/32u to get
                 * the intended behaviour.
                 */
                if (type == nir_type_float &&
                    (var->data.precision == GLSL_PRECISION_MEDIUM ||
                     var->data.precision == GLSL_PRECISION_LOW) &&
                    !s->info.has_transform_feedback_varyings) {
                        type |= 16;
                } else {
                        type |= 32;
                }

                enum pipe_format format = varying_format(type, chan);
                assert(format != PIPE_FORMAT_NONE);

                /* A variable spanning several attribute slots gets consecutive
                 * entries, all sharing the same format.
                 */
                for (unsigned c = 0; c < sz; ++c) {
                        assert(loc + c < PAN_MAX_VARYINGS);
                        varyings[loc + c].location = var->data.location + c;
                        varyings[loc + c].format = format;
                }

                *varying_count = MAX2(*varying_count, loc + sz);
        }
}

#if PAN_ARCH >= 6
/*
 * Translate the NIR ALU type of a blend input into the matching register
 * file format. A type of 0 (render target not in use) yields 0; any type
 * other than the 16/32-bit float, int and uint types is unreachable.
 */
static enum mali_register_file_format
bifrost_blend_type_from_nir(nir_alu_type nir_type)
{
        /* Stays 0 for an unused render target */
        enum mali_register_file_format fmt = 0;

        switch (nir_type) {
        case 0: /* Render target not in use */
                break;

        /* Floating point */
        case nir_type_float32:
                fmt = MALI_REGISTER_FILE_FORMAT_F32;
                break;
        case nir_type_float16:
                fmt = MALI_REGISTER_FILE_FORMAT_F16;
                break;

        /* Signed integer */
        case nir_type_int32:
                fmt = MALI_REGISTER_FILE_FORMAT_I32;
                break;
        case nir_type_int16:
                fmt = MALI_REGISTER_FILE_FORMAT_I16;
                break;

        /* Unsigned integer */
        case nir_type_uint32:
                fmt = MALI_REGISTER_FILE_FORMAT_U32;
                break;
        case nir_type_uint16:
                fmt = MALI_REGISTER_FILE_FORMAT_U16;
                break;

        default:
                unreachable("Unsupported blend shader type for NIR alu type");
        }

        return fmt;
}
#endif

/*
 * Compile a NIR shader with the backend selected by PAN_ARCH and fill in the
 * shader info structure.
 *
 * s:      shader to compile; handed to the backend compiler, after which its
 *         shader_info (s->info) is read to populate *info
 * inputs: compile inputs handed to the backend. On PAN_ARCH <= 5,
 *         inputs->raw_fmt_mask is updated here before the backend runs.
 * binary: dynarray handed through to the backend compiler
 * info:   output; zeroed on entry, passed to the backend compiler, then
 *         completed from s->info
 *
 * Only vertex, fragment and compute stages are handled; any other stage hits
 * unreachable().
 */
void
GENX(pan_shader_compile)(nir_shader *s,
                         struct panfrost_compile_inputs *inputs,
                         struct util_dynarray *binary,
                         struct pan_shader_info *info)
{
        memset(info, 0, sizeof(*info));

#if PAN_ARCH >= 6
        bifrost_compile_shader_nir(s, inputs, binary, info);
#else
        /* Flag every render target whose writeback format sorts below
         * MALI_COLOR_FORMAT_R8 in the raw format mask.
         */
        for (unsigned i = 0; i < ARRAY_SIZE(inputs->rt_formats); i++) {
                enum pipe_format fmt = inputs->rt_formats[i];
                unsigned wb_fmt = panfrost_blendable_formats_v6[fmt].writeback;

                if (wb_fmt < MALI_COLOR_FORMAT_R8)
                        inputs->raw_fmt_mask |= BITFIELD_BIT(i);
        }

        midgard_compile_shader_nir(s, inputs, binary, info);
#endif

        /* Stage-independent properties copied from the NIR shader info */
        info->stage = s->info.stage;
        info->contains_barrier = s->info.uses_memory_barrier ||
                                 s->info.uses_control_barrier;
        info->separable = s->info.separate_shader;

        switch (info->stage) {
        case MESA_SHADER_VERTEX:
                info->attribute_count = util_bitcount64(s->info.inputs_read);

#if PAN_ARCH <= 5
                /* When the vertex/instance ID system values are read, make
                 * sure the attribute count covers the PAN_VERTEX_ID and
                 * PAN_INSTANCE_ID indices.
                 */
                bool vertex_id = BITSET_TEST(s->info.system_values_read,
                                             SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
                if (vertex_id)
                        info->attribute_count = MAX2(info->attribute_count, PAN_VERTEX_ID + 1);

                bool instance_id = BITSET_TEST(s->info.system_values_read,
                                               SYSTEM_VALUE_INSTANCE_ID);
                if (instance_id)
                        info->attribute_count = MAX2(info->attribute_count, PAN_INSTANCE_ID + 1);
#endif

                /* outputs_written is a 64-bit mask: test it with a 64-bit
                 * bit rather than an int shift.
                 */
                info->vs.writes_point_size =
                        s->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ);
                collect_varyings(s, nir_var_shader_out, info->varyings.output,
                                 &info->varyings.output_count);
                break;
        case MESA_SHADER_FRAGMENT:
                if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
                        info->fs.writes_depth = true;
                if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
                        info->fs.writes_stencil = true;
                if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK))
                        info->fs.writes_coverage = true;

                /* Rebase the masks so that bit 0 is FRAG_RESULT_DATA0 */
                info->fs.outputs_read = s->info.outputs_read >> FRAG_RESULT_DATA0;
                info->fs.outputs_written = s->info.outputs_written >> FRAG_RESULT_DATA0;

                /* EXT_shader_framebuffer_fetch requires per-sample */
                info->fs.sample_shading = s->info.fs.uses_sample_shading ||
                                          info->fs.outputs_read;

                info->fs.can_discard = s->info.fs.uses_discard;
                info->fs.helper_invocations = s->info.fs.needs_quad_helper_invocations;
                info->fs.early_fragment_tests = s->info.fs.early_fragment_tests;

                /* List of reasons we need to execute frag shaders when things
                 * are masked off */

                info->fs.sidefx = s->info.writes_memory ||
                                  s->info.fs.uses_discard ||
                                  s->info.fs.uses_demote;

                /* With suitable ZSA/blend, is early-z possible? */
                info->fs.can_early_z =
                        !info->fs.sidefx &&
                        !info->fs.writes_depth &&
                        !info->fs.writes_stencil &&
                        !info->fs.writes_coverage;

                /* Similarly with suitable state, is FPK possible? */
                info->fs.can_fpk =
                        !info->fs.writes_depth &&
                        !info->fs.writes_stencil &&
                        !info->fs.writes_coverage &&
                        !info->fs.can_discard &&
                        !info->fs.outputs_read;

                /* Some inputs may show up either as a varying slot in
                 * inputs_read or as a system value; check both. inputs_read is
                 * a 64-bit mask, hence BITFIELD64_BIT.
                 */
                info->fs.reads_frag_coord =
                        (s->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_POS)) ||
                        BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
                info->fs.reads_point_coord =
                        s->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC);
                info->fs.reads_face =
                        (s->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_FACE)) ||
                        BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRONT_FACE);
                info->fs.reads_sample_id =
                        BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID);
                info->fs.reads_sample_pos =
                        BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS);
                info->fs.reads_sample_mask_in =
                        BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
                info->fs.reads_helper_invocation =
                        BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_HELPER_INVOCATION);
                collect_varyings(s, nir_var_shader_in, info->varyings.input,
                                 &info->varyings.input_count);
                break;
        case MESA_SHADER_COMPUTE:
                info->wls_size = s->info.shared_size;
                break;
        default:
                unreachable("Unknown shader state");
        }

        info->outputs_written = s->info.outputs_written;

        /* Sysvals have dedicated UBO */
        if (info->sysvals.sysval_count)
                info->ubo_count = MAX2(s->info.num_ubos + 1, inputs->sysval_ubo + 1);
        else
                info->ubo_count = s->info.num_ubos;

        /* Images used by the shader are counted on top of the attributes */
        info->attribute_count += util_last_bit(s->info.images_used);
        info->writes_global = s->info.writes_memory;

        /* Sampler and texture counts are both derived from the highest
         * texture index used.
         */
        info->sampler_count = info->texture_count = BITSET_LAST_BIT(s->info.textures_used);

#if PAN_ARCH >= 6
        /* This is "redundant" information, but is needed in a draw-time hot path */
        for (unsigned i = 0; i < ARRAY_SIZE(info->bifrost.blend); ++i) {
                info->bifrost.blend[i].format =
                        bifrost_blend_type_from_nir(info->bifrost.blend[i].type);
        }
#endif
}