dca3-game/vendor/koshle/pvr_internal.h

/* KallistiOS ##version##

   pvr_internal.h
   Copyright (C) 2002, 2003, 2004 Megan Potter

 */

#pragma once

#include <cstdint>
#include <cassert>

/* Translate a 32-bit offset; b32_uint32 below uses the result as a byte index
   into emu_vram. Defined elsewhere.
   NOTE(review): presumably converts a linear VRAM offset into the PVR 32-bit
   access-path layout -- confirm against the definition. */
uint32_t pvr_map32(uint32_t offset32);

/* A 32-bit word that is expected to live inside emu_vram. Stores are never
   performed on the object itself: the object's byte offset from emu_vram is
   passed through pvr_map32() and the write lands on the translated word. */
struct b32_uint32 {
    uint32_t data;

    /* Overwrite the translated word with `value`. */
    void operator=(uint32_t value) {
        *translated_word() = value;
    }

    /* OR `bits` into the translated word. */
    void operator|=(uint32_t bits) {
        *translated_word() |= bits;
    }

private:
    /* Find the word in emu_vram that backs this object. Asserts that the
       object sits on a 4-byte boundary and within PVR_RAM_SIZE bytes of the
       start of emu_vram before translating its offset. */
    uint32_t *translated_word() {
        const size_t byte_offset = (uint8_t*)this - emu_vram;
        assert((byte_offset & 3) == 0);
        assert(byte_offset < PVR_RAM_SIZE);

        return (uint32_t*)&emu_vram[pvr_map32(byte_offset)];
    }
};
static_assert(sizeof(b32_uint32) == 4, "b32_uint32 size mismatch");


#ifndef __PVR_INTERNAL_H
#define __PVR_INTERNAL_H

/* Various implementation details are contained in here; this should only ever
   be included by modules in this directory.

   Everything from here down is considered internal to the implementation
   and may change without notice. So please don't rely on it in your
   code. If something is needed from this, an external interface should
   be added to dc/pvr.h. */

#include "kos/sem.h"
// #include <kos/mutex.h>

/**** State stuff ***************************************************/

/* The internal workings of the PVR2 are quite complex, and thank goodness
   we have the TA to help us with this setup process for each frame, or
   it'd be a LOT more work!

   Basically you have three different sets of buffers while registering
   scene data:

   1) Vertex buffer: this is a PVR RAM buffer that holds processed vertex
      data as it is fed to the TA
   2) Object pointer buffer: this is essentially an array of lists which
      holds data about which objects may appear in which tiles; for
      some odd reason, it grows down (probably so you don't have to pre-size
      vertex and OPB buffers, kinda like heap and stack)
   3) Tile matrix: this has a fixed-size entry for each tile that will
      be rendered to; each active list must have a pointer into the OPB,
      or an "end of list" marker to mark it as having no OPB space

   As the TA collects data, the buffers may start to overflow if you have
   a lot of polygons, and that's what the grow space is about. It is
   initially using a fairly small amount of PVR RAM to hold the data
   structures, but as it overflows the bins for each tile, it must
   allocate a new block.

   3D processing proceeds in a pipeline fashion. There are four functional
   units we have to consider in this process: the main CPU, the tile
   accelerator, the ISP/TSP, and the visual output.

   If vertex DMA is enabled, then the TA may optionally be fed by DMA instead
   of directly by the CPU, which will free the CPU up from stalls that may
   happen with certain polygons when feeding the TA, as well as enabling other
   benefits.

   So in an ideal situation with no DMA enabled, it looks like this:

   VBlanks  SH4-to-TA   ISP/TSP         View
   0        ->T0        -           -
   1        ->T1        T0->F0          -
   2        ->T0        T1->F1          F0
   3        ->T1        T0->F0          F1
   ...

   When vertex DMA is enabled, we go into a naive 3-stage setup. This can
   be improved later, but it's a start for now.

   In this mode, we augment the timing diagram above:

   VBlanks  SH4-to-RAM  DMA-to-TA   ISP/TSP         View
   0        ->R0        -       -           -
   1        ->R1        R0->T0      -           -
   2        ->R0        R1->T1      T0->F0          -
   3        ->R1        R0->T0      T1->F1          F0
   4        ->R0        R1->T1      T0->F0          F1
   ...

   In the current naive implementation, everything is timed off of vblank
   interrupts. So the program can write vertices to the RAM buffers as long
   as it wants. On the first vblank where the current RAM buffers are filled
   up, DMA proceeds from the filled buffer to the TA. On the first vblank
   where all the TA transfers have completed, ISP/TSP rendering is started.
   On the first vblank where a frame has been completed, the view is switched
   to the frame. Thus everything sort of cascades in natural order when it's
   ready. This also solves the issue in previous versions where one would
   write a single frame and it'd never show up unless you push through
   several more frames. For example, a single frame written would look
   like this:

   VBlanks  SH4-to-RAM  DMA-to-TA   ISP/TSP         View
   0        ->R0        -       -           -
   1        -       R0->T0      -           -
   2        -       -       T0->F0          -
   3        -       -       -           F0

   Another example, if the CPU spent more than 16msec generating data in the
   SH4-to-RAM phase, it might look like this at 30fps:

   VBlanks  SH4-to-RAM  DMA-to-TA   ISP/TSP         View
   0        ->R0        -       -           -
   1        -       R0->T0      -           -
   2        ->R1        -       T0->F0          -
   3        -       R1->T1      -           F0
   4        ->R0        -       T1->F1          F0
   5        -       R0->T0      -           F1
   6        ->R1        -       T0->F0          F1
   ...

   Note that in the case where the potentially bigger frames cause the DMA-to-TA
   or ISP/TSP phases to take longer than one frame, they are allowed to expand
   into the next slot gracefully.

 */

/* Note that these must match the list types in pvr.h; these are here
   mainly because they're easier to type =)
   NOTE(review): the two-letter suffixes presumably abbreviate opaque polygon,
   opaque modifier, translucent polygon, translucent modifier and
   punch-through -- confirm against pvr.h. */
#define PVR_OPB_OP      0   /* Array indices for these structures */
#define PVR_OPB_OM      1
#define PVR_OPB_TP      2
#define PVR_OPB_TM      3
#define PVR_OPB_PT      4
#define PVR_OPB_COUNT   5   /* Number of list types; sizes the per-list arrays below */

// TA buffers structure: we have two sets of these
// (held in pvr_state_t as ta_buffers[2]).
// NOTE(review): the uint32 location fields are presumably offsets/addresses
// within PVR RAM -- confirm against pvr_buffers.c.
typedef struct {
    uint32  vertex, vertex_size;            /* Vertex buffer */
    uint32  opb, opb_size;                  /* Object pointer buffers, size */
    uint32  opb_addresses[PVR_OPB_COUNT];        /* Object pointer buffers (of each type) */
    uint32  tile_matrix, tile_matrix_size;  /* Tile matrix, size */
    uint32  opb_overflow_count;             /* Extra OPB space after opb_size for TA overflow */
} pvr_ta_buffers_t;

// DMA buffers structure: we have two sets of these
// (held in pvr_state_t as dma_buffers[2]). Each array has one slot per list
// type and is indexed by the PVR_OPB_* constants.
typedef struct {
    uint8   * base[PVR_OPB_COUNT];  // DMA buffers, if assigned
    uint32  ptr[PVR_OPB_COUNT];     // DMA buffer write pointer, if used
    uint32  size[PVR_OPB_COUNT];    // DMA buffer sizes, or zero if none
    int ready;                      // >0 if these buffers are ready to be DMAed
} pvr_dma_buffers_t;

// Frame buffers structure: we have two sets of these
// (held in pvr_state_t as frame_buffers[2]).
typedef struct {
    uint32  frame, frame_size;      // Output frame buffer, size
} pvr_frame_buffers_t;

/* PVR status structure; not only will this hold status information,
   but it will also serve as the wait object for the frame-complete
   genwaits. A single instance, pvr_state, is declared below. */
typedef struct {
    // If this is zero, then this state isn't valid
    int     valid;

    // General configuration
    uint32  lists_enabled;              // opb_completed's value when we're ready to render
    uint32  list_reg_mask;              // Active lists register mask
    int     dma_mode;                   // 1 if we are using DMA to transfer vertices
    int     opb_size[PVR_OPB_COUNT];    // opb size flags

    // Pipeline state
    int     ram_target;                 // RAM buffer we're writing into
                                        // (^1 == RAM buffer we're DMAing from)
    int     ta_target;                  // TA buffer we're writing (or DMAing) into
                                        // (^1 == TA buffer we're rendering from)
    int     view_target;                // Frame buffer we're viewing
                                        // (^1 == frame buffer we're rendering to)

    int     list_reg_open;              // Which list is open for registration, if any? (non-DMA only)
    uint32  lists_closed;               // (1 << idx) for each list which the SH4 has lost interest in
    uint32  lists_transferred;          // (1 << idx) for each list which has completely transferred to the TA
    uint32  lists_dmaed;                // (1 << idx) for each list which has been DMA'd (DMA mode only)

   //  mutex_t dma_lock;                   // Locked if a DMA is in progress (vertex or texture)
    int     ta_busy;                    // >0 if a DMA is in progress and the TA hasn't signaled completion
    int     render_busy;                // >0 if a render is in progress
    int     render_completed;           // >1 if a render has recently finished

    // Memory pointers / buffers
    // (each two-element array is indexed by the matching *_target field above, or that value ^1)
    pvr_dma_buffers_t   dma_buffers[2];     // DMA buffers (if any)
    pvr_ta_buffers_t    ta_buffers[2];      // TA buffers
    pvr_frame_buffers_t frame_buffers[2];   // Frame buffers
    uint32              texture_base;       // Start of texture RAM

    // Screen size / clipping constants
    int     w, h;                       // Screen width, height
    int     tw, th;                     // Screen tile width, height
    uint32  tsize_const;                // Screen tile size constant
    float   zclip;                      // Z clip plane
    uint32  pclip_left, pclip_right;    // X pixel clip constants
    uint32  pclip_top, pclip_bottom;    // Y pixel clip constants
    uint32  pclip_x, pclip_y;           // Composited clip constants
    uint32  bg_color;                   // Background color in ARGB format

    /* Running statistics on the PVR system. All vars are in terms
       of nanoseconds. */
    uint64_t frame_last_time;            // When did the last frame completion occur?
    uint64_t buf_start_time;             // When did the last DMA buffer fill begin?
    uint64_t reg_start_time;             // When did the last registration begin?
    uint64_t rnd_start_time;             // When did the last render begin?
    uint64_t frame_last_len;             // VBlank-to-VBlank length for the last frame (1.0/FrameRate)
    uint64_t buf_last_len;               // Cumulative buffer fill time for the last frame
    uint64_t reg_last_len;               // Registration time for the last frame
    uint64_t rnd_last_len;               // Render time for the last frame
    size_t   vbl_count;                  // VBlank counter for animations and such
    size_t   frame_count;                // Total number of viewed frames
    size_t   vtx_buf_used;               // Vertex buffer used size for the last frame
    size_t   vtx_buf_used_max;           // Maximum used vertex buffer size

    /* Wait-ready semaphore: this will be signaled whenever the pvr_wait_ready()
       call should be ready to return. */
    semaphore_t ready_sem;

    // Handle for the vblank interrupt
    int     vbl_handle;

    // Non-zero if FSAA was enabled at init time.
    int     fsaa;

    // Non-zero if we are rendering to a texture
    int     to_texture[2];

    // Render pitch for to-texture mode
    int     to_txr_rp[2];

    // Output address for to-texture mode
    uint32  to_txr_addr[2];

    // NOTE(review): undocumented in this header; presumably tracks
    // direct-rendering usage -- confirm against the code that sets it.
    uint32  dr_used;

    // Callback to call before the start of rendering, may be NULL
    pvr_before_render_hook_t isp_start_callback;
} pvr_state_t;

/* There will be exactly one of these in KOS (in pvr_globals.c).
   NOTE(review): declared volatile, presumably because it is also updated
   from interrupt context (see pvr_int_handler) -- confirm. */
extern volatile pvr_state_t pvr_state;

/* Background plane structure: two flag words and one dummy word, followed by
   three vertices, each an (x, y, z) position plus an ARGB color.
   NOTE(review): field order presumably mirrors what the hardware reads, so
   do not reorder or pad -- confirm. */
typedef struct pvr_bkg_poly {
    uint32      flags1, flags2;
    uint32      dummy;
    float       x1, y1, z1;     /* Vertex 1 position */
    uint32      argb1;          /* Vertex 1 color */
    float       x2, y2, z2;     /* Vertex 2 position */
    uint32      argb2;          /* Vertex 2 color */
    float       x3, y3, z3;     /* Vertex 3 position */
    uint32      argb3;          /* Vertex 3 color */
} pvr_bkg_poly_t;

// Debug macro, for debugging IRQ wackiness. Brackets a printf call with
// irq_disable()/irq_restore() so the print runs between the two.
//
// `x` must be a complete, parenthesized printf argument list, i.e. the macro
// is invoked with double parentheses:
//     DBG(("ta_busy=%d\n", busy));
//
// The saved-state local has a deliberately unusual name so that it cannot
// shadow a caller variable that appears inside `x`.
#define DBG(x) do { \
        int dbg_saved_irq_state_ = irq_disable(); \
        printf x; \
        irq_restore(dbg_saved_irq_state_); \
    } while(0)

/**** pvr_buffers.c ***************************************************/

/* Initialize buffers for TA/ISP/TSP usage.
   params: caller-supplied init parameters (pvr_init_params_t is declared
   outside this header). */
void pvr_allocate_buffers(pvr_init_params_t *params);

/* Fill the tile matrices (after it's initialized).
   NOTE(review): the meaning of `presort` is not visible from this header;
   presumably a boolean selecting a presorted mode -- confirm in
   pvr_buffers.c. */
void pvr_init_tile_matrices(int presort);


/**** pvr_misc.c ******************************************************/

/* What event is happening (for pvr_sync_stats)? These are the values passed
   as its `event` argument. */
#define PVR_SYNC_VBLANK     1   /* VBlank IRQ */
#define PVR_SYNC_BUFSTART   2   /* DMA buffer fill started */
#define PVR_SYNC_BUFDONE    3   /* DMA buffer fill complete */
#define PVR_SYNC_REGSTART   4   /* Registration started */
#define PVR_SYNC_REGDONE    5   /* Registration complete */
#define PVR_SYNC_RNDSTART   6   /* Render started */
#define PVR_SYNC_RNDDONE    7   /* Render complete IRQ */
#define PVR_SYNC_PAGEFLIP   8   /* View page was flipped */

/* Update statistical counters.
   event: one of the PVR_SYNC_* codes above. */
void pvr_sync_stats(int event);

/* Synchronize the viewed page with what's in pvr_state */
void pvr_sync_view(void);

/* Synchronize the registration buffer with what's in pvr_state */
void pvr_sync_reg_buffer(void);

/* Begin a render operation that has been queued completely */
void pvr_begin_queued_render(void);

/* Generate synthetic polygon headers for the given list type (to submit
   blank lists that the user forgot).
   NOTE(review): `type` is presumably one of the list types from pvr.h
   (matching the PVR_OPB_* indices) -- confirm. */
void pvr_blank_polyhdr(int type);

/* Same as pvr_blank_polyhdr(), but generates the header into `buf` instead
   of submitting it. */
void pvr_blank_polyhdr_buf(int type, pvr_poly_hdr_t * buf);


/**** pvr_irq.c *******************************************************/

/* Interrupt handler for PVR events.
   NOTE(review): `code` presumably identifies the event that fired and `data`
   is presumably an opaque pointer supplied at registration -- confirm in
   pvr_irq.c. */
void pvr_int_handler(uint32 code, void *data);


#endif