// Copyright (c) 2012- PPSSPP Project. // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 2.0 or later versions. // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License 2.0 for more details. // A copy of the GPL 2.0 should have been included with the program. // If not, see http://www.gnu.org/licenses/ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. // Ideas for speeding things up on mobile OpenGL ES implementations // // Use superbuffers! Yes I just invented that name. // // The idea is to avoid respecifying the vertex format between every draw call (multiple glVertexAttribPointer ...) // by combining the contents of multiple draw calls into one buffer, as long as // they have exactly the same output vertex format. (different input formats is fine! This way // we can combine the data for multiple draws with different numbers of bones, as we consider numbones < 4 to be = 4) // into one VBO. // // This will likely be a win because I believe that between every change of VBO + glVertexAttribPointer*N, the driver will // perform a lot of validation, probably at draw call time, while all the validation can be skipped if the only thing // that changes between two draw calls is simple state or texture or a matrix etc, not anything vertex related. // Also the driver will have to manage hundreds instead of thousands of VBOs in games like GTA. // // * Every 10 frames or something, do the following: // - Frame 1: // + Mark all drawn buffers with in-frame sequence numbers (alternatively, // just log them in an array) // - Frame 2 (beginning?): // + Take adjacent buffers that have the same output vertex format, and add them // to a list of buffers to combine. Create said buffers with appropriate sizes // and precompute the offsets that the draws should be written into. // - Frame 2 (end): // + Actually do the work of combining the buffers. This probably means re-decoding // the vertices into a new one. Will also have to apply index offsets. // // Also need to change the drawing code so that we don't glBindBuffer and respecify glVAP if // two subsequent drawcalls come from the same superbuffer. // // Or we ignore all of this including vertex caching and simply find a way to do highly optimized vertex streaming, // like Dolphin is trying to. That will likely never be able to reach the same speed as perfectly optimized // superbuffers though. For this we will have to JIT the vertex decoder but that's not too hard. // // Now, when do we delete superbuffers? Maybe when half the buffers within have been killed? // // Another idea for GTA which switches textures a lot while not changing much other state is to use ES 3 Array // textures, if they are the same size (even if they aren't, might be okay to simply resize the textures to match // if they're just a multiple of 2 away) or something. Then we'd have to add a W texture coordinate to choose the // texture within the bound texture array to the vertex data when merging into superbuffers. // // There are even more things to try. For games that do matrix palette skinning by quickly switching bones and // just drawing a few triangles per call (NBA, FF:CC, Tekken 6 etc) we could even collect matrices, upload them // all at once, writing matrix indices into the vertices in addition to the weights, and then doing a single // draw call with specially generated shader to draw the whole mesh. This code will be seriously complex though. #include "base/logging.h" #include "base/timeutil.h" #include "Common/MemoryUtil.h" #include "Core/MemMap.h" #include "Core/Host.h" #include "Core/System.h" #include "Core/Reporting.h" #include "Core/Config.h" #include "Core/CoreTiming.h" #include "helper/dx_state.h" #include "GPU/Math3D.h" #include "GPU/GPUState.h" #include "GPU/ge_constants.h" #include "GPU/Common/TextureDecoder.h" #include "GPU/Common/SplineCommon.h" #include "GPU/Common/TransformCommon.h" #include "GPU/Directx9/StateMappingDX9.h" #include "GPU/Directx9/TextureCacheDX9.h" #include "GPU/Directx9/TransformPipelineDX9.h" #include "GPU/Directx9/VertexDecoderDX9.h" #include "GPU/Directx9/ShaderManagerDX9.h" #include "GPU/Directx9/GPU_DX9.h" namespace DX9 { const D3DPRIMITIVETYPE glprim[8] = { D3DPT_POINTLIST, D3DPT_LINELIST, D3DPT_LINESTRIP, D3DPT_TRIANGLELIST, D3DPT_TRIANGLESTRIP, D3DPT_TRIANGLEFAN, D3DPT_TRIANGLELIST, // With OpenGL ES we have to expand sprites into triangles, tripling the data instead of doubling. sigh. OpenGL ES, Y U NO SUPPORT GL_QUADS? }; // hrydgard's quick guesses - TODO verify static const int D3DPRIMITIVEVERTEXCOUNT[8][2] = { {0, 0}, // invalid {1, 0}, // 1 = D3DPT_POINTLIST, {2, 0}, // 2 = D3DPT_LINELIST, {2, 1}, // 3 = D3DPT_LINESTRIP, {3, 0}, // 4 = D3DPT_TRIANGLELIST, {1, 2}, // 5 = D3DPT_TRIANGLESTRIP, {1, 2}, // 6 = D3DPT_TRIANGLEFAN, }; int D3DPrimCount(D3DPRIMITIVETYPE prim, int size) { return (size / D3DPRIMITIVEVERTEXCOUNT[prim][0]) - D3DPRIMITIVEVERTEXCOUNT[prim][1]; } enum { VERTEX_BUFFER_MAX = 65536, DECODED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * 48, DECODED_INDEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * 20, TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex) }; #define QUAD_INDICES_MAX 32768 #define VERTEXCACHE_DECIMATION_INTERVAL 17 // Check for max first as clamping to max is more common than min when lighting. inline float clamp(float in, float min, float max) { return in > max ? max : (in < min ? min : in); } TransformDrawEngineDX9::TransformDrawEngineDX9() : collectedVerts(0), prevPrim_(GE_PRIM_INVALID), dec_(0), lastVType_(-1), shaderManager_(0), textureCache_(0), framebufferManager_(0), numDrawCalls(0), vertexCountInDrawCalls(0), uvScale(0) { decimationCounter_ = VERTEXCACHE_DECIMATION_INTERVAL; // Allocate nicely aligned memory. Maybe graphics drivers will // appreciate it. // All this is a LOT of memory, need to see if we can cut down somehow. decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE); decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE); transformed = (TransformedVertex *)AllocateMemoryPages(TRANSFORMED_VERTEX_BUFFER_SIZE); transformedExpanded = (TransformedVertex *)AllocateMemoryPages(3 * TRANSFORMED_VERTEX_BUFFER_SIZE); quadIndices_ = new u16[6 * QUAD_INDICES_MAX]; for (int i = 0; i < QUAD_INDICES_MAX; i++) { quadIndices_[i * 6 + 0] = i * 4; quadIndices_[i * 6 + 1] = i * 4 + 2; quadIndices_[i * 6 + 2] = i * 4 + 1; quadIndices_[i * 6 + 3] = i * 4 + 1; quadIndices_[i * 6 + 4] = i * 4 + 2; quadIndices_[i * 6 + 5] = i * 4 + 3; } if (g_Config.bPrescaleUV) { uvScale = new UVScale[MAX_DEFERRED_DRAW_CALLS]; } indexGen.Setup(decIndex); InitDeviceObjects(); } TransformDrawEngineDX9::~TransformDrawEngineDX9() { DestroyDeviceObjects(); FreeMemoryPages(decoded, DECODED_VERTEX_BUFFER_SIZE); FreeMemoryPages(decIndex, DECODED_INDEX_BUFFER_SIZE); FreeMemoryPages(transformed, TRANSFORMED_VERTEX_BUFFER_SIZE); FreeMemoryPages(transformedExpanded, 3 * TRANSFORMED_VERTEX_BUFFER_SIZE); for (auto decl = vertexDeclMap_.begin(); decl != vertexDeclMap_.end(); ++decl) { if (decl->second) { decl->second->Release(); } } delete [] quadIndices_; for (auto iter = decoderMap_.begin(); iter != decoderMap_.end(); iter++) { delete iter->second; } delete [] uvScale; } void TransformDrawEngineDX9::InitDeviceObjects() { } void TransformDrawEngineDX9::DestroyDeviceObjects() { ClearTrackedVertexArrays(); } struct DeclTypeInfo { u32 type; const char * name; }; static const DeclTypeInfo VComp[] = { {0, "NULL"}, // DEC_NONE, {D3DDECLTYPE_FLOAT1 ,"D3DDECLTYPE_FLOAT1 "}, // DEC_FLOAT_1, {D3DDECLTYPE_FLOAT2 ,"D3DDECLTYPE_FLOAT2 "}, // DEC_FLOAT_2, {D3DDECLTYPE_FLOAT3 ,"D3DDECLTYPE_FLOAT3 "}, // DEC_FLOAT_3, {D3DDECLTYPE_FLOAT4 ,"D3DDECLTYPE_FLOAT4 "}, // DEC_FLOAT_4, // Not supported in regular DX9 so faking, will cause graphics bugs until worked around {D3DDECLTYPE_UBYTE4 ,"D3DDECLTYPE_BYTE4N "}, // DEC_S8_3, {D3DDECLTYPE_SHORT4N ,"D3DDECLTYPE_SHORT4N "}, // DEC_S16_3, {D3DDECLTYPE_UBYTE4N ,"D3DDECLTYPE_UBYTE4N "}, // DEC_U8_1, {D3DDECLTYPE_UBYTE4N ,"D3DDECLTYPE_UBYTE4N "}, // DEC_U8_2, {D3DDECLTYPE_UBYTE4N ,"D3DDECLTYPE_UBYTE4N "}, // DEC_U8_3, {D3DDECLTYPE_UBYTE4N ,"D3DDECLTYPE_UBYTE4N "}, // DEC_U8_4, {D3DDECLTYPE_USHORT2N, "D3DDECLTYPE_USHORT2N " }, // DEC_U16_1, {D3DDECLTYPE_USHORT2N, "D3DDECLTYPE_USHORT2N " }, // DEC_U16_2, {D3DDECLTYPE_USHORT4N ,"D3DDECLTYPE_USHORT4N "}, // DEC_U16_3, {D3DDECLTYPE_USHORT4N ,"D3DDECLTYPE_USHORT4N "}, // DEC_U16_4, // Not supported in regular DX9 so faking, will cause graphics bugs until worked around {D3DDECLTYPE_UBYTE4 ,"D3DDECLTYPE_BYTE4 "}, // DEC_U8A_2, {D3DDECLTYPE_USHORT2N, "D3DDECLTYPE_USHORT4 " }, // DEC_U16A_2, }; static void VertexAttribSetup(D3DVERTEXELEMENT9 * VertexElement, u8 fmt, u8 offset, u8 usage, u8 usage_index = 0) { memset(VertexElement, 0, sizeof(D3DVERTEXELEMENT9)); VertexElement->Offset = offset; VertexElement->Type = VComp[fmt].type; VertexElement->Usage = usage; VertexElement->UsageIndex = usage_index; } // TODO: Use VBO and get rid of the vertexData pointers - with that, we will supply only offsets static void LogDecFmtForDraw(const DecVtxFormat &decFmt) { // Vertices Elements orders // WEIGHT if (decFmt.w0fmt != 0) { printf("decFmt.w0fmt -> %s (%d)\n", VComp[decFmt.w0fmt].name, decFmt.w0off); } if (decFmt.w1fmt != 0) { printf("decFmt.w1fmt -> %s (%d)\n", VComp[decFmt.w1fmt].name, decFmt.w1off); } // TC if (decFmt.uvfmt != 0) { printf("decFmt.uvfmt -> %s (%d)\n", VComp[decFmt.uvfmt].name, decFmt.uvoff); } // COLOR if (decFmt.c0fmt != 0) { printf("decFmt.c0fmt -> %s (%d)\n", VComp[decFmt.c0fmt].name, decFmt.c0off); } // NORMAL if (decFmt.nrmfmt != 0) { printf("decFmt.nrmfmt -> %s (%d)\n", VComp[decFmt.nrmfmt].name, decFmt.nrmoff); } // POSITION // Always printf("decFmt.posfmt -> %s (%d)\n", VComp[decFmt.posfmt].name, decFmt.posoff); printf("decFmt.stride => %d\n", decFmt.stride); //pD3Ddevice->SetRenderState(D3DRS_FILLMODE, D3DFILL_WIREFRAME); } IDirect3DVertexDeclaration9 *TransformDrawEngineDX9::SetupDecFmtForDraw(VSShader *vshader, const DecVtxFormat &decFmt, u32 pspFmt) { auto vertexDeclCached = vertexDeclMap_.find(pspFmt); if (vertexDeclCached == vertexDeclMap_.end()) { D3DVERTEXELEMENT9 VertexElements[8]; D3DVERTEXELEMENT9 *VertexElement = &VertexElements[0]; // Vertices Elements orders // WEIGHT if (decFmt.w0fmt != 0) { VertexAttribSetup(VertexElement, decFmt.w0fmt, decFmt.w0off, D3DDECLUSAGE_BLENDWEIGHT, 0); VertexElement++; } if (decFmt.w1fmt != 0) { VertexAttribSetup(VertexElement, decFmt.w1fmt, decFmt.w1off, D3DDECLUSAGE_BLENDWEIGHT, 1); VertexElement++; } // TC if (decFmt.uvfmt != 0) { VertexAttribSetup(VertexElement, decFmt.uvfmt, decFmt.uvoff, D3DDECLUSAGE_TEXCOORD, 0); VertexElement++; } // COLOR if (decFmt.c0fmt != 0) { VertexAttribSetup(VertexElement, decFmt.c0fmt, decFmt.c0off, D3DDECLUSAGE_COLOR, 0); VertexElement++; } // Never used ? if (decFmt.c1fmt != 0) { VertexAttribSetup(VertexElement, decFmt.c1fmt, decFmt.c1off, D3DDECLUSAGE_COLOR, 1); VertexElement++; } // NORMAL if (decFmt.nrmfmt != 0) { VertexAttribSetup(VertexElement, decFmt.nrmfmt, decFmt.nrmoff, D3DDECLUSAGE_NORMAL, 0); VertexElement++; } // POSITION // Always VertexAttribSetup(VertexElement, decFmt.posfmt, decFmt.posoff, D3DDECLUSAGE_POSITION, 0); VertexElement++; // End D3DVERTEXELEMENT9 end = D3DDECL_END(); memcpy(VertexElement, &end, sizeof(D3DVERTEXELEMENT9)); // Create declaration IDirect3DVertexDeclaration9 *pHardwareVertexDecl = nullptr; HRESULT hr = pD3Ddevice->CreateVertexDeclaration( VertexElements, &pHardwareVertexDecl ); if (FAILED(hr)) { // Log LogDecFmtForDraw(decFmt); // DebugBreak(); } // Add it to map vertexDeclMap_[pspFmt] = pHardwareVertexDecl; return pHardwareVertexDecl; } else { // Set it from map return vertexDeclCached->second; } } // The verts are in the order: BR BL TL TR static void SwapUVs(TransformedVertex &a, TransformedVertex &b) { float tempu = a.u; float tempv = a.v; a.u = b.u; a.v = b.v; b.u = tempu; b.v = tempv; } // 2 3 3 2 0 3 2 1 // to to or // 1 0 0 1 1 2 3 0 // See comment below where this was called before. /* static void RotateUV(TransformedVertex v[4]) { float x1 = v[2].x; float x2 = v[0].x; float y1 = v[2].y; float y2 = v[0].y; if ((x1 < x2 && y1 < y2) || (x1 > x2 && y1 > y2)) SwapUVs(v[1], v[3]); }*/ static void RotateUVThrough(TransformedVertex v[4]) { float x1 = v[2].x; float x2 = v[0].x; float y1 = v[2].y; float y2 = v[0].y; if ((x1 < x2 && y1 > y2) || (x1 > x2 && y1 < y2)) SwapUVs(v[1], v[3]); } // Clears on the PSP are best done by drawing a series of vertical strips // in clear mode. This tries to detect that. bool TransformDrawEngineDX9::IsReallyAClear(int numVerts) const { if (transformed[0].x != 0.0f || transformed[0].y != 0.0f) return false; u32 matchcolor; memcpy(&matchcolor, transformed[0].color0, 4); float matchz = transformed[0].z; int bufW = gstate_c.curRTWidth; int bufH = gstate_c.curRTHeight; float prevX = 0.0f; for (int i = 1; i < numVerts; i++) { u32 vcolor; memcpy(&vcolor, transformed[i].color0, 4); if (vcolor != matchcolor || transformed[i].z != matchz) return false; if ((i & 1) == 0) { // Top left of a rectangle if (transformed[i].y != 0) return false; if (i > 0 && transformed[i].x != transformed[i - 1].x) return false; } else { // Bottom right if (transformed[i].y != bufH) return false; if (transformed[i].x <= transformed[i - 1].x) return false; } } // The last vertical strip often extends outside the drawing area. if (transformed[numVerts - 1].x < bufW) return false; return true; } // This is the software transform pipeline, which is necessary for supporting RECT // primitives correctly, and may be easier to use for debugging than the hardware // transform pipeline. // There's code here that simply expands transformed RECTANGLES into plain triangles. // We're gonna have to keep software transforming RECTANGLES, unless we use a geom shader which we can't on OpenGL ES 2.0. // Usually, though, these primitives don't use lighting etc so it's no biggie performance wise, but it would be nice to get rid of // this code. // Actually, if we find the camera-relative right and down vectors, it might even be possible to add the extra points in pre-transformed // space and thus make decent use of hardware transform. // Actually again, single quads could be drawn more efficiently using GL_TRIANGLE_STRIP, no need to duplicate verts as for // GL_TRIANGLES. Still need to sw transform to compute the extra two corners though. void TransformDrawEngineDX9::SoftwareTransformAndDraw( int prim, u8 *decoded, int vertexCount, u32 vertType, void *inds, int indexType, const DecVtxFormat &decVtxFormat, int maxIndex) { bool throughmode = (vertType & GE_VTYPE_THROUGH_MASK) != 0; bool lmode = gstate.isUsingSecondaryColor() && gstate.isLightingEnabled(); // TODO: Split up into multiple draw calls for GLES 2.0 where you can't guarantee support for more than 0x10000 verts. float uscale = 1.0f; float vscale = 1.0f; if (throughmode) { uscale /= gstate_c.curTextureWidth; vscale /= gstate_c.curTextureHeight; } int w = gstate.getTextureWidth(0); int h = gstate.getTextureHeight(0); float widthFactor = (float) w / (float) gstate_c.curTextureWidth; float heightFactor = (float) h / (float) gstate_c.curTextureHeight; Lighter lighter(vertType); float fog_end = getFloat24(gstate.fog1); float fog_slope = getFloat24(gstate.fog2); VertexReader reader(decoded, decVtxFormat, vertType); for (int index = 0; index < maxIndex; index++) { reader.Goto(index); float v[3] = {0, 0, 0}; float c0[4] = {1, 1, 1, 1}; float c1[4] = {0, 0, 0, 0}; float uv[3] = {0, 0, 1}; float fogCoef = 1.0f; if (throughmode) { // Do not touch the coordinates or the colors. No lighting. reader.ReadPos(v); if (reader.hasColor0()) { reader.ReadColor0(c0); for (int j = 0; j < 4; j++) { c1[j] = 0.0f; } } else { c0[0] = gstate.getMaterialAmbientR() / 255.f; c0[1] = gstate.getMaterialAmbientG() / 255.f; c0[2] = gstate.getMaterialAmbientB() / 255.f; c0[3] = gstate.getMaterialAmbientA() / 255.f; } if (reader.hasUV()) { reader.ReadUV(uv); uv[0] *= uscale; uv[1] *= vscale; } fogCoef = 1.0f; // Scale UV? } else { // We do software T&L for now float out[3]; float pos[3]; Vec3f normal(0, 0, 1); Vec3f worldnormal(0, 0, 1); reader.ReadPos(pos); if (!vertTypeIsSkinningEnabled(vertType)) { Vec3ByMatrix43(out, pos, gstate.worldMatrix); if (reader.hasNormal()) { reader.ReadNrm(normal.AsArray()); if (gstate.areNormalsReversed()) { normal = -normal; } Norm3ByMatrix43(worldnormal.AsArray(), normal.AsArray(), gstate.worldMatrix); worldnormal = worldnormal.Normalized(); } } else { float weights[8]; reader.ReadWeights(weights); if (reader.hasNormal()) reader.ReadNrm(normal.AsArray()); // Skinning Vec3f psum(0,0,0); Vec3f nsum(0,0,0); for (int i = 0; i < vertTypeGetNumBoneWeights(vertType); i++) { if (weights[i] != 0.0f) { Vec3ByMatrix43(out, pos, gstate.boneMatrix+i*12); Vec3f tpos(out); psum += tpos * weights[i]; if (reader.hasNormal()) { Vec3f norm; Norm3ByMatrix43(norm.AsArray(), normal.AsArray(), gstate.boneMatrix+i*12); nsum += norm * weights[i]; } } } // Yes, we really must multiply by the world matrix too. Vec3ByMatrix43(out, psum.AsArray(), gstate.worldMatrix); if (reader.hasNormal()) { normal = nsum; if (gstate.areNormalsReversed()) { normal = -normal; } Norm3ByMatrix43(worldnormal.AsArray(), normal.AsArray(), gstate.worldMatrix); worldnormal = worldnormal.Normalized(); } } // Perform lighting here if enabled. don't need to check through, it's checked above. float unlitColor[4] = {1, 1, 1, 1}; if (reader.hasColor0()) { reader.ReadColor0(unlitColor); } else { unlitColor[0] = gstate.getMaterialAmbientR() / 255.f; unlitColor[1] = gstate.getMaterialAmbientG() / 255.f; unlitColor[2] = gstate.getMaterialAmbientB() / 255.f; unlitColor[3] = gstate.getMaterialAmbientA() / 255.f; } if (gstate.isLightingEnabled()) { float litColor0[4]; float litColor1[4]; lighter.Light(litColor0, litColor1, unlitColor, out, worldnormal); // Don't ignore gstate.lmode - we should send two colors in that case for (int j = 0; j < 4; j++) { c0[j] = litColor0[j]; } if (lmode) { // Separate colors for (int j = 0; j < 4; j++) { c1[j] = litColor1[j]; } } else { // Summed color into c0 for (int j = 0; j < 4; j++) { c0[j] = ((c0[j] + litColor1[j]) > 1.0f) ? 1.0f : (c0[j] + litColor1[j]); } } } else { if (reader.hasColor0()) { for (int j = 0; j < 4; j++) { c0[j] = unlitColor[j]; } } else { c0[0] = gstate.getMaterialAmbientR() / 255.f; c0[1] = gstate.getMaterialAmbientG() / 255.f; c0[2] = gstate.getMaterialAmbientB() / 255.f; c0[3] = gstate.getMaterialAmbientA() / 255.f; } if (lmode) { for (int j = 0; j < 4; j++) { c1[j] = 0.0f; } } } float ruv[2] = {0.0f, 0.0f}; if (reader.hasUV()) reader.ReadUV(ruv); // Perform texture coordinate generation after the transform and lighting - one style of UV depends on lights. switch (gstate.getUVGenMode()) { case GE_TEXMAP_TEXTURE_COORDS: // UV mapping case GE_TEXMAP_UNKNOWN: // Seen in Riviera. Unsure of meaning, but this works. // Texture scale/offset is only performed in this mode. uv[0] = uscale * (ruv[0]*gstate_c.uv.uScale + gstate_c.uv.uOff); uv[1] = vscale * (ruv[1]*gstate_c.uv.vScale + gstate_c.uv.vOff); uv[2] = 1.0f; break; case GE_TEXMAP_TEXTURE_MATRIX: { // Projection mapping Vec3f source; switch (gstate.getUVProjMode()) { case GE_PROJMAP_POSITION: // Use model space XYZ as source source = pos; break; case GE_PROJMAP_UV: // Use unscaled UV as source source = Vec3f(ruv[0], ruv[1], 0.0f); break; case GE_PROJMAP_NORMALIZED_NORMAL: // Use normalized normal as source source = normal.Normalized(); if (!reader.hasNormal()) { ERROR_LOG_REPORT(G3D, "Normal projection mapping without normal?"); } break; case GE_PROJMAP_NORMAL: // Use non-normalized normal as source! source = normal; if (!reader.hasNormal()) { ERROR_LOG_REPORT(G3D, "Normal projection mapping without normal?"); } break; } float uvw[3]; Vec3ByMatrix43(uvw, &source.x, gstate.tgenMatrix); uv[0] = uvw[0]; uv[1] = uvw[1]; uv[2] = uvw[2]; } break; case GE_TEXMAP_ENVIRONMENT_MAP: // Shade mapping - use two light sources to generate U and V. { Vec3f lightpos0 = Vec3f(&lighter.lpos[gstate.getUVLS0() * 3]).Normalized(); Vec3f lightpos1 = Vec3f(&lighter.lpos[gstate.getUVLS1() * 3]).Normalized(); uv[0] = (1.0f + Dot(lightpos0, worldnormal))/2.0f; uv[1] = (1.0f - Dot(lightpos1, worldnormal))/2.0f; uv[2] = 1.0f; } break; default: // Illegal ERROR_LOG_REPORT(G3D, "Impossible UV gen mode? %d", gstate.getUVGenMode()); break; } uv[0] = uv[0] * widthFactor; uv[1] = uv[1] * heightFactor; // Transform the coord by the view matrix. Vec3ByMatrix43(v, out, gstate.viewMatrix); fogCoef = (v[2] + fog_end) * fog_slope; } // TODO: Write to a flexible buffer, we don't always need all four components. memcpy(&transformed[index].x, v, 3 * sizeof(float)); transformed[index].fog = fogCoef; memcpy(&transformed[index].u, uv, 3 * sizeof(float)); if (gstate_c.flipTexture) { transformed[index].v = 1.0f - transformed[index].v; } for (int i = 0; i < 4; i++) { transformed[index].color0[i] = c0[i] * 255.0f; } for (int i = 0; i < 3; i++) { transformed[index].color1[i] = c1[i] * 255.0f; } } // Step 2: expand rectangles. const TransformedVertex *drawBuffer = transformed; int numTrans = 0; bool drawIndexed = false; if (prim != GE_PRIM_RECTANGLES) { // We can simply draw the unexpanded buffer. numTrans = vertexCount; drawIndexed = true; } else { numTrans = 0; drawBuffer = transformedExpanded; TransformedVertex *trans = &transformedExpanded[0]; TransformedVertex saved; u32 stencilValue; for (int i = 0; i < vertexCount; i += 2) { int index = ((const u16*)inds)[i]; saved = transformed[index]; int index2 = ((const u16*)inds)[i + 1]; TransformedVertex &transVtx = transformed[index2]; if (i == 0) stencilValue = transVtx.color0[3]; // We have to turn the rectangle into two triangles, so 6 points. Sigh. // bottom right trans[0] = transVtx; // bottom left trans[1] = transVtx; trans[1].y = saved.y; trans[1].v = saved.v; // top left trans[2] = transVtx; trans[2].x = saved.x; trans[2].y = saved.y; trans[2].u = saved.u; trans[2].v = saved.v; // top right trans[3] = transVtx; trans[3].x = saved.x; trans[3].u = saved.u; // That's the four corners. Now process UV rotation. if (throughmode) RotateUVThrough(trans); // Apparently, non-through RotateUV just breaks things. // If we find a game where it helps, we'll just have to figure out how they differ. // Possibly, it has something to do with flipped viewport Y axis, which a few games use. // One game might be one of the Metal Gear ones, can't find the issue right now though. // else // RotateUV(trans); // bottom right trans[4] = trans[0]; // top left trans[5] = trans[2]; trans += 6; numTrans += 6; } // We don't know the color until here, so we have to do it now, instead of in StateMapping. // Might want to reconsider the order of things later... if (gstate.isModeClear() && gstate.isClearModeAlphaMask()) { dxstate.stencilFunc.set(D3DCMP_ALWAYS, stencilValue, 255); } } // TODO: Add a post-transform cache here for multi-RECTANGLES only. // Might help for text drawing. // these spam the gDebugger log. const int vertexSize = sizeof(transformed[0]); pD3Ddevice->SetVertexDeclaration( pSoftVertexDecl ); /// Debug !! //pD3Ddevice->SetRenderState(D3DRS_FILLMODE, D3DFILL_WIREFRAME); if (drawIndexed) { pD3Ddevice->DrawIndexedPrimitiveUP(glprim[prim], 0, vertexCount, D3DPrimCount(glprim[prim], numTrans), inds, D3DFMT_INDEX16, drawBuffer, sizeof(TransformedVertex)); } else { pD3Ddevice->DrawPrimitiveUP(glprim[prim], D3DPrimCount(glprim[prim], numTrans), drawBuffer, sizeof(TransformedVertex)); } } VertexDecoderDX9 *TransformDrawEngineDX9::GetVertexDecoder(u32 vtype) { auto iter = decoderMap_.find(vtype); if (iter != decoderMap_.end()) return iter->second; VertexDecoderDX9 *dec = new VertexDecoderDX9(); dec->SetVertexType(vtype); decoderMap_[vtype] = dec; return dec; } void TransformDrawEngineDX9::SetupVertexDecoder(u32 vertType) { // If vtype has changed, setup the vertex decoder. // TODO: Simply cache the setup decoders instead. if (vertType != lastVType_) { dec_ = GetVertexDecoder(vertType); lastVType_ = vertType; } } int TransformDrawEngineDX9::EstimatePerVertexCost() { // TODO: This is transform cost, also account for rasterization cost somehow... although it probably // runs in parallel with transform. // Also, this is all pure guesswork. If we can find a way to do measurements, that would be great. // GTA wants a low value to run smooth, GoW wants a high value (otherwise it thinks things // went too fast and starts doing all the work over again). int cost = 20; if (gstate.isLightingEnabled()) { cost += 10; } for (int i = 0; i < 4; i++) { if (gstate.isLightChanEnabled(i)) cost += 10; } if (gstate.getUVGenMode() != GE_TEXMAP_TEXTURE_COORDS) { cost += 20; } if (dec_ && dec_->morphcount > 1) { cost += 5 * dec_->morphcount; } return cost; } void TransformDrawEngineDX9::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int forceIndexType, int *bytesRead) { if (vertexCount == 0) return; // we ignore zero-sized draw calls. if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls + vertexCount > VERTEX_BUFFER_MAX) Flush(); // TODO: Is this the right thing to do? if (prim == GE_PRIM_KEEP_PREVIOUS) { prim = prevPrim_; } prevPrim_ = prim; SetupVertexDecoder(vertType); dec_->IncrementStat(STAT_VERTSSUBMITTED, vertexCount); if (bytesRead) *bytesRead = vertexCount * dec_->VertexSize(); gpuStats.numDrawCalls++; gpuStats.numVertsSubmitted += vertexCount; DeferredDrawCall &dc = drawCalls[numDrawCalls]; dc.verts = verts; dc.inds = inds; dc.vertType = vertType; dc.indexType = ((forceIndexType == -1) ? (vertType & GE_VTYPE_IDX_MASK) : forceIndexType) >> GE_VTYPE_IDX_SHIFT; dc.prim = prim; dc.vertexCount = vertexCount; if (inds) { GetIndexBounds(inds, vertexCount, vertType, &dc.indexLowerBound, &dc.indexUpperBound); } else { dc.indexLowerBound = 0; dc.indexUpperBound = vertexCount - 1; } if (uvScale) { uvScale[numDrawCalls] = gstate_c.uv; } numDrawCalls++; vertexCountInDrawCalls += vertexCount; } void TransformDrawEngineDX9::DecodeVerts() { UVScale origUV; if (uvScale) origUV = gstate_c.uv; for (int i = 0; i < numDrawCalls; i++) { const DeferredDrawCall &dc = drawCalls[i]; indexGen.SetIndex(collectedVerts); int indexLowerBound = dc.indexLowerBound, indexUpperBound = dc.indexUpperBound; u32 indexType = dc.indexType; void *inds = dc.inds; if (indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) { // Decode the verts and apply morphing. Simple. if (uvScale) gstate_c.uv = uvScale[i]; dec_->DecodeVerts(decoded + collectedVerts * (int)dec_->GetDecVtxFmt().stride, dc.verts, indexLowerBound, indexUpperBound); collectedVerts += indexUpperBound - indexLowerBound + 1; indexGen.AddPrim(dc.prim, dc.vertexCount); } else { // It's fairly common that games issue long sequences of PRIM calls, with differing // inds pointer but the same base vertex pointer. We'd like to reuse vertices between // these as much as possible, so we make sure here to combine as many as possible // into one nice big drawcall, sharing data. // 1. Look ahead to find the max index, only looking as "matching" drawcalls. // Expand the lower and upper bounds as we go. int j = i + 1; int lastMatch = i; while (j < numDrawCalls) { if (drawCalls[j].verts != dc.verts) break; if (uvScale && memcmp(&uvScale[j], &uvScale[i], sizeof(uvScale[0])) != 0) break; indexLowerBound = std::min(indexLowerBound, (int)drawCalls[j].indexLowerBound); indexUpperBound = std::max(indexUpperBound, (int)drawCalls[j].indexUpperBound); lastMatch = j; j++; } // 2. Loop through the drawcalls, translating indices as we go. for (j = i; j <= lastMatch; j++) { switch (indexType) { case GE_VTYPE_IDX_8BIT >> GE_VTYPE_IDX_SHIFT: indexGen.TranslatePrim(drawCalls[j].prim, drawCalls[j].vertexCount, (const u8 *)drawCalls[j].inds, indexLowerBound); break; case GE_VTYPE_IDX_16BIT >> GE_VTYPE_IDX_SHIFT: indexGen.TranslatePrim(drawCalls[j].prim, drawCalls[j].vertexCount, (const u16 *)drawCalls[j].inds, indexLowerBound); break; } } int vertexCount = indexUpperBound - indexLowerBound + 1; // 3. Decode that range of vertex data. if (uvScale) gstate_c.uv = uvScale[i]; dec_->DecodeVerts(decoded + collectedVerts * (int)dec_->GetDecVtxFmt().stride, dc.verts, indexLowerBound, indexUpperBound); collectedVerts += vertexCount; // 4. Advance indexgen vertex counter. indexGen.Advance(vertexCount); i = lastMatch; } } // Sanity check if (indexGen.Prim() < 0) { ERROR_LOG_REPORT(G3D, "DecodeVerts: Failed to deduce prim: %i", indexGen.Prim()); // Force to points (0) indexGen.AddPrim(GE_PRIM_POINTS, 0); } if (uvScale) gstate_c.uv = origUV; } u32 TransformDrawEngineDX9::ComputeHash() { u32 fullhash = 0; int vertexSize = dec_->GetDecVtxFmt().stride; // TODO: Add some caps both for numDrawCalls and num verts to check? // It is really very expensive to check all the vertex data so often. for (int i = 0; i < numDrawCalls; i++) { const DeferredDrawCall &dc = drawCalls[i]; if (!dc.inds) { fullhash += DoReliableHash((const char *)dc.verts, vertexSize * dc.vertexCount, 0x1DE8CAC4); } else { int indexLowerBound = dc.indexLowerBound, indexUpperBound = dc.indexUpperBound; int j = i + 1; int lastMatch = i; while (j < numDrawCalls) { if (drawCalls[j].verts != dc.verts) break; indexLowerBound = std::min(indexLowerBound, (int)dc.indexLowerBound); indexUpperBound = std::max(indexUpperBound, (int)dc.indexUpperBound); lastMatch = j; j++; } // This could get seriously expensive with sparse indices. Need to combine hashing ranges the same way // we do when drawing. fullhash += DoReliableHash((const char *)dc.verts + vertexSize * indexLowerBound, vertexSize * (indexUpperBound - indexLowerBound), 0x029F3EE1); int indexSize = (dec_->VertexType() & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_16BIT ? 2 : 1; // Hm, we will miss some indices when combining above, but meh, it should be fine. fullhash += DoReliableHash((const char *)dc.inds, indexSize * dc.vertexCount, 0x955FD1CA); i = lastMatch; } } if (uvScale) { fullhash += DoReliableHash(&uvScale[0], sizeof(uvScale[0]) * numDrawCalls, 0x0123e658); } return fullhash; } u32 TransformDrawEngineDX9::ComputeFastDCID() { u32 hash = 0; for (int i = 0; i < numDrawCalls; i++) { hash ^= (u32)(uintptr_t)drawCalls[i].verts; hash = __rotl(hash, 13); hash ^= (u32)(uintptr_t)drawCalls[i].inds; hash = __rotl(hash, 13); hash ^= (u32)drawCalls[i].vertType; hash = __rotl(hash, 13); hash ^= (u32)drawCalls[i].vertexCount; hash = __rotl(hash, 13); hash ^= (u32)drawCalls[i].prim; } return hash; } enum { VAI_KILL_AGE = 120 }; void TransformDrawEngineDX9::ClearTrackedVertexArrays() { for (auto vai = vai_.begin(); vai != vai_.end(); vai++) { delete vai->second; } vai_.clear(); } void TransformDrawEngineDX9::DecimateTrackedVertexArrays() { if (--decimationCounter_ <= 0) { decimationCounter_ = VERTEXCACHE_DECIMATION_INTERVAL; } else { return; } int threshold = gpuStats.numFlips - VAI_KILL_AGE; for (auto iter = vai_.begin(); iter != vai_.end(); ) { if (iter->second->lastFrame < threshold) { delete iter->second; vai_.erase(iter++); } else ++iter; } // Enable if you want to see vertex decoders in the log output. Need a better way. #if 0 char buffer[16384]; for (std::map::iterator dec = decoderMap_.begin(); dec != decoderMap_.end(); ++dec) { char *ptr = buffer; ptr += dec->second->ToString(ptr); // *ptr++ = '\n'; NOTICE_LOG(G3D, buffer); } #endif } VertexArrayInfoDX9::~VertexArrayInfoDX9() { if (vbo) { vbo->Release(); } if (ebo) { ebo->Release(); } } void TransformDrawEngineDX9::DoFlush() { gpuStats.numFlushes++; gpuStats.numTrackedVertexArrays = (int)vai_.size(); // This is not done on every drawcall, we should collect vertex data // until critical state changes. That's when we draw (flush). GEPrimitiveType prim = prevPrim_; ApplyDrawState(prim); VSShader *vshader = shaderManager_->ApplyShader(prim, lastVType_); if (vshader->UseHWTransform()) { LPDIRECT3DVERTEXBUFFER9 vb_ = NULL; LPDIRECT3DINDEXBUFFER9 ib_ = NULL; int vertexCount = 0; int maxIndex = 0; bool useElements = true; // Cannot cache vertex data with morph enabled. if (g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK)) { u32 id = ComputeFastDCID(); auto iter = vai_.find(id); VertexArrayInfoDX9 *vai; if (iter != vai_.end()) { // We've seen this before. Could have been a cached draw. vai = iter->second; } else { vai = new VertexArrayInfoDX9(); vai_[id] = vai; } switch (vai->status) { case VertexArrayInfoDX9::VAI_NEW: { // Haven't seen this one before. u32 dataHash = ComputeHash(); vai->hash = dataHash; vai->status = VertexArrayInfoDX9::VAI_HASHING; vai->drawsUntilNextFullHash = 0; DecodeVerts(); // writes to indexGen vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); vai->maxIndex = indexGen.MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAI_FLAG_VERTEXFULLALPHA : 0; goto rotateVBO; } // Hashing - still gaining confidence about the buffer. // But if we get this far it's likely to be worth creating a vertex buffer. case VertexArrayInfoDX9::VAI_HASHING: { vai->numDraws++; if (vai->lastFrame != gpuStats.numFlips) { vai->numFrames++; } if (vai->drawsUntilNextFullHash == 0) { u32 newHash = ComputeHash(); if (newHash != vai->hash) { vai->status = VertexArrayInfoDX9::VAI_UNRELIABLE; if (vai->vbo) { vai->vbo->Release(); vai->vbo = NULL; } if (vai->ebo) { vai->ebo->Release(); vai->ebo = NULL; } DecodeVerts(); goto rotateVBO; } if (vai->numVerts > 100) { // exponential backoff up to 16 draws, then every 24 vai->drawsUntilNextFullHash = std::min(24, vai->numFrames); } else { // Lower numbers seem much more likely to change. vai->drawsUntilNextFullHash = 0; } // TODO: tweak //if (vai->numFrames > 1000) { // vai->status = VertexArrayInfo::VAI_RELIABLE; //} } else { vai->drawsUntilNextFullHash--; // TODO: "mini-hashing" the first 32 bytes of the vertex/index data or something. } if (vai->vbo == 0) { DecodeVerts(); vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); vai->maxIndex = indexGen.MaxIndex(); useElements = !indexGen.SeenOnlyPurePrims(); if (!useElements && indexGen.PureCount()) { vai->numVerts = indexGen.PureCount(); } // Always if (1) { void * pVb; u32 size = dec_->GetDecVtxFmt().stride * indexGen.MaxIndex(); pD3Ddevice->CreateVertexBuffer(size, NULL, NULL, D3DPOOL_DEFAULT, &vai->vbo, NULL); vai->vbo->Lock(0, size, &pVb, D3DLOCK_NOOVERWRITE ); memcpy(pVb, decoded, size); vai->vbo->Unlock(); } // Ib if (useElements) { void * pIb; u32 size = sizeof(short) * indexGen.VertexCount(); pD3Ddevice->CreateIndexBuffer(size, NULL, D3DFMT_INDEX16, D3DPOOL_DEFAULT, &vai->ebo, NULL); vai->ebo->Lock(0, size, &pIb, D3DLOCK_NOOVERWRITE ); memcpy(pIb, decIndex, size); vai->ebo->Unlock(); } else { vai->ebo = 0; } } else { gpuStats.numCachedDrawCalls++; useElements = vai->ebo ? true : false; gpuStats.numCachedVertsDrawn += vai->numVerts; gstate_c.vertexFullAlpha = vai->flags & VAI_FLAG_VERTEXFULLALPHA; } vb_ = vai->vbo; ib_ = vai->ebo; vertexCount = vai->numVerts; maxIndex = vai->maxIndex; prim = static_cast(vai->prim); break; } // Reliable - we don't even bother hashing anymore. Right now we don't go here until after a very long time. case VertexArrayInfoDX9::VAI_RELIABLE: { vai->numDraws++; if (vai->lastFrame != gpuStats.numFlips) { vai->numFrames++; } gpuStats.numCachedDrawCalls++; gpuStats.numCachedVertsDrawn += vai->numVerts; vb_ = vai->vbo; ib_ = vai->ebo; vertexCount = vai->numVerts; maxIndex = vai->maxIndex; prim = static_cast(vai->prim); gstate_c.vertexFullAlpha = vai->flags & VAI_FLAG_VERTEXFULLALPHA; break; } case VertexArrayInfoDX9::VAI_UNRELIABLE: { vai->numDraws++; if (vai->lastFrame != gpuStats.numFlips) { vai->numFrames++; } DecodeVerts(); goto rotateVBO; } } vai->lastFrame = gpuStats.numFlips; } else { DecodeVerts(); rotateVBO: gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); useElements = !indexGen.SeenOnlyPurePrims(); vertexCount = indexGen.VertexCount(); maxIndex = indexGen.MaxIndex(); if (!useElements && indexGen.PureCount()) { vertexCount = indexGen.PureCount(); } prim = indexGen.Prim(); } DEBUG_LOG(G3D, "Flush prim %i! %i verts in one go", prim, vertexCount); bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; if (gstate.isModeThrough()) { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); } else { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && ((hasColor && (gstate.materialupdate & 1)) || gstate.getMaterialAmbientA() == 255) && (!gstate.isLightingEnabled() || gstate.getAmbientA() == 255); } IDirect3DVertexDeclaration9 *pHardwareVertexDecl = SetupDecFmtForDraw(vshader, dec_->GetDecVtxFmt(), dec_->VertexType()); if (pHardwareVertexDecl) { pD3Ddevice->SetVertexDeclaration(pHardwareVertexDecl); if (vb_ == NULL) { if (useElements) { pD3Ddevice->DrawIndexedPrimitiveUP(glprim[prim], 0, vertexCount, D3DPrimCount(glprim[prim], vertexCount), decIndex, D3DFMT_INDEX16, decoded, dec_->GetDecVtxFmt().stride); } else { pD3Ddevice->DrawPrimitiveUP(glprim[prim], D3DPrimCount(glprim[prim], vertexCount), decoded, dec_->GetDecVtxFmt().stride); } } else { pD3Ddevice->SetStreamSource(0, vb_, 0, dec_->GetDecVtxFmt().stride); if (useElements) { pD3Ddevice->SetIndices(ib_); pD3Ddevice->DrawIndexedPrimitive(glprim[prim], 0, 0, 0, 0, D3DPrimCount(glprim[prim], vertexCount)); } else { pD3Ddevice->DrawPrimitive(glprim[prim], 0, D3DPrimCount(glprim[prim], vertexCount)); } } } } else { DecodeVerts(); bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; if (gstate.isModeThrough()) { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); } else { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && ((hasColor && (gstate.materialupdate & 1)) || gstate.getMaterialAmbientA() == 255) && (!gstate.isLightingEnabled() || gstate.getAmbientA() == 255); } gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); prim = indexGen.Prim(); // Undo the strip optimization, not supported by the SW code yet. if (prim == GE_PRIM_TRIANGLE_STRIP) prim = GE_PRIM_TRIANGLES; DEBUG_LOG(G3D, "Flush prim %i SW! %i verts in one go", prim, indexGen.VertexCount()); SoftwareTransformAndDraw( prim, decoded, indexGen.VertexCount(), dec_->VertexType(), (void *)decIndex, GE_VTYPE_IDX_16BIT, dec_->GetDecVtxFmt(), indexGen.MaxIndex()); } indexGen.Reset(); collectedVerts = 0; numDrawCalls = 0; vertexCountInDrawCalls = 0; prevPrim_ = GE_PRIM_INVALID; gstate_c.vertexFullAlpha = true; host->GPUNotifyDraw(); } bool TransformDrawEngineDX9::TestBoundingBox(void* control_points, int vertexCount, u32 vertType) { // Simplify away bones and morph before proceeding /* SimpleVertex *corners = (SimpleVertex *)(decoded + 65536 * 12); u8 *temp_buffer = decoded + 65536 * 24; u32 origVertType = vertType; vertType = NormalizeVertices((u8 *)corners, temp_buffer, (u8 *)control_points, 0, vertexCount, vertType); for (int cube = 0; cube < vertexCount / 8; cube++) { // For each cube... for (int i = 0; i < 8; i++) { const SimpleVertex &vert = corners[cube * 8 + i]; // To world space... float worldPos[3]; Vec3ByMatrix43(worldPos, (float *)&vert.pos.x, gstate.worldMatrix); // To view space... float viewPos[3]; Vec3ByMatrix43(viewPos, worldPos, gstate.viewMatrix); // And finally to screen space. float frustumPos[4]; Vec3ByMatrix44(frustumPos, viewPos, gstate.projMatrix); // Project to 2D float x = frustumPos[0] / frustumPos[3]; float y = frustumPos[1] / frustumPos[3]; // Rescale 2d position // ... } } */ // Let's think. A better approach might be to take the edges of the drawing region and the projection // matrix to build a frustum pyramid, and then clip the cube against those planes. If all vertices fail the same test, // the cube is out. Otherwise it's in. // TODO.... return true; } // TODO: Probably move this to common code (with normalization?) static Vec3f ClipToScreen(const Vec4f& coords) { // TODO: Check for invalid parameters (x2 < x1, etc) float vpx1 = getFloat24(gstate.viewportx1); float vpx2 = getFloat24(gstate.viewportx2); float vpy1 = getFloat24(gstate.viewporty1); float vpy2 = getFloat24(gstate.viewporty2); float vpz1 = getFloat24(gstate.viewportz1); float vpz2 = getFloat24(gstate.viewportz2); float retx = coords.x * vpx1 / coords.w + vpx2; float rety = coords.y * vpy1 / coords.w + vpy2; float retz = coords.z * vpz1 / coords.w + vpz2; // 16 = 0xFFFF / 4095.9375 return Vec3f(retx * 16, rety * 16, retz); } static Vec3f ScreenToDrawing(const Vec3f& coords) { Vec3f ret; ret.x = (coords.x - gstate.getOffsetX16()) * (1.0f / 16.0f); ret.y = (coords.y - gstate.getOffsetY16()) * (1.0f / 16.0f); ret.z = coords.z; return ret; } // TODO: This probably is not the best interface. bool TransformDrawEngineDX9::GetCurrentSimpleVertices(int count, std::vector &vertices, std::vector &indices) { // This is always for the current vertices. u16 indexLowerBound = 0; u16 indexUpperBound = count - 1; bool savedVertexFullAlpha = gstate_c.vertexFullAlpha; if ((gstate.vertType & GE_VTYPE_IDX_MASK) != GE_VTYPE_IDX_NONE) { const u8 *inds = Memory::GetPointer(gstate_c.indexAddr); const u16 *inds16 = (const u16 *)inds; if (inds) { GetIndexBounds(inds, count, gstate.vertType, &indexLowerBound, &indexUpperBound); indices.resize(count); switch (gstate.vertType & GE_VTYPE_IDX_MASK) { case GE_VTYPE_IDX_16BIT: for (int i = 0; i < count; ++i) { indices[i] = inds16[i]; } break; case GE_VTYPE_IDX_8BIT: for (int i = 0; i < count; ++i) { indices[i] = inds[i]; } break; default: return false; } } else { indices.clear(); } } else { indices.clear(); } static std::vector temp_buffer; static std::vector simpleVertices; temp_buffer.resize(std::max((int)indexUpperBound, 8192) * 128 / sizeof(u32)); simpleVertices.resize(indexUpperBound + 1); NormalizeVertices((u8 *)(&simpleVertices[0]), (u8 *)(&temp_buffer[0]), Memory::GetPointer(gstate_c.vertexAddr), indexLowerBound, indexUpperBound, gstate.vertType); float world[16]; float view[16]; float worldview[16]; float worldviewproj[16]; ConvertMatrix4x3To4x4(world, gstate.worldMatrix); ConvertMatrix4x3To4x4(view, gstate.viewMatrix); Matrix4ByMatrix4(worldview, world, view); Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix); vertices.resize(indexUpperBound + 1); for (int i = indexLowerBound; i <= indexUpperBound; ++i) { const SimpleVertex &vert = simpleVertices[i]; if (gstate.isModeThrough()) { if (gstate.vertType & GE_VTYPE_TC_MASK) { vertices[i].u = vert.uv[0]; vertices[i].v = vert.uv[1]; } else { vertices[i].u = 0.0f; vertices[i].v = 0.0f; } vertices[i].x = vert.pos.x; vertices[i].y = vert.pos.y; vertices[i].z = vert.pos.z; if (gstate.vertType & GE_VTYPE_COL_MASK) { memcpy(vertices[i].c, vert.color, sizeof(vertices[i].c)); } else { memset(vertices[i].c, 0, sizeof(vertices[i].c)); } } else { float clipPos[4]; Vec3ByMatrix44(clipPos, vert.pos.AsArray(), worldviewproj); Vec3f screenPos = ClipToScreen(clipPos); Vec3f drawPos = ScreenToDrawing(screenPos); if (gstate.vertType & GE_VTYPE_TC_MASK) { vertices[i].u = vert.uv[0]; vertices[i].v = vert.uv[1]; } else { vertices[i].u = 0.0f; vertices[i].v = 0.0f; } vertices[i].x = drawPos.x; vertices[i].y = drawPos.y; vertices[i].z = drawPos.z; if (gstate.vertType & GE_VTYPE_COL_MASK) { memcpy(vertices[i].c, vert.color, sizeof(vertices[i].c)); } else { memset(vertices[i].c, 0, sizeof(vertices[i].c)); } } } gstate_c.vertexFullAlpha = savedVertexFullAlpha; return true; } } // namespace