// Copyright (c) 2022- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#pragma once

#include <atomic>
#include <unordered_map>
#include "GPU/Software/Rasterizer.h"

struct BinWaitable;
class DrawBinItemsTask;

enum class BinItemType : uint8_t {
	TRIANGLE,
	CLEAR_RECT,
	RECT,
	SPRITE,
	LINE,
	POINT,
};

struct BinCoords {
	int x1;
	int y1;
	int x2;
	int y2;

	bool Invalid() const {
		return x2 < x1 || y2 < y1;
	}

	BinCoords Intersect(const BinCoords &range) const;
};

struct BinItem {
	BinItemType type;
	uint16_t stateIndex;
	BinCoords range;
	VertexData v0;
	VertexData v1;
	VertexData v2;
};

template <typename T, size_t N>
struct BinQueue {
	BinQueue() {
		Reset();
	}
	~BinQueue() {
		FreeAlignedMemory(items_);
	}

	void Setup() {
		items_ = (T *)AllocateAlignedMemory(sizeof_, 16);
	}

	void Reset() {
		head_ = 0;
		tail_ = 0;
		size_ = 0;
	}

	size_t Push(const T &item) {
		size_t i = tail_++;
		if (i + 1 == N)
			tail_ -= N;
		items_[i] = item;
		size_++;
		return i;
	}

	T Pop() {
		size_t i = head_++;
		if (i + 1 == N)
			head_ -= N;
		T item = items_[i];
		size_--;
		return item;
	}

	// Only safe if you're the only one reading.
	T &PeekNext() {
		return items_[head_];
	}

	void SkipNext() {
		size_t i = head_++;
		if (i + 1 == N)
			head_ -= N;
		size_--;
	}

	// Only safe if you're the only one reading.
	const T &Peek(size_t offset) const {
		size_t i = head_ + offset;
		if (i >= N)
			i -= N;
		return items_[i];
	}

	// Only safe if you're the only one writing.
	T &PeekPush() {
		return items_[tail_];
	}

	size_t PushPeeked() {
		size_t i = tail_++;
		if (i + 1 == N)
			tail_ -= N;
		size_++;
		return i;
	}

	size_t Size() const {
		return size_;
	}

	bool Full() const {
		return size_ == N - 1;
	}

	bool NearFull() const {
		return size_ >= N - 2;
	}

	bool Empty() const {
		return size_ == 0;
	}

	T &operator[](size_t index) {
		return items_[index];
	}

	const T &operator[](size_t index) const {
		return items_[index];
	}

	T *items_ = nullptr;
	std::atomic<size_t> head_;
	std::atomic<size_t> tail_ ;
	std::atomic<size_t> size_;
	static constexpr size_t sizeof_ = sizeof(T) * N;
};

union BinClut {
	uint8_t readable[1024];
};

struct BinTaskList {
	// We shouldn't ever need more than two at once, since we use an atomic to run one at a time.
	// A second could run due to overlap during teardown.
	static constexpr int N = 2;

	DrawBinItemsTask *tasks[N]{};
	int count = 0;

	DrawBinItemsTask *Next() {
		return tasks[count % N];
	}
};

struct BinDirtyRange {
	uint32_t base;
	uint32_t strideBytes;
	uint32_t widthBytes;
	uint32_t height;

	void Expand(uint32_t newBase, uint32_t bpp, uint32_t stride, const DrawingCoords &tl, const DrawingCoords &br);
};

class BinManager {
public:
	BinManager();
	~BinManager();

	void UpdateState();
	void UpdateClut(const void *src);

	const Rasterizer::RasterizerState &State() {
		return states_[stateIndex_];
	}

	void AddTriangle(const VertexData &v0, const VertexData &v1, const VertexData &v2);
	void AddClearRect(const VertexData &v0, const VertexData &v1);
	void AddRect(const VertexData &v0, const VertexData &v1);
	void AddSprite(const VertexData &v0, const VertexData &v1);
	void AddLine(const VertexData &v0, const VertexData &v1);
	void AddPoint(const VertexData &v0);

	void Drain(bool flushing = false);
	void Flush(const char *reason);
	bool HasPendingWrite(uint32_t start, uint32_t stride, uint32_t w, uint32_t h);
	// Assumes you've also checked for a write (writes are partial so are automatically reads.)
	bool HasPendingRead(uint32_t start, uint32_t stride, uint32_t w, uint32_t h);

	void GetStats(char *buffer, size_t bufsize);
	void ResetStats();

	void SetDirty(SoftDirty flags) {
		dirty_ |= flags;
	}
	void ClearDirty(SoftDirty flags) {
		dirty_ &= ~flags;
	}
	SoftDirty GetDirty() {
		return dirty_;
	}
	bool HasDirty(SoftDirty flags) {
		return dirty_ & flags;
	}

protected:
#if PPSSPP_ARCH(32BIT)
	// Use less memory and less address space.  We're unlikely to have 32 cores on a 32-bit CPU.
	static constexpr int MAX_POSSIBLE_TASKS = 16;
#else
	static constexpr int MAX_POSSIBLE_TASKS = 64;
#endif
	// This is about 1MB of state data.
	static constexpr int QUEUED_STATES = 4096;
	// These are 1KB each, so half an MB.
	static constexpr int QUEUED_CLUTS = 512;
	// About 360 KB, but we have usually 16 or less of them, so 5 MB - 22 MB.
	static constexpr int QUEUED_PRIMS = 2048;

	typedef BinQueue<Rasterizer::RasterizerState, QUEUED_STATES> BinStateQueue;
	typedef BinQueue<BinClut, QUEUED_CLUTS> BinClutQueue;
	typedef BinQueue<BinItem, QUEUED_PRIMS> BinItemQueue;

private:
	BinStateQueue states_;
	BinClutQueue cluts_;
	uint16_t stateIndex_;
	uint16_t clutIndex_;
	BinCoords scissor_;
	BinItemQueue queue_;
	BinCoords queueRange_;
	SoftDirty dirty_ = SoftDirty::NONE;

	int maxTasks_ = 1;
	bool tasksSplit_ = false;
	std::vector<BinCoords> taskRanges_;
	BinItemQueue taskQueues_[MAX_POSSIBLE_TASKS];
	BinTaskList taskLists_[MAX_POSSIBLE_TASKS];
	std::atomic<bool> taskStatus_[MAX_POSSIBLE_TASKS];
	BinWaitable *waitable_ = nullptr;

	BinDirtyRange pendingWrites_[2]{};
	std::unordered_map<uint32_t, BinDirtyRange> pendingReads_;

	bool pendingOverlap_ = false;
	bool creatingState_ = false;
	uint16_t pendingStateIndex_ = 0;

	std::unordered_map<const char *, double> flushReasonTimes_;
	std::unordered_map<const char *, double> lastFlushReasonTimes_;
	const char *slowestFlushReason_ = nullptr;
	double slowestFlushTime_ = 0.0;
	int lastFlipstats_ = 0;
	int enqueues_ = 0;
	int mostThreads_ = 0;

	void MarkPendingReads(const Rasterizer::RasterizerState &state);
	void MarkPendingWrites(const Rasterizer::RasterizerState &state);
	bool HasTextureWrite(const Rasterizer::RasterizerState &state);
	bool IsExactSelfRender(const Rasterizer::RasterizerState &state, const BinItem &item);
	void OptimizePendingStates(uint16_t first, uint16_t last);
	BinCoords Scissor(BinCoords range);
	BinCoords Range(const VertexData &v0, const VertexData &v1, const VertexData &v2);
	BinCoords Range(const VertexData &v0, const VertexData &v1);
	BinCoords Range(const VertexData &v0);
	void Expand(const BinCoords &range);

	friend class DrawBinItemsTask;
};