SaveState: Update to Snappy 453942b.

This commit is contained in:
Unknown W. Brackets 2021-02-22 22:24:04 -08:00
parent 39198b53a5
commit 7a9cbdfd6a
8 changed files with 1276 additions and 757 deletions

View file

@ -46,16 +46,16 @@ class WorkingMemory {
// Allocates and clears a hash table using memory in "*this",
// stores the number of buckets in "*table_size" and returns a pointer to
// the base of the hash table.
uint16* GetHashTable(size_t fragment_size, int* table_size) const;
uint16_t* GetHashTable(size_t fragment_size, int* table_size) const;
char* GetScratchInput() const { return input_; }
char* GetScratchOutput() const { return output_; }
private:
char* mem_; // the allocated memory, never nullptr
size_t size_; // the size of the allocated memory, never 0
uint16* table_; // the pointer to the hashtable
char* input_; // the pointer to the input scratch buffer
char* output_; // the pointer to the output scratch buffer
char* mem_; // the allocated memory, never nullptr
size_t size_; // the size of the allocated memory, never 0
uint16_t* table_; // the pointer to the hashtable
char* input_; // the pointer to the input scratch buffer
char* output_; // the pointer to the output scratch buffer
// No copying
WorkingMemory(const WorkingMemory&);
@ -76,7 +76,7 @@ class WorkingMemory {
char* CompressFragment(const char* input,
size_t input_length,
char* op,
uint16* table,
uint16_t* table,
const int table_size);
// Find the largest n such that
@ -89,12 +89,18 @@ char* CompressFragment(const char* input,
// Does not read *(s1 + (s2_limit - s2)) or beyond.
// Requires that s2_limit >= s2.
//
// In addition, populate *data with the next 5 bytes from the end of the match.
// This is only done if 8 bytes are available (s2_limit - s2 >= 8). The point is
// that on some architectures this can be done faster in this routine than by
// subsequently loading from s2 + n.
//
// Separate implementation for 64-bit, little-endian cpus.
#if !defined(SNAPPY_IS_BIG_ENDIAN) && \
(defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM))
(defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || defined(ARCH_ARM))
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
const char* s2,
const char* s2_limit) {
const char* s2_limit,
uint64_t* data) {
assert(s2_limit >= s2);
size_t matched = 0;
@ -103,12 +109,71 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
// uncommon code paths that determine, without extra effort, whether the match
// length is less than 8. In short, we are hoping to avoid a conditional
// branch, and perhaps get better code layout from the C++ compiler.
if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 8)) {
uint64 a1 = UNALIGNED_LOAD64(s1);
uint64 a2 = UNALIGNED_LOAD64(s2);
if (a1 != a2) {
return std::pair<size_t, bool>(Bits::FindLSBSetNonZero64(a1 ^ a2) >> 3,
true);
if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
uint64_t a1 = UNALIGNED_LOAD64(s1);
uint64_t a2 = UNALIGNED_LOAD64(s2);
if (SNAPPY_PREDICT_TRUE(a1 != a2)) {
// This code is critical for performance. The reason is that it determines
// how much to advance `ip` (s2). This obviously depends on both the loads
// from the `candidate` (s1) and `ip`. Furthermore the next `candidate`
// depends on the advanced `ip` calculated here through a load, hash and
// new candidate hash lookup (a lot of cycles). This makes s1 (ie.
// `candidate`) the variable that limits throughput. This is the reason we
// go through hoops to have this function update `data` for the next iter.
// The straightforward code would use *data, given by
//
// *data = UNALIGNED_LOAD64(s2 + matched_bytes) (Latency of 5 cycles),
//
// as input for the hash table lookup to find next candidate. However
// this forces the load on the data dependency chain of s1, because
// matched_bytes directly depends on s1. However matched_bytes is 0..7, so
// we can also calculate *data by
//
// *data = AlignRight(UNALIGNED_LOAD64(s2), UNALIGNED_LOAD64(s2 + 8),
// matched_bytes);
//
// The loads do not depend on s1 anymore and are thus off the bottleneck.
// The straightforward implementation on x86_64 would be to use
//
// shrd rax, rdx, cl (cl being matched_bytes * 8)
//
// unfortunately shrd with a variable shift has a 4 cycle latency. So this
// only wins 1 cycle. The BMI2 shrx instruction is a 1 cycle variable
// shift instruction but can only shift 64 bits. If we focus on just
// obtaining the least significant 4 bytes, we can obtain this by
//
// *data = ConditionalMove(matched_bytes < 4, UNALIGNED_LOAD64(s2),
// UNALIGNED_LOAD64(s2 + 4)) >> ((matched_bytes & 3) * 8);
//
// Written like above, this is not a big win: the conditional move would be
// a cmp followed by a cmov (2 cycles) followed by a shift (1 cycle).
// However, matched_bytes < 4 is equal to
// static_cast<uint32_t>(xorval) != 0. Written that way, the conditional
// move (2 cycles) can execute in parallel with FindLSBSetNonZero64
// (tzcnt), which takes 3 cycles.
uint64_t xorval = a1 ^ a2;
int shift = Bits::FindLSBSetNonZero64(xorval);
size_t matched_bytes = shift >> 3;
#ifndef __x86_64__
*data = UNALIGNED_LOAD64(s2 + matched_bytes);
#else
// Ideally this would just be
//
// a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
//
// However clang correctly infers that the above statement participates on
// a critical data dependency chain and thus, unfortunately, refuses to
// use a conditional move (it's tuned to cut data dependencies). In this
// case there is a longer parallel chain anyway AND this will be fairly
// unpredictable.
uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
asm("testl %k2, %k2\n\t"
"cmovzq %1, %0\n\t"
: "+r"(a2)
: "r"(a3), "r"(xorval));
*data = a2 >> (shift & (3 * 8));
#endif
return std::pair<size_t, bool>(matched_bytes, true);
} else {
matched = 8;
s2 += 8;
@ -119,14 +184,27 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
// time until we find a 64-bit block that doesn't match; then we find
// the first non-matching bit and use that to calculate the total
// length of the match.
while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 8)) {
if (UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched)) {
while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
uint64_t a1 = UNALIGNED_LOAD64(s1 + matched);
uint64_t a2 = UNALIGNED_LOAD64(s2);
if (a1 == a2) {
s2 += 8;
matched += 8;
} else {
uint64 x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched);
int matching_bits = Bits::FindLSBSetNonZero64(x);
matched += matching_bits >> 3;
uint64_t xorval = a1 ^ a2;
int shift = Bits::FindLSBSetNonZero64(xorval);
size_t matched_bytes = shift >> 3;
#ifndef __x86_64__
*data = UNALIGNED_LOAD64(s2 + matched_bytes);
#else
uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
asm("testl %k2, %k2\n\t"
"cmovzq %1, %0\n\t"
: "+r"(a2)
: "r"(a3), "r"(xorval));
*data = a2 >> (shift & (3 * 8));
#endif
matched += matched_bytes;
assert(matched >= 8);
return std::pair<size_t, bool>(matched, false);
}
@ -136,6 +214,9 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
++s2;
++matched;
} else {
if (s2 <= s2_limit - 8) {
*data = UNALIGNED_LOAD64(s2);
}
return std::pair<size_t, bool>(matched, matched < 8);
}
}
@ -144,7 +225,8 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
#else
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
const char* s2,
const char* s2_limit) {
const char* s2_limit,
uint64_t* data) {
// Implementation based on the x86-64 version, above.
assert(s2_limit >= s2);
int matched = 0;
@ -155,15 +237,17 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
matched += 4;
}
if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 4) {
uint32 x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
uint32_t x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
int matching_bits = Bits::FindLSBSetNonZero(x);
matched += matching_bits >> 3;
s2 += matching_bits >> 3;
} else {
while ((s2 < s2_limit) && (s1[matched] == *s2)) {
++s2;
++matched;
}
}
if (s2 <= s2_limit - 8) *data = LittleEndian::Load64(s2);
return std::pair<size_t, bool>(matched, matched < 8);
}
#endif
@ -190,7 +274,8 @@ static const int kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the actual o
// because of efficiency reasons:
// (1) Extracting a byte is faster than a bit-field
// (2) It properly aligns copy offset so we do not need a <<8
static const uint16 char_table[256] = {
static constexpr uint16_t char_table[256] = {
// clang-format off
0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
@ -222,7 +307,8 @@ static const uint16 char_table[256] = {
0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040,
// clang-format on
};
} // end namespace internal

View file

@ -26,23 +26,31 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <string.h>
#include <stddef.h>
#include <cstring>
#include "snappy-sinksource.h"
namespace snappy {
Source::~Source() { }
Source::~Source() = default;
Sink::~Sink() { }
Sink::~Sink() = default;
// Default implementation: hands the caller-supplied scratch buffer straight
// back, without looking at the requested length.
char* Sink::GetAppendBuffer(size_t length, char* scratch) {
  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
  static_cast<void>(length);  // Unused here; the cast silences the warning.
  char* const buffer = scratch;
  return buffer;
}
// Default implementation: reports the whole scratch buffer as the allocated
// region and returns it. The size request and size hint are not consulted.
char* Sink::GetAppendBufferVariable(
    size_t min_size, size_t desired_size_hint, char* scratch,
    size_t scratch_size, size_t* allocated_size) {
  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
  static_cast<void>(min_size);
  static_cast<void>(desired_size_hint);

  // The caller learns the usable size through the out-parameter.
  *allocated_size = scratch_size;
  char* const buffer = scratch;
  return buffer;
}
@ -55,7 +63,7 @@ void Sink::AppendAndTakeOwnership(
(*deleter)(deleter_arg, bytes, n);
}
ByteArraySource::~ByteArraySource() { }
ByteArraySource::~ByteArraySource() = default;
// Returns the current value of the left_ counter.
size_t ByteArraySource::Available() const {
  const size_t remaining = left_;
  return remaining;
}
@ -74,22 +82,26 @@ UncheckedByteArraySink::~UncheckedByteArraySink() { }
void UncheckedByteArraySink::Append(const char* data, size_t n) {
// Do no copying if the caller filled in the result of GetAppendBuffer()
if (data != dest_) {
memcpy(dest_, data, n);
std::memcpy(dest_, data, n);
}
dest_ += n;
}
// Always returns the sink's current output position (dest_). Neither the
// requested length nor the scratch buffer is used.
char* UncheckedByteArraySink::GetAppendBuffer(size_t len, char* scratch) {
  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
  static_cast<void>(len);
  static_cast<void>(scratch);
  char* const out = dest_;
  return out;
}
void UncheckedByteArraySink::AppendAndTakeOwnership(
char* data, size_t n,
char* bytes, size_t n,
void (*deleter)(void*, const char*, size_t),
void *deleter_arg) {
if (data != dest_) {
memcpy(dest_, data, n);
(*deleter)(deleter_arg, data, n);
if (bytes != dest_) {
std::memcpy(dest_, bytes, n);
(*deleter)(deleter_arg, bytes, n);
}
dest_ += n;
}
@ -97,6 +109,11 @@ void UncheckedByteArraySink::AppendAndTakeOwnership(
// Returns the sink's current output position (dest_) and reports the size
// hint as the allocated size. The scratch buffer and minimum size are unused.
char* UncheckedByteArraySink::GetAppendBufferVariable(
    size_t min_size, size_t desired_size_hint, char* scratch,
    size_t scratch_size, size_t* allocated_size) {
  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
  static_cast<void>(min_size);
  static_cast<void>(scratch);
  static_cast<void>(scratch_size);

  // No bounds are checked here: the hint is passed through unchanged.
  *allocated_size = desired_size_hint;
  char* const out = dest_;
  return out;
}

View file

@ -146,10 +146,10 @@ class Source {
class ByteArraySource : public Source {
public:
ByteArraySource(const char* p, size_t n) : ptr_(p), left_(n) { }
virtual ~ByteArraySource();
virtual size_t Available() const;
virtual const char* Peek(size_t* len);
virtual void Skip(size_t n);
~ByteArraySource() override;
size_t Available() const override;
const char* Peek(size_t* len) override;
void Skip(size_t n) override;
private:
const char* ptr_;
size_t left_;
@ -159,15 +159,15 @@ class ByteArraySource : public Source {
class UncheckedByteArraySink : public Sink {
public:
explicit UncheckedByteArraySink(char* dest) : dest_(dest) { }
virtual ~UncheckedByteArraySink();
virtual void Append(const char* data, size_t n);
virtual char* GetAppendBuffer(size_t len, char* scratch);
virtual char* GetAppendBufferVariable(
~UncheckedByteArraySink() override;
void Append(const char* data, size_t n) override;
char* GetAppendBuffer(size_t len, char* scratch) override;
char* GetAppendBufferVariable(
size_t min_size, size_t desired_size_hint, char* scratch,
size_t scratch_size, size_t* allocated_size);
virtual void AppendAndTakeOwnership(
size_t scratch_size, size_t* allocated_size) override;
void AppendAndTakeOwnership(
char* bytes, size_t n, void (*deleter)(void*, const char*, size_t),
void *deleter_arg);
void *deleter_arg) override;
// Return the current output pointer so that a caller can see how
// many bytes were produced.

View file

@ -33,7 +33,7 @@
namespace snappy {
void Varint::Append32(std::string* s, uint32 value) {
void Varint::Append32(std::string* s, uint32_t value) {
char buf[Varint::kMax32];
const char* p = Varint::Encode32(buf, value);
s->append(buf, p - buf);

View file

@ -35,11 +35,13 @@
#include "config.h"
#endif
#include <string>
#include <stdint.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <string>
#ifdef HAVE_SYS_MMAN_H
#include <sys/mman.h>
@ -67,19 +69,11 @@
#include "snappy-stubs-public.h"
#if defined(__x86_64__)
// Enable 64-bit optimized versions of some routines.
#define ARCH_K8 1
#elif defined(__ppc64__)
// Used to enable 64-bit optimized versions of some routines.
#if defined(__PPC64__) || defined(__powerpc64__)
#define ARCH_PPC 1
#elif defined(__aarch64__)
#elif defined(__aarch64__) || defined(_M_ARM64)
#define ARCH_ARM 1
#endif
// Needed by OS X, among others.
@ -93,7 +87,7 @@
#ifdef ARRAYSIZE
#undef ARRAYSIZE
#endif
#define ARRAYSIZE(a) (sizeof(a) / sizeof(*(a)))
#define ARRAYSIZE(a) int{sizeof(a) / sizeof(*(a))}
// Static prediction hints.
#ifdef HAVE_BUILTIN_EXPECT
@ -104,6 +98,13 @@
#define SNAPPY_PREDICT_TRUE(x) x
#endif
// Inlining hints.
#ifdef HAVE_ATTRIBUTE_ALWAYS_INLINE
#define SNAPPY_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
#else
#define SNAPPY_ATTRIBUTE_ALWAYS_INLINE
#endif
// This is only used for recomputing the tag byte table used during
// decompression; for simplicity we just remove it from the open-source
// version (anyone who wants to regenerate it can just do the call
@ -115,201 +116,47 @@
namespace snappy {
static const uint32 kuint32max = static_cast<uint32>(0xFFFFFFFF);
static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);
static const uint32_t kuint32max = std::numeric_limits<uint32_t>::max();
static const int64_t kint64max = std::numeric_limits<int64_t>::max();
// Potentially unaligned loads and stores.
// x86, PowerPC, and ARM64 can simply do these loads and stores native.
#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || \
defined(__aarch64__)
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
#define UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64 *>(_p))
#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
#define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val))
// ARMv7 and newer support native unaligned accesses, but only of 16-bit
// and 32-bit values (not 64-bit); older versions either raise a fatal signal,
// do an unaligned read and rotate the words around a bit, or do the reads very
// slowly (trip through kernel mode). There's no simple #define that says just
// "ARMv7 or higher", so we have to filter away all ARMv5 and ARMv6
// sub-architectures.
//
// This is a mess, but there's not much we can do about it.
//
// To further complicate matters, only LDR instructions (single reads) are
// allowed to be unaligned, not LDRD (two reads) or LDM (many reads). Unless we
// explicitly tell the compiler that these accesses can be unaligned, it can and
// will combine accesses. On armcc, the way to signal this is done by accessing
// through the type (uint32 __packed *), but GCC has no such attribute
// (it ignores __attribute__((packed)) on individual variables). However,
// we can tell it that a _struct_ is unaligned, which has the same effect,
// so we do that.
#elif defined(__arm__) && \
!defined(__ARM_ARCH_4__) && \
!defined(__ARM_ARCH_4T__) && \
!defined(__ARM_ARCH_5__) && \
!defined(__ARM_ARCH_5T__) && \
!defined(__ARM_ARCH_5TE__) && \
!defined(__ARM_ARCH_5TEJ__) && \
!defined(__ARM_ARCH_6__) && \
!defined(__ARM_ARCH_6J__) && \
!defined(__ARM_ARCH_6K__) && \
!defined(__ARM_ARCH_6Z__) && \
!defined(__ARM_ARCH_6ZK__) && \
!defined(__ARM_ARCH_6T2__)
#if __GNUC__
#define ATTRIBUTE_PACKED __attribute__((__packed__))
#else
#define ATTRIBUTE_PACKED
#endif
namespace base {
namespace internal {
struct Unaligned16Struct {
uint16 value;
uint8 dummy; // To make the size non-power-of-two.
} ATTRIBUTE_PACKED;
struct Unaligned32Struct {
uint32 value;
uint8 dummy; // To make the size non-power-of-two.
} ATTRIBUTE_PACKED;
} // namespace internal
} // namespace base
#define UNALIGNED_LOAD16(_p) \
((reinterpret_cast<const ::snappy::base::internal::Unaligned16Struct *>(_p))->value)
#define UNALIGNED_LOAD32(_p) \
((reinterpret_cast<const ::snappy::base::internal::Unaligned32Struct *>(_p))->value)
#define UNALIGNED_STORE16(_p, _val) \
((reinterpret_cast< ::snappy::base::internal::Unaligned16Struct *>(_p))->value = \
(_val))
#define UNALIGNED_STORE32(_p, _val) \
((reinterpret_cast< ::snappy::base::internal::Unaligned32Struct *>(_p))->value = \
(_val))
// TODO: NEON supports unaligned 64-bit loads and stores.
// See if that would be more efficient on platforms supporting it,
// at least for copies.
inline uint64 UNALIGNED_LOAD64(const void *p) {
uint64 t;
memcpy(&t, p, sizeof t);
return t;
inline uint16_t UNALIGNED_LOAD16(const void *p) {
// Compiles to a single movzx/ldrh on clang/gcc/msvc.
uint16_t v;
std::memcpy(&v, p, sizeof(v));
return v;
}
inline void UNALIGNED_STORE64(void *p, uint64 v) {
memcpy(p, &v, sizeof v);
inline uint32_t UNALIGNED_LOAD32(const void *p) {
// Compiles to a single mov/ldr on clang/gcc/msvc.
uint32_t v;
std::memcpy(&v, p, sizeof(v));
return v;
}
#else
// These functions are provided for architectures that don't support
// unaligned loads and stores.
inline uint16 UNALIGNED_LOAD16(const void *p) {
uint16 t;
memcpy(&t, p, sizeof t);
return t;
inline uint64_t UNALIGNED_LOAD64(const void *p) {
// Compiles to a single mov/ldr on clang/gcc/msvc.
uint64_t v;
std::memcpy(&v, p, sizeof(v));
return v;
}
inline uint32 UNALIGNED_LOAD32(const void *p) {
uint32 t;
memcpy(&t, p, sizeof t);
return t;
inline void UNALIGNED_STORE16(void *p, uint16_t v) {
// Compiles to a single mov/strh on clang/gcc/msvc.
std::memcpy(p, &v, sizeof(v));
}
inline uint64 UNALIGNED_LOAD64(const void *p) {
uint64 t;
memcpy(&t, p, sizeof t);
return t;
inline void UNALIGNED_STORE32(void *p, uint32_t v) {
// Compiles to a single mov/str on clang/gcc/msvc.
std::memcpy(p, &v, sizeof(v));
}
inline void UNALIGNED_STORE16(void *p, uint16 v) {
memcpy(p, &v, sizeof v);
inline void UNALIGNED_STORE64(void *p, uint64_t v) {
// Compiles to a single mov/str on clang/gcc/msvc.
std::memcpy(p, &v, sizeof(v));
}
inline void UNALIGNED_STORE32(void *p, uint32 v) {
memcpy(p, &v, sizeof v);
}
inline void UNALIGNED_STORE64(void *p, uint64 v) {
memcpy(p, &v, sizeof v);
}
#endif
// The following guarantees declaration of the byte swap functions.
#if defined(SNAPPY_IS_BIG_ENDIAN)
#ifdef HAVE_SYS_BYTEORDER_H
#include <sys/byteorder.h>
#endif
#ifdef HAVE_SYS_ENDIAN_H
#include <sys/endian.h>
#endif
#ifdef _MSC_VER
#include <stdlib.h>
#define bswap_16(x) _byteswap_ushort(x)
#define bswap_32(x) _byteswap_ulong(x)
#define bswap_64(x) _byteswap_uint64(x)
#elif defined(__APPLE__)
// Mac OS X / Darwin features
#include <libkern/OSByteOrder.h>
#define bswap_16(x) OSSwapInt16(x)
#define bswap_32(x) OSSwapInt32(x)
#define bswap_64(x) OSSwapInt64(x)
#elif defined(HAVE_BYTESWAP_H)
#include <byteswap.h>
#elif defined(bswap32)
// FreeBSD defines bswap{16,32,64} in <sys/endian.h> (already #included).
#define bswap_16(x) bswap16(x)
#define bswap_32(x) bswap32(x)
#define bswap_64(x) bswap64(x)
#elif defined(BSWAP_64)
// Solaris 10 defines BSWAP_{16,32,64} in <sys/byteorder.h> (already #included).
#define bswap_16(x) BSWAP_16(x)
#define bswap_32(x) BSWAP_32(x)
#define bswap_64(x) BSWAP_64(x)
#else
inline uint16 bswap_16(uint16 x) {
return (x << 8) | (x >> 8);
}
inline uint32 bswap_32(uint32 x) {
x = ((x & 0xff00ff00UL) >> 8) | ((x & 0x00ff00ffUL) << 8);
return (x >> 16) | (x << 16);
}
inline uint64 bswap_64(uint64 x) {
x = ((x & 0xff00ff00ff00ff00ULL) >> 8) | ((x & 0x00ff00ff00ff00ffULL) << 8);
x = ((x & 0xffff0000ffff0000ULL) >> 16) | ((x & 0x0000ffff0000ffffULL) << 16);
return (x >> 32) | (x << 32);
}
#endif
#endif // defined(SNAPPY_IS_BIG_ENDIAN)
// Convert to little-endian storage, opposite of network format.
// Convert x from host to little endian: x = LittleEndian.FromHost(x);
// convert x from little endian to host: x = LittleEndian.ToHost(x);
@ -321,44 +168,77 @@ inline uint64 bswap_64(uint64 x) {
// x = LittleEndian.Load16(p);
class LittleEndian {
public:
// Conversion functions.
#if defined(SNAPPY_IS_BIG_ENDIAN)
static uint16 FromHost16(uint16 x) { return bswap_16(x); }
static uint16 ToHost16(uint16 x) { return bswap_16(x); }
static uint32 FromHost32(uint32 x) { return bswap_32(x); }
static uint32 ToHost32(uint32 x) { return bswap_32(x); }
static bool IsLittleEndian() { return false; }
#else // !defined(SNAPPY_IS_BIG_ENDIAN)
static uint16 FromHost16(uint16 x) { return x; }
static uint16 ToHost16(uint16 x) { return x; }
static uint32 FromHost32(uint32 x) { return x; }
static uint32 ToHost32(uint32 x) { return x; }
static bool IsLittleEndian() { return true; }
#endif // !defined(SNAPPY_IS_BIG_ENDIAN)
// Functions to do unaligned loads and stores in little-endian order.
static uint16 Load16(const void *p) {
return ToHost16(UNALIGNED_LOAD16(p));
static inline uint16_t Load16(const void *ptr) {
const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
// Compiles to a single mov/ldr on recent clang and gcc.
return (static_cast<uint16_t>(buffer[0])) |
(static_cast<uint16_t>(buffer[1]) << 8);
}
static void Store16(void *p, uint16 v) {
UNALIGNED_STORE16(p, FromHost16(v));
static inline uint32_t Load32(const void *ptr) {
const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
// Compiles to a single mov/ldr on recent clang and gcc.
return (static_cast<uint32_t>(buffer[0])) |
(static_cast<uint32_t>(buffer[1]) << 8) |
(static_cast<uint32_t>(buffer[2]) << 16) |
(static_cast<uint32_t>(buffer[3]) << 24);
}
static uint32 Load32(const void *p) {
return ToHost32(UNALIGNED_LOAD32(p));
static inline uint64_t Load64(const void *ptr) {
const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
// Compiles to a single mov/ldr on recent clang and gcc.
return (static_cast<uint64_t>(buffer[0])) |
(static_cast<uint64_t>(buffer[1]) << 8) |
(static_cast<uint64_t>(buffer[2]) << 16) |
(static_cast<uint64_t>(buffer[3]) << 24) |
(static_cast<uint64_t>(buffer[4]) << 32) |
(static_cast<uint64_t>(buffer[5]) << 40) |
(static_cast<uint64_t>(buffer[6]) << 48) |
(static_cast<uint64_t>(buffer[7]) << 56);
}
static void Store32(void *p, uint32 v) {
UNALIGNED_STORE32(p, FromHost32(v));
static inline void Store16(void *dst, uint16_t value) {
uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
// Compiles to a single mov/str on recent clang and gcc.
buffer[0] = static_cast<uint8_t>(value);
buffer[1] = static_cast<uint8_t>(value >> 8);
}
static void Store32(void *dst, uint32_t value) {
uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
// Compiles to a single mov/str on recent clang and gcc.
buffer[0] = static_cast<uint8_t>(value);
buffer[1] = static_cast<uint8_t>(value >> 8);
buffer[2] = static_cast<uint8_t>(value >> 16);
buffer[3] = static_cast<uint8_t>(value >> 24);
}
static void Store64(void* dst, uint64_t value) {
uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
// Compiles to a single mov/str on recent clang and gcc.
buffer[0] = static_cast<uint8_t>(value);
buffer[1] = static_cast<uint8_t>(value >> 8);
buffer[2] = static_cast<uint8_t>(value >> 16);
buffer[3] = static_cast<uint8_t>(value >> 24);
buffer[4] = static_cast<uint8_t>(value >> 32);
buffer[5] = static_cast<uint8_t>(value >> 40);
buffer[6] = static_cast<uint8_t>(value >> 48);
buffer[7] = static_cast<uint8_t>(value >> 56);
}
static inline constexpr bool IsLittleEndian() {
#if defined(SNAPPY_IS_BIG_ENDIAN)
return false;
#else
return true;
#endif // defined(SNAPPY_IS_BIG_ENDIAN)
}
};
@ -366,19 +246,17 @@ class LittleEndian {
class Bits {
public:
// Return floor(log2(n)) for positive integer n.
static int Log2FloorNonZero(uint32 n);
static int Log2FloorNonZero(uint32_t n);
// Return floor(log2(n)) for positive integer n. Returns -1 iff n == 0.
static int Log2Floor(uint32 n);
static int Log2Floor(uint32_t n);
// Return the first set least / most significant bit, 0-indexed. Returns an
// undefined value if n == 0. FindLSBSetNonZero() is similar to ffs() except
// that it's 0-indexed.
static int FindLSBSetNonZero(uint32 n);
static int FindLSBSetNonZero(uint32_t n);
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
static int FindLSBSetNonZero64(uint64 n);
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
static int FindLSBSetNonZero64(uint64_t n);
private:
// No copying
@ -386,9 +264,9 @@ class Bits {
void operator=(const Bits&);
};
#ifdef HAVE_BUILTIN_CTZ
#if defined(HAVE_BUILTIN_CTZ)
inline int Bits::Log2FloorNonZero(uint32 n) {
inline int Bits::Log2FloorNonZero(uint32_t n) {
assert(n != 0);
// (31 ^ x) is equivalent to (31 - x) for x in [0, 31]. An easy proof
// represents subtraction in base 2 and observes that there's no carry.
@ -399,66 +277,52 @@ inline int Bits::Log2FloorNonZero(uint32 n) {
return 31 ^ __builtin_clz(n);
}
inline int Bits::Log2Floor(uint32 n) {
inline int Bits::Log2Floor(uint32_t n) {
return (n == 0) ? -1 : Bits::Log2FloorNonZero(n);
}
inline int Bits::FindLSBSetNonZero(uint32 n) {
inline int Bits::FindLSBSetNonZero(uint32_t n) {
assert(n != 0);
return __builtin_ctz(n);
}
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
inline int Bits::FindLSBSetNonZero64(uint64 n) {
assert(n != 0);
return __builtin_ctzll(n);
}
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
#elif defined(_MSC_VER)
inline int Bits::Log2FloorNonZero(uint32 n) {
inline int Bits::Log2FloorNonZero(uint32_t n) {
assert(n != 0);
// NOLINTNEXTLINE(runtime/int): The MSVC intrinsic demands unsigned long.
unsigned long where;
_BitScanReverse(&where, n);
return static_cast<int>(where);
}
inline int Bits::Log2Floor(uint32 n) {
inline int Bits::Log2Floor(uint32_t n) {
// NOLINTNEXTLINE(runtime/int): The MSVC intrinsic demands unsigned long.
unsigned long where;
if (_BitScanReverse(&where, n))
return static_cast<int>(where);
return -1;
}
inline int Bits::FindLSBSetNonZero(uint32 n) {
inline int Bits::FindLSBSetNonZero(uint32_t n) {
assert(n != 0);
// NOLINTNEXTLINE(runtime/int): The MSVC intrinsic demands unsigned long.
unsigned long where;
if (_BitScanForward(&where, n))
return static_cast<int>(where);
return 32;
}
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
inline int Bits::FindLSBSetNonZero64(uint64 n) {
assert(n != 0);
unsigned long where;
if (_BitScanForward64(&where, n))
return static_cast<int>(where);
return 64;
}
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
#else // Portable versions.
inline int Bits::Log2FloorNonZero(uint32 n) {
inline int Bits::Log2FloorNonZero(uint32_t n) {
assert(n != 0);
int log = 0;
uint32 value = n;
uint32_t value = n;
for (int i = 4; i >= 0; --i) {
int shift = (1 << i);
uint32 x = value >> shift;
uint32_t x = value >> shift;
if (x != 0) {
value = x;
log += shift;
@ -468,16 +332,16 @@ inline int Bits::Log2FloorNonZero(uint32 n) {
return log;
}
inline int Bits::Log2Floor(uint32 n) {
inline int Bits::Log2Floor(uint32_t n) {
return (n == 0) ? -1 : Bits::Log2FloorNonZero(n);
}
inline int Bits::FindLSBSetNonZero(uint32 n) {
inline int Bits::FindLSBSetNonZero(uint32_t n) {
assert(n != 0);
int rc = 31;
for (int i = 4, shift = 1 << 4; i >= 0; --i) {
const uint32 x = n << shift;
const uint32_t x = n << shift;
if (x != 0) {
n = x;
rc -= shift;
@ -487,27 +351,48 @@ inline int Bits::FindLSBSetNonZero(uint32 n) {
return rc;
}
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
#endif // End portable versions.
#if defined(HAVE_BUILTIN_CTZ)
// Compiler-builtin variant: 0-based index of the least significant set bit
// of n. n must be non-zero (checked by assert in debug builds).
inline int Bits::FindLSBSetNonZero64(uint64_t n) {
  assert(n != 0);
  const int lowest_set_bit = __builtin_ctzll(n);
  return lowest_set_bit;
}
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
// _BitScanForward64() is only available on x64 and ARM64.
// MSVC intrinsic variant: 0-based index of the least significant set bit of
// n. n must be non-zero (checked by assert in debug builds); if the intrinsic
// reports that no bit was found, 64 is returned.
inline int Bits::FindLSBSetNonZero64(uint64_t n) {
  assert(n != 0);
  // NOLINTNEXTLINE(runtime/int): The MSVC intrinsic demands unsigned long.
  unsigned long where;
  const bool found = _BitScanForward64(&where, n) != 0;
  return found ? static_cast<int>(where) : 64;
}
#else // Portable version.
// FindLSBSetNonZero64() is defined in terms of FindLSBSetNonZero().
inline int Bits::FindLSBSetNonZero64(uint64 n) {
inline int Bits::FindLSBSetNonZero64(uint64_t n) {
assert(n != 0);
const uint32 bottombits = static_cast<uint32>(n);
const uint32_t bottombits = static_cast<uint32_t>(n);
if (bottombits == 0) {
// Bottom bits are zero, so scan in top bits
return 32 + FindLSBSetNonZero(static_cast<uint32>(n >> 32));
// Bottom bits are zero, so scan the top bits.
return 32 + FindLSBSetNonZero(static_cast<uint32_t>(n >> 32));
} else {
return FindLSBSetNonZero(bottombits);
}
}
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
#endif // End portable versions.
#endif // End portable version.
// Variable-length integer encoding.
class Varint {
public:
// Maximum lengths of varint encoding of uint32.
// Maximum lengths of varint encoding of uint32_t.
static const int kMax32 = 5;
// Attempts to parse a varint32 from a prefix of the bytes in [ptr,limit-1].
@ -516,23 +401,23 @@ class Varint {
// past the last byte of the varint32. Else returns NULL. On success,
// "result <= limit".
static const char* Parse32WithLimit(const char* ptr, const char* limit,
uint32* OUTPUT);
uint32_t* OUTPUT);
// REQUIRES "ptr" points to a buffer of length sufficient to hold "v".
// EFFECTS Encodes "v" into "ptr" and returns a pointer to the
// byte just past the last encoded byte.
static char* Encode32(char* ptr, uint32 v);
static char* Encode32(char* ptr, uint32_t v);
// EFFECTS Appends the varint representation of "value" to "*s".
static void Append32(std::string* s, uint32 value);
static void Append32(std::string* s, uint32_t value);
};
inline const char* Varint::Parse32WithLimit(const char* p,
const char* l,
uint32* OUTPUT) {
uint32_t* OUTPUT) {
const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);
const unsigned char* limit = reinterpret_cast<const unsigned char*>(l);
uint32 b, result;
uint32_t b, result;
if (ptr >= limit) return NULL;
b = *(ptr++); result = b & 127; if (b < 128) goto done;
if (ptr >= limit) return NULL;
@ -549,30 +434,30 @@ inline const char* Varint::Parse32WithLimit(const char* p,
return reinterpret_cast<const char*>(ptr);
}
inline char* Varint::Encode32(char* sptr, uint32 v) {
inline char* Varint::Encode32(char* sptr, uint32_t v) {
// Operate on characters as unsigneds
unsigned char* ptr = reinterpret_cast<unsigned char*>(sptr);
static const int B = 128;
if (v < (1<<7)) {
*(ptr++) = v;
} else if (v < (1<<14)) {
*(ptr++) = v | B;
*(ptr++) = v>>7;
} else if (v < (1<<21)) {
*(ptr++) = v | B;
*(ptr++) = (v>>7) | B;
*(ptr++) = v>>14;
} else if (v < (1<<28)) {
*(ptr++) = v | B;
*(ptr++) = (v>>7) | B;
*(ptr++) = (v>>14) | B;
*(ptr++) = v>>21;
uint8_t* ptr = reinterpret_cast<uint8_t*>(sptr);
static const uint8_t B = 128;
if (v < (1 << 7)) {
*(ptr++) = static_cast<uint8_t>(v);
} else if (v < (1 << 14)) {
*(ptr++) = static_cast<uint8_t>(v | B);
*(ptr++) = static_cast<uint8_t>(v >> 7);
} else if (v < (1 << 21)) {
*(ptr++) = static_cast<uint8_t>(v | B);
*(ptr++) = static_cast<uint8_t>((v >> 7) | B);
*(ptr++) = static_cast<uint8_t>(v >> 14);
} else if (v < (1 << 28)) {
*(ptr++) = static_cast<uint8_t>(v | B);
*(ptr++) = static_cast<uint8_t>((v >> 7) | B);
*(ptr++) = static_cast<uint8_t>((v >> 14) | B);
*(ptr++) = static_cast<uint8_t>(v >> 21);
} else {
*(ptr++) = v | B;
*(ptr++) = (v>>7) | B;
*(ptr++) = (v>>14) | B;
*(ptr++) = (v>>21) | B;
*(ptr++) = v>>28;
*(ptr++) = static_cast<uint8_t>(v | B);
*(ptr++) = static_cast<uint8_t>((v>>7) | B);
*(ptr++) = static_cast<uint8_t>((v>>14) | B);
*(ptr++) = static_cast<uint8_t>((v>>21) | B);
*(ptr++) = static_cast<uint8_t>(v >> 28);
}
return reinterpret_cast<char*>(ptr);
}

View file

@ -35,31 +35,28 @@
#ifndef THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
#define THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
#include <string>
#include <cstddef>
#if 0 // HAVE_SYS_UIO_H
#include <sys/uio.h>
#endif // HAVE_SYS_UIO_H
#define SNAPPY_MAJOR 1
#define SNAPPY_MINOR 1
#define SNAPPY_PATCHLEVEL 8
#define SNAPPY_VERSION \
((SNAPPY_MAJOR << 16) | (SNAPPY_MINOR << 8) | SNAPPY_PATCHLEVEL)
namespace snappy {
typedef int8_t int8;
typedef uint8_t uint8;
typedef int16_t int16;
typedef uint16_t uint16;
typedef int32_t int32;
typedef uint32_t uint32;
typedef int64_t int64;
typedef uint64_t uint64;
typedef std::string string;
#ifndef DISALLOW_COPY_AND_ASSIGN
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
TypeName(const TypeName&); \
void operator=(const TypeName&)
#endif
#if !0 // !HAVE_SYS_UIO_H
// Windows does not have an iovec type, yet the concept is universally useful.
// It is simple to define it ourselves, so we put it inside our own namespace.
struct iovec {
void* iov_base;
size_t iov_len;
};
#endif // !HAVE_SYS_UIO_H
} // namespace snappy

File diff suppressed because it is too large Load diff

View file

@ -39,7 +39,9 @@
#ifndef THIRD_PARTY_SNAPPY_SNAPPY_H__
#define THIRD_PARTY_SNAPPY_SNAPPY_H__
#include <cstddef>
#include <stddef.h>
#include <stdint.h>
#include <string>
#include "snappy-stubs-public.h"
@ -63,7 +65,7 @@ namespace snappy {
// Also note that this leaves "*source" in a state that is unsuitable for
// further operations, such as RawUncompress(). You will need to rewind
// or recreate the source yourself before attempting any further calls.
bool GetUncompressedLength(Source* source, uint32* result);
bool GetUncompressedLength(Source* source, uint32_t* result);
// ------------------------------------------------------------------------
// Higher-level string based routines (should be sufficient for most users)