SaveState: Update to Snappy 453942b.

2025-04-02 11:01:50 -04:00 · 2021-02-22 22:24:04 -08:00 · 2021-02-22 22:24:04 -08:00 · 7a9cbdfd6a
commit 7a9cbdfd6a
parent 39198b53a5
8 changed files with 1276 additions and 757 deletions
--- a/ext/snappy/snappy-internal.h
+++ b/ext/snappy/snappy-internal.h
@ -46,16 +46,16 @@ class WorkingMemory {
  // Allocates and clears a hash table using memory in "*this",
  // stores the number of buckets in "*table_size" and returns a pointer to
  // the base of the hash table.
-  uint16* GetHashTable(size_t fragment_size, int* table_size) const;
+  uint16_t* GetHashTable(size_t fragment_size, int* table_size) const;
  char* GetScratchInput() const { return input_; }
  char* GetScratchOutput() const { return output_; }

 private:
-  char* mem_;      // the allocated memory, never nullptr
-  size_t size_;    // the size of the allocated memory, never 0
-  uint16* table_;  // the pointer to the hashtable
-  char* input_;    // the pointer to the input scratch buffer
-  char* output_;   // the pointer to the output scratch buffer
+  char* mem_;        // the allocated memory, never nullptr
+  size_t size_;      // the size of the allocated memory, never 0
+  uint16_t* table_;  // the pointer to the hashtable
+  char* input_;      // the pointer to the input scratch buffer
+  char* output_;     // the pointer to the output scratch buffer

  // No copying
  WorkingMemory(const WorkingMemory&);
@ -76,7 +76,7 @@ class WorkingMemory {
 char* CompressFragment(const char* input,
                       size_t input_length,
                       char* op,
-                       uint16* table,
+                       uint16_t* table,
                       const int table_size);

 // Find the largest n such that
@ -89,12 +89,18 @@ char* CompressFragment(const char* input,
 // Does not read *(s1 + (s2_limit - s2)) or beyond.
 // Requires that s2_limit >= s2.
 //
+// In addition populate *data with the next 5 bytes from the end of the match.
+// This is only done if 8 bytes are available (s2_limit - s2 >= 8). The point is
+// that on some arch's this can be done faster in this routine than subsequent
+// loading from s2 + n.
+//
 // Separate implementation for 64-bit, little-endian cpus.
 #if !defined(SNAPPY_IS_BIG_ENDIAN) && \
-    (defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM))
+    (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || defined(ARCH_ARM))
 static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
-                                                      const char* s2_limit) {
+                                                      const char* s2_limit,
+                                                      uint64_t* data) {
  assert(s2_limit >= s2);
  size_t matched = 0;

@ -103,12 +109,71 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
  // uncommon code paths that determine, without extra effort, whether the match
  // length is less than 8.  In short, we are hoping to avoid a conditional
  // branch, and perhaps get better code layout from the C++ compiler.
-  if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 8)) {
-    uint64 a1 = UNALIGNED_LOAD64(s1);
-    uint64 a2 = UNALIGNED_LOAD64(s2);
-    if (a1 != a2) {
-      return std::pair<size_t, bool>(Bits::FindLSBSetNonZero64(a1 ^ a2) >> 3,
-                                     true);
+  if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
+    uint64_t a1 = UNALIGNED_LOAD64(s1);
+    uint64_t a2 = UNALIGNED_LOAD64(s2);
+    if (SNAPPY_PREDICT_TRUE(a1 != a2)) {
+      // This code is critical for performance. The reason is that it determines
+      // how much to advance `ip` (s2). This obviously depends on both the loads
+      // from the `candidate` (s1) and `ip`. Furthermore the next `candidate`
+      // depends on the advanced `ip` calculated here through a load, hash and
+      // new candidate hash lookup (a lot of cycles). This makes s1 (ie.
+      // `candidate`) the variable that limits throughput. This is the reason we
+      // go through hoops to have this function update `data` for the next iter.
+      // The straightforward code would use *data, given by
+      //
+      // *data = UNALIGNED_LOAD64(s2 + matched_bytes) (Latency of 5 cycles),
+      //
+      // as input for the hash table lookup to find next candidate. However
+      // this forces the load on the data dependency chain of s1, because
+      // matched_bytes directly depends on s1. However matched_bytes is 0..7, so
+      // we can also calculate *data by
+      //
+      // *data = AlignRight(UNALIGNED_LOAD64(s2), UNALIGNED_LOAD64(s2 + 8),
+      //                    matched_bytes);
+      //
+      // The loads do not depend on s1 anymore and are thus off the bottleneck.
+      // The straightforward implementation on x86_64 would be to use
+      //
+      // shrd rax, rdx, cl  (cl being matched_bytes * 8)
+      //
+      // unfortunately shrd with a variable shift has a 4 cycle latency. So this
+      // only wins 1 cycle. The BMI2 shrx instruction is a 1 cycle variable
+      // shift instruction but can only shift 64 bits. If we focus on just
+      // obtaining the least significant 4 bytes, we can obtain this by
+      //
+      // *data = ConditionalMove(matched_bytes < 4, UNALIGNED_LOAD64(s2),
+      //     UNALIGNED_LOAD64(s2 + 4) >> ((matched_bytes & 3) * 8);
+      //
+      // Writen like above this is not a big win, the conditional move would be
+      // a cmp followed by a cmov (2 cycles) followed by a shift (1 cycle).
+      // However matched_bytes < 4 is equal to
+      // static_cast<uint32_t>(xorval) != 0. Writen that way, the conditional
+      // move (2 cycles) can execute in parallel with FindLSBSetNonZero64
+      // (tzcnt), which takes 3 cycles.
+      uint64_t xorval = a1 ^ a2;
+      int shift = Bits::FindLSBSetNonZero64(xorval);
+      size_t matched_bytes = shift >> 3;
+#ifndef __x86_64__
+      *data = UNALIGNED_LOAD64(s2 + matched_bytes);
+#else
+      // Ideally this would just be
+      //
+      // a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
+      //
+      // However clang correctly infers that the above statement participates on
+      // a critical data dependency chain and thus, unfortunately, refuses to
+      // use a conditional move (it's tuned to cut data dependencies). In this
+      // case there is a longer parallel chain anyway AND this will be fairly
+      // unpredictable.
+      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
+      asm("testl %k2, %k2\n\t"
+          "cmovzq %1, %0\n\t"
+          : "+r"(a2)
+          : "r"(a3), "r"(xorval));
+      *data = a2 >> (shift & (3 * 8));
+#endif
+      return std::pair<size_t, bool>(matched_bytes, true);
    } else {
      matched = 8;
      s2 += 8;
@ -119,14 +184,27 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
  // time until we find a 64-bit block that doesn't match; then we find
  // the first non-matching bit and use that to calculate the total
  // length of the match.
-  while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 8)) {
-    if (UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched)) {
+  while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
+    uint64_t a1 = UNALIGNED_LOAD64(s1 + matched);
+    uint64_t a2 = UNALIGNED_LOAD64(s2);
+    if (a1 == a2) {
      s2 += 8;
      matched += 8;
    } else {
-      uint64 x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched);
-      int matching_bits = Bits::FindLSBSetNonZero64(x);
-      matched += matching_bits >> 3;
+      uint64_t xorval = a1 ^ a2;
+      int shift = Bits::FindLSBSetNonZero64(xorval);
+      size_t matched_bytes = shift >> 3;
+#ifndef __x86_64__
+      *data = UNALIGNED_LOAD64(s2 + matched_bytes);
+#else
+      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
+      asm("testl %k2, %k2\n\t"
+          "cmovzq %1, %0\n\t"
+          : "+r"(a2)
+          : "r"(a3), "r"(xorval));
+      *data = a2 >> (shift & (3 * 8));
+#endif
+      matched += matched_bytes;
      assert(matched >= 8);
      return std::pair<size_t, bool>(matched, false);
    }
@ -136,6 +214,9 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
      ++s2;
      ++matched;
    } else {
+      if (s2 <= s2_limit - 8) {
+        *data = UNALIGNED_LOAD64(s2);
+      }
      return std::pair<size_t, bool>(matched, matched < 8);
    }
  }
@ -144,7 +225,8 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
 #else
 static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
-                                                      const char* s2_limit) {
+                                                      const char* s2_limit,
+                                                      uint64_t* data) {
  // Implementation based on the x86-64 version, above.
  assert(s2_limit >= s2);
  int matched = 0;
@ -155,15 +237,17 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
    matched += 4;
  }
  if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 4) {
-    uint32 x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
+    uint32_t x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
    int matching_bits = Bits::FindLSBSetNonZero(x);
    matched += matching_bits >> 3;
+    s2 += matching_bits >> 3;
  } else {
    while ((s2 < s2_limit) && (s1[matched] == *s2)) {
      ++s2;
      ++matched;
    }
  }
+  if (s2 <= s2_limit - 8) *data = LittleEndian::Load64(s2);
  return std::pair<size_t, bool>(matched, matched < 8);
 }
 #endif
@ -190,7 +274,8 @@ static const int kMaximumTagLength = 5;  // COPY_4_BYTE_OFFSET plus the actual o
 // because of efficiency reasons:
 //      (1) Extracting a byte is faster than a bit-field
 //      (2) It properly aligns copy offset so we do not need a <<8
-static const uint16 char_table[256] = {
+static constexpr uint16_t char_table[256] = {
+    // clang-format off
  0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
  0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
  0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
@ -222,7 +307,8 @@ static const uint16 char_table[256] = {
  0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
  0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
  0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
-  0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
+  0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040,
+    // clang-format on
 };

 }  // end namespace internal
--- a/ext/snappy/snappy-sinksource.cpp
+++ b/ext/snappy/snappy-sinksource.cpp
@ -26,23 +26,31 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-#include <string.h>
+#include <stddef.h>
+#include <cstring>

 #include "snappy-sinksource.h"

 namespace snappy {

-Source::~Source() { }
+Source::~Source() = default;

-Sink::~Sink() { }
+Sink::~Sink() = default;

 char* Sink::GetAppendBuffer(size_t length, char* scratch) {
+  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
+  (void)length;
+
  return scratch;
 }

 char* Sink::GetAppendBufferVariable(
      size_t min_size, size_t desired_size_hint, char* scratch,
      size_t scratch_size, size_t* allocated_size) {
+  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
+  (void)min_size;
+  (void)desired_size_hint;
+
  *allocated_size = scratch_size;
  return scratch;
 }
@ -55,7 +63,7 @@ void Sink::AppendAndTakeOwnership(
  (*deleter)(deleter_arg, bytes, n);
 }

-ByteArraySource::~ByteArraySource() { }
+ByteArraySource::~ByteArraySource() = default;

 size_t ByteArraySource::Available() const { return left_; }

@ -74,22 +82,26 @@ UncheckedByteArraySink::~UncheckedByteArraySink() { }
 void UncheckedByteArraySink::Append(const char* data, size_t n) {
  // Do no copying if the caller filled in the result of GetAppendBuffer()
  if (data != dest_) {
-    memcpy(dest_, data, n);
+    std::memcpy(dest_, data, n);
  }
  dest_ += n;
 }

 char* UncheckedByteArraySink::GetAppendBuffer(size_t len, char* scratch) {
+  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
+  (void)len;
+  (void)scratch;
+
  return dest_;
 }

 void UncheckedByteArraySink::AppendAndTakeOwnership(
-    char* data, size_t n,
+    char* bytes, size_t n,
    void (*deleter)(void*, const char*, size_t),
    void *deleter_arg) {
-  if (data != dest_) {
-    memcpy(dest_, data, n);
-    (*deleter)(deleter_arg, data, n);
+  if (bytes != dest_) {
+    std::memcpy(dest_, bytes, n);
+    (*deleter)(deleter_arg, bytes, n);
  }
  dest_ += n;
 }
@ -97,6 +109,11 @@ void UncheckedByteArraySink::AppendAndTakeOwnership(
 char* UncheckedByteArraySink::GetAppendBufferVariable(
      size_t min_size, size_t desired_size_hint, char* scratch,
      size_t scratch_size, size_t* allocated_size) {
+  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
+  (void)min_size;
+  (void)scratch;
+  (void)scratch_size;
+
  *allocated_size = desired_size_hint;
  return dest_;
 }
--- a/ext/snappy/snappy-sinksource.h
+++ b/ext/snappy/snappy-sinksource.h
@ -146,10 +146,10 @@ class Source {
 class ByteArraySource : public Source {
 public:
  ByteArraySource(const char* p, size_t n) : ptr_(p), left_(n) { }
-  virtual ~ByteArraySource();
-  virtual size_t Available() const;
-  virtual const char* Peek(size_t* len);
-  virtual void Skip(size_t n);
+  ~ByteArraySource() override;
+  size_t Available() const override;
+  const char* Peek(size_t* len) override;
+  void Skip(size_t n) override;
 private:
  const char* ptr_;
  size_t left_;
@ -159,15 +159,15 @@ class ByteArraySource : public Source {
 class UncheckedByteArraySink : public Sink {
 public:
  explicit UncheckedByteArraySink(char* dest) : dest_(dest) { }
-  virtual ~UncheckedByteArraySink();
-  virtual void Append(const char* data, size_t n);
-  virtual char* GetAppendBuffer(size_t len, char* scratch);
-  virtual char* GetAppendBufferVariable(
+  ~UncheckedByteArraySink() override;
+  void Append(const char* data, size_t n) override;
+  char* GetAppendBuffer(size_t len, char* scratch) override;
+  char* GetAppendBufferVariable(
      size_t min_size, size_t desired_size_hint, char* scratch,
-      size_t scratch_size, size_t* allocated_size);
-  virtual void AppendAndTakeOwnership(
+      size_t scratch_size, size_t* allocated_size) override;
+  void AppendAndTakeOwnership(
      char* bytes, size_t n, void (*deleter)(void*, const char*, size_t),
-      void *deleter_arg);
+      void *deleter_arg) override;

  // Return the current output pointer so that a caller can see how
  // many bytes were produced.
--- a/ext/snappy/snappy-stubs-internal.cpp
+++ b/ext/snappy/snappy-stubs-internal.cpp
@ -33,7 +33,7 @@

 namespace snappy {

-void Varint::Append32(std::string* s, uint32 value) {
+void Varint::Append32(std::string* s, uint32_t value) {
  char buf[Varint::kMax32];
  const char* p = Varint::Encode32(buf, value);
  s->append(buf, p - buf);
--- a/ext/snappy/snappy-stubs-internal.h
+++ b/ext/snappy/snappy-stubs-internal.h
@ -35,11 +35,13 @@
 #include "config.h"
 #endif

-#include <string>
+#include <stdint.h>

-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <string>

 #ifdef HAVE_SYS_MMAN_H
 #include <sys/mman.h>
@ -67,19 +69,11 @@

 #include "snappy-stubs-public.h"

-#if defined(__x86_64__)
-
-// Enable 64-bit optimized versions of some routines.
-#define ARCH_K8 1
-
-#elif defined(__ppc64__)
-
+// Used to enable 64-bit optimized versions of some routines.
+#if defined(__PPC64__) || defined(__powerpc64__)
 #define ARCH_PPC 1
-
-#elif defined(__aarch64__)
-
+#elif defined(__aarch64__) || defined(_M_ARM64)
 #define ARCH_ARM 1
-
 #endif

 // Needed by OS X, among others.
@ -93,7 +87,7 @@
 #ifdef ARRAYSIZE
 #undef ARRAYSIZE
 #endif
-#define ARRAYSIZE(a) (sizeof(a) / sizeof(*(a)))
+#define ARRAYSIZE(a) int{sizeof(a) / sizeof(*(a))}

 // Static prediction hints.
 #ifdef HAVE_BUILTIN_EXPECT
@ -104,6 +98,13 @@
 #define SNAPPY_PREDICT_TRUE(x) x
 #endif

+// Inlining hints.
+#ifdef HAVE_ATTRIBUTE_ALWAYS_INLINE
+#define SNAPPY_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
+#else
+#define SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+#endif
+
 // This is only used for recomputing the tag byte table used during
 // decompression; for simplicity we just remove it from the open-source
 // version (anyone who wants to regenerate it can just do the call
@ -115,201 +116,47 @@

 namespace snappy {

-static const uint32 kuint32max = static_cast<uint32>(0xFFFFFFFF);
-static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);
+static const uint32_t kuint32max = std::numeric_limits<uint32_t>::max();
+static const int64_t kint64max = std::numeric_limits<int64_t>::max();

 // Potentially unaligned loads and stores.

-// x86, PowerPC, and ARM64 can simply do these loads and stores native.
-
-#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || \
-    defined(__aarch64__)
-
-#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
-#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
-#define UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64 *>(_p))
-
-#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
-#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
-#define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val))
-
-// ARMv7 and newer support native unaligned accesses, but only of 16-bit
-// and 32-bit values (not 64-bit); older versions either raise a fatal signal,
-// do an unaligned read and rotate the words around a bit, or do the reads very
-// slowly (trip through kernel mode). There's no simple #define that says just
-// "ARMv7 or higher", so we have to filter away all ARMv5 and ARMv6
-// sub-architectures.
-//
-// This is a mess, but there's not much we can do about it.
-//
-// To further complicate matters, only LDR instructions (single reads) are
-// allowed to be unaligned, not LDRD (two reads) or LDM (many reads). Unless we
-// explicitly tell the compiler that these accesses can be unaligned, it can and
-// will combine accesses. On armcc, the way to signal this is done by accessing
-// through the type (uint32 __packed *), but GCC has no such attribute
-// (it ignores __attribute__((packed)) on individual variables). However,
-// we can tell it that a _struct_ is unaligned, which has the same effect,
-// so we do that.
-
-#elif defined(__arm__) && \
-      !defined(__ARM_ARCH_4__) && \
-      !defined(__ARM_ARCH_4T__) && \
-      !defined(__ARM_ARCH_5__) && \
-      !defined(__ARM_ARCH_5T__) && \
-      !defined(__ARM_ARCH_5TE__) && \
-      !defined(__ARM_ARCH_5TEJ__) && \
-      !defined(__ARM_ARCH_6__) && \
-      !defined(__ARM_ARCH_6J__) && \
-      !defined(__ARM_ARCH_6K__) && \
-      !defined(__ARM_ARCH_6Z__) && \
-      !defined(__ARM_ARCH_6ZK__) && \
-      !defined(__ARM_ARCH_6T2__)
-
-#if __GNUC__
-#define ATTRIBUTE_PACKED __attribute__((__packed__))
-#else
-#define ATTRIBUTE_PACKED
-#endif
-
-namespace base {
-namespace internal {
-
-struct Unaligned16Struct {
-  uint16 value;
-  uint8 dummy;  // To make the size non-power-of-two.
-} ATTRIBUTE_PACKED;
-
-struct Unaligned32Struct {
-  uint32 value;
-  uint8 dummy;  // To make the size non-power-of-two.
-} ATTRIBUTE_PACKED;
-
-}  // namespace internal
-}  // namespace base
-
-#define UNALIGNED_LOAD16(_p) \
-    ((reinterpret_cast<const ::snappy::base::internal::Unaligned16Struct *>(_p))->value)
-#define UNALIGNED_LOAD32(_p) \
-    ((reinterpret_cast<const ::snappy::base::internal::Unaligned32Struct *>(_p))->value)
-
-#define UNALIGNED_STORE16(_p, _val) \
-    ((reinterpret_cast< ::snappy::base::internal::Unaligned16Struct *>(_p))->value = \
-         (_val))
-#define UNALIGNED_STORE32(_p, _val) \
-    ((reinterpret_cast< ::snappy::base::internal::Unaligned32Struct *>(_p))->value = \
-         (_val))
-
-// TODO: NEON supports unaligned 64-bit loads and stores.
-// See if that would be more efficient on platforms supporting it,
-// at least for copies.
-
-inline uint64 UNALIGNED_LOAD64(const void *p) {
-  uint64 t;
-  memcpy(&t, p, sizeof t);
-  return t;
+inline uint16_t UNALIGNED_LOAD16(const void *p) {
+  // Compiles to a single movzx/ldrh on clang/gcc/msvc.
+  uint16_t v;
+  std::memcpy(&v, p, sizeof(v));
+  return v;
 }

-inline void UNALIGNED_STORE64(void *p, uint64 v) {
-  memcpy(p, &v, sizeof v);
+inline uint32_t UNALIGNED_LOAD32(const void *p) {
+  // Compiles to a single mov/ldr on clang/gcc/msvc.
+  uint32_t v;
+  std::memcpy(&v, p, sizeof(v));
+  return v;
 }

-#else
-
-// These functions are provided for architectures that don't support
-// unaligned loads and stores.
-
-inline uint16 UNALIGNED_LOAD16(const void *p) {
-  uint16 t;
-  memcpy(&t, p, sizeof t);
-  return t;
+inline uint64_t UNALIGNED_LOAD64(const void *p) {
+  // Compiles to a single mov/ldr on clang/gcc/msvc.
+  uint64_t v;
+  std::memcpy(&v, p, sizeof(v));
+  return v;
 }

-inline uint32 UNALIGNED_LOAD32(const void *p) {
-  uint32 t;
-  memcpy(&t, p, sizeof t);
-  return t;
+inline void UNALIGNED_STORE16(void *p, uint16_t v) {
+  // Compiles to a single mov/strh on clang/gcc/msvc.
+  std::memcpy(p, &v, sizeof(v));
 }

-inline uint64 UNALIGNED_LOAD64(const void *p) {
-  uint64 t;
-  memcpy(&t, p, sizeof t);
-  return t;
+inline void UNALIGNED_STORE32(void *p, uint32_t v) {
+  // Compiles to a single mov/str on clang/gcc/msvc.
+  std::memcpy(p, &v, sizeof(v));
 }

-inline void UNALIGNED_STORE16(void *p, uint16 v) {
-  memcpy(p, &v, sizeof v);
+inline void UNALIGNED_STORE64(void *p, uint64_t v) {
+  // Compiles to a single mov/str on clang/gcc/msvc.
+  std::memcpy(p, &v, sizeof(v));
 }

-inline void UNALIGNED_STORE32(void *p, uint32 v) {
-  memcpy(p, &v, sizeof v);
-}
-
-inline void UNALIGNED_STORE64(void *p, uint64 v) {
-  memcpy(p, &v, sizeof v);
-}
-
-#endif
-
-// The following guarantees declaration of the byte swap functions.
-#if defined(SNAPPY_IS_BIG_ENDIAN)
-
-#ifdef HAVE_SYS_BYTEORDER_H
-#include <sys/byteorder.h>
-#endif
-
-#ifdef HAVE_SYS_ENDIAN_H
-#include <sys/endian.h>
-#endif
-
-#ifdef _MSC_VER
-#include <stdlib.h>
-#define bswap_16(x) _byteswap_ushort(x)
-#define bswap_32(x) _byteswap_ulong(x)
-#define bswap_64(x) _byteswap_uint64(x)
-
-#elif defined(__APPLE__)
-// Mac OS X / Darwin features
-#include <libkern/OSByteOrder.h>
-#define bswap_16(x) OSSwapInt16(x)
-#define bswap_32(x) OSSwapInt32(x)
-#define bswap_64(x) OSSwapInt64(x)
-
-#elif defined(HAVE_BYTESWAP_H)
-#include <byteswap.h>
-
-#elif defined(bswap32)
-// FreeBSD defines bswap{16,32,64} in <sys/endian.h> (already #included).
-#define bswap_16(x) bswap16(x)
-#define bswap_32(x) bswap32(x)
-#define bswap_64(x) bswap64(x)
-
-#elif defined(BSWAP_64)
-// Solaris 10 defines BSWAP_{16,32,64} in <sys/byteorder.h> (already #included).
-#define bswap_16(x) BSWAP_16(x)
-#define bswap_32(x) BSWAP_32(x)
-#define bswap_64(x) BSWAP_64(x)
-
-#else
-
-inline uint16 bswap_16(uint16 x) {
-  return (x << 8) | (x >> 8);
-}
-
-inline uint32 bswap_32(uint32 x) {
-  x = ((x & 0xff00ff00UL) >> 8) | ((x & 0x00ff00ffUL) << 8);
-  return (x >> 16) | (x << 16);
-}
-
-inline uint64 bswap_64(uint64 x) {
-  x = ((x & 0xff00ff00ff00ff00ULL) >> 8) | ((x & 0x00ff00ff00ff00ffULL) << 8);
-  x = ((x & 0xffff0000ffff0000ULL) >> 16) | ((x & 0x0000ffff0000ffffULL) << 16);
-  return (x >> 32) | (x << 32);
-}
-
-#endif
-
-#endif  // defined(SNAPPY_IS_BIG_ENDIAN)
-
 // Convert to little-endian storage, opposite of network format.
 // Convert x from host to little endian: x = LittleEndian.FromHost(x);
 // convert x from little endian to host: x = LittleEndian.ToHost(x);
@ -321,44 +168,77 @@ inline uint64 bswap_64(uint64 x) {
 //    x = LittleEndian.Load16(p);
 class LittleEndian {
 public:
-  // Conversion functions.
-#if defined(SNAPPY_IS_BIG_ENDIAN)
-
-  static uint16 FromHost16(uint16 x) { return bswap_16(x); }
-  static uint16 ToHost16(uint16 x) { return bswap_16(x); }
-
-  static uint32 FromHost32(uint32 x) { return bswap_32(x); }
-  static uint32 ToHost32(uint32 x) { return bswap_32(x); }
-
-  static bool IsLittleEndian() { return false; }
-
-#else  // !defined(SNAPPY_IS_BIG_ENDIAN)
-
-  static uint16 FromHost16(uint16 x) { return x; }
-  static uint16 ToHost16(uint16 x) { return x; }
-
-  static uint32 FromHost32(uint32 x) { return x; }
-  static uint32 ToHost32(uint32 x) { return x; }
-
-  static bool IsLittleEndian() { return true; }
-
-#endif  // !defined(SNAPPY_IS_BIG_ENDIAN)
-
  // Functions to do unaligned loads and stores in little-endian order.
-  static uint16 Load16(const void *p) {
-    return ToHost16(UNALIGNED_LOAD16(p));
+  static inline uint16_t Load16(const void *ptr) {
+    const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
+
+    // Compiles to a single mov/str on recent clang and gcc.
+    return (static_cast<uint16_t>(buffer[0])) |
+            (static_cast<uint16_t>(buffer[1]) << 8);
  }

-  static void Store16(void *p, uint16 v) {
-    UNALIGNED_STORE16(p, FromHost16(v));
+  static inline uint32_t Load32(const void *ptr) {
+    const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
+
+    // Compiles to a single mov/str on recent clang and gcc.
+    return (static_cast<uint32_t>(buffer[0])) |
+            (static_cast<uint32_t>(buffer[1]) << 8) |
+            (static_cast<uint32_t>(buffer[2]) << 16) |
+            (static_cast<uint32_t>(buffer[3]) << 24);
  }

-  static uint32 Load32(const void *p) {
-    return ToHost32(UNALIGNED_LOAD32(p));
+  static inline uint64_t Load64(const void *ptr) {
+    const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
+
+    // Compiles to a single mov/str on recent clang and gcc.
+    return (static_cast<uint64_t>(buffer[0])) |
+            (static_cast<uint64_t>(buffer[1]) << 8) |
+            (static_cast<uint64_t>(buffer[2]) << 16) |
+            (static_cast<uint64_t>(buffer[3]) << 24) |
+            (static_cast<uint64_t>(buffer[4]) << 32) |
+            (static_cast<uint64_t>(buffer[5]) << 40) |
+            (static_cast<uint64_t>(buffer[6]) << 48) |
+            (static_cast<uint64_t>(buffer[7]) << 56);
  }

-  static void Store32(void *p, uint32 v) {
-    UNALIGNED_STORE32(p, FromHost32(v));
+  static inline void Store16(void *dst, uint16_t value) {
+    uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
+
+    // Compiles to a single mov/str on recent clang and gcc.
+    buffer[0] = static_cast<uint8_t>(value);
+    buffer[1] = static_cast<uint8_t>(value >> 8);
+  }
+
+  static void Store32(void *dst, uint32_t value) {
+    uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
+
+    // Compiles to a single mov/str on recent clang and gcc.
+    buffer[0] = static_cast<uint8_t>(value);
+    buffer[1] = static_cast<uint8_t>(value >> 8);
+    buffer[2] = static_cast<uint8_t>(value >> 16);
+    buffer[3] = static_cast<uint8_t>(value >> 24);
+  }
+
+  static void Store64(void* dst, uint64_t value) {
+    uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
+
+    // Compiles to a single mov/str on recent clang and gcc.
+    buffer[0] = static_cast<uint8_t>(value);
+    buffer[1] = static_cast<uint8_t>(value >> 8);
+    buffer[2] = static_cast<uint8_t>(value >> 16);
+    buffer[3] = static_cast<uint8_t>(value >> 24);
+    buffer[4] = static_cast<uint8_t>(value >> 32);
+    buffer[5] = static_cast<uint8_t>(value >> 40);
+    buffer[6] = static_cast<uint8_t>(value >> 48);
+    buffer[7] = static_cast<uint8_t>(value >> 56);
+  }
+
+  static inline constexpr bool IsLittleEndian() {
+#if defined(SNAPPY_IS_BIG_ENDIAN)
+    return false;
+#else
+    return true;
+#endif  // defined(SNAPPY_IS_BIG_ENDIAN)
  }
 };

@ -366,19 +246,17 @@ class LittleEndian {
 class Bits {
 public:
  // Return floor(log2(n)) for positive integer n.
-  static int Log2FloorNonZero(uint32 n);
+  static int Log2FloorNonZero(uint32_t n);

  // Return floor(log2(n)) for positive integer n.  Returns -1 iff n == 0.
-  static int Log2Floor(uint32 n);
+  static int Log2Floor(uint32_t n);

  // Return the first set least / most significant bit, 0-indexed.  Returns an
  // undefined value if n == 0.  FindLSBSetNonZero() is similar to ffs() except
  // that it's 0-indexed.
-  static int FindLSBSetNonZero(uint32 n);
+  static int FindLSBSetNonZero(uint32_t n);

-#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
-  static int FindLSBSetNonZero64(uint64 n);
-#endif  // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
+  static int FindLSBSetNonZero64(uint64_t n);

 private:
  // No copying
@ -386,9 +264,9 @@ class Bits {
  void operator=(const Bits&);
 };

-#ifdef HAVE_BUILTIN_CTZ
+#if defined(HAVE_BUILTIN_CTZ)

-inline int Bits::Log2FloorNonZero(uint32 n) {
+inline int Bits::Log2FloorNonZero(uint32_t n) {
  assert(n != 0);
  // (31 ^ x) is equivalent to (31 - x) for x in [0, 31]. An easy proof
  // represents subtraction in base 2 and observes that there's no carry.
@ -399,66 +277,52 @@ inline int Bits::Log2FloorNonZero(uint32 n) {
  return 31 ^ __builtin_clz(n);
 }

-inline int Bits::Log2Floor(uint32 n) {
+inline int Bits::Log2Floor(uint32_t n) {
  return (n == 0) ? -1 : Bits::Log2FloorNonZero(n);
 }

-inline int Bits::FindLSBSetNonZero(uint32 n) {
+inline int Bits::FindLSBSetNonZero(uint32_t n) {
  assert(n != 0);
  return __builtin_ctz(n);
 }

-#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
-inline int Bits::FindLSBSetNonZero64(uint64 n) {
-  assert(n != 0);
-  return __builtin_ctzll(n);
-}
-#endif  // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
-
 #elif defined(_MSC_VER)

-inline int Bits::Log2FloorNonZero(uint32 n) {
+inline int Bits::Log2FloorNonZero(uint32_t n) {
  assert(n != 0);
+  // NOLINTNEXTLINE(runtime/int): The MSVC intrinsic demands unsigned long.
  unsigned long where;
  _BitScanReverse(&where, n);
  return static_cast<int>(where);
 }

-inline int Bits::Log2Floor(uint32 n) {
+inline int Bits::Log2Floor(uint32_t n) {
+  // NOLINTNEXTLINE(runtime/int): The MSVC intrinsic demands unsigned long.
  unsigned long where;
  if (_BitScanReverse(&where, n))
    return static_cast<int>(where);
  return -1;
 }

-inline int Bits::FindLSBSetNonZero(uint32 n) {
+inline int Bits::FindLSBSetNonZero(uint32_t n) {
  assert(n != 0);
+  // NOLINTNEXTLINE(runtime/int): The MSVC intrinsic demands unsigned long.
  unsigned long where;
  if (_BitScanForward(&where, n))
    return static_cast<int>(where);
  return 32;
 }

-#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
-inline int Bits::FindLSBSetNonZero64(uint64 n) {
-  assert(n != 0);
-  unsigned long where;
-  if (_BitScanForward64(&where, n))
-    return static_cast<int>(where);
-  return 64;
-}
-#endif  // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
-
 #else  // Portable versions.

-inline int Bits::Log2FloorNonZero(uint32 n) {
+inline int Bits::Log2FloorNonZero(uint32_t n) {
  assert(n != 0);

  int log = 0;
-  uint32 value = n;
+  uint32_t value = n;
  for (int i = 4; i >= 0; --i) {
    int shift = (1 << i);
-    uint32 x = value >> shift;
+    uint32_t x = value >> shift;
    if (x != 0) {
      value = x;
      log += shift;
@ -468,16 +332,16 @@ inline int Bits::Log2FloorNonZero(uint32 n) {
  return log;
 }

-inline int Bits::Log2Floor(uint32 n) {
+inline int Bits::Log2Floor(uint32_t n) {
  return (n == 0) ? -1 : Bits::Log2FloorNonZero(n);
 }

-inline int Bits::FindLSBSetNonZero(uint32 n) {
+inline int Bits::FindLSBSetNonZero(uint32_t n) {
  assert(n != 0);

  int rc = 31;
  for (int i = 4, shift = 1 << 4; i >= 0; --i) {
-    const uint32 x = n << shift;
+    const uint32_t x = n << shift;
    if (x != 0) {
      n = x;
      rc -= shift;
@ -487,27 +351,48 @@ inline int Bits::FindLSBSetNonZero(uint32 n) {
  return rc;
 }

-#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
+#endif  // End portable versions.
+
+#if defined(HAVE_BUILTIN_CTZ)
+
+inline int Bits::FindLSBSetNonZero64(uint64_t n) {
+  assert(n != 0);
+  return __builtin_ctzll(n);
+}
+
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
+// _BitScanForward64() is only available on x64 and ARM64.
+
+inline int Bits::FindLSBSetNonZero64(uint64_t n) {
+  assert(n != 0);
+  // NOLINTNEXTLINE(runtime/int): The MSVC intrinsic demands unsigned long.
+  unsigned long where;
+  if (_BitScanForward64(&where, n))
+    return static_cast<int>(where);
+  return 64;
+}
+
+#else  // Portable version.
+
 // FindLSBSetNonZero64() is defined in terms of FindLSBSetNonZero().
-inline int Bits::FindLSBSetNonZero64(uint64 n) {
+inline int Bits::FindLSBSetNonZero64(uint64_t n) {
  assert(n != 0);

-  const uint32 bottombits = static_cast<uint32>(n);
+  const uint32_t bottombits = static_cast<uint32_t>(n);
  if (bottombits == 0) {
-    // Bottom bits are zero, so scan in top bits
-    return 32 + FindLSBSetNonZero(static_cast<uint32>(n >> 32));
+    // Bottom bits are zero, so scan the top bits.
+    return 32 + FindLSBSetNonZero(static_cast<uint32_t>(n >> 32));
  } else {
    return FindLSBSetNonZero(bottombits);
  }
 }
-#endif  // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)

-#endif  // End portable versions.
+#endif  // End portable version.

 // Variable-length integer encoding.
 class Varint {
 public:
-  // Maximum lengths of varint encoding of uint32.
+  // Maximum lengths of varint encoding of uint32_t.
  static const int kMax32 = 5;

  // Attempts to parse a varint32 from a prefix of the bytes in [ptr,limit-1].
@ -516,23 +401,23 @@ class Varint {
  // past the last byte of the varint32. Else returns NULL.  On success,
  // "result <= limit".
  static const char* Parse32WithLimit(const char* ptr, const char* limit,
-                                      uint32* OUTPUT);
+                                      uint32_t* OUTPUT);

  // REQUIRES   "ptr" points to a buffer of length sufficient to hold "v".
  // EFFECTS    Encodes "v" into "ptr" and returns a pointer to the
  //            byte just past the last encoded byte.
-  static char* Encode32(char* ptr, uint32 v);
+  static char* Encode32(char* ptr, uint32_t v);

  // EFFECTS    Appends the varint representation of "value" to "*s".
-  static void Append32(std::string* s, uint32 value);
+  static void Append32(std::string* s, uint32_t value);
 };

 inline const char* Varint::Parse32WithLimit(const char* p,
                                            const char* l,
-                                            uint32* OUTPUT) {
+                                            uint32_t* OUTPUT) {
  const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);
  const unsigned char* limit = reinterpret_cast<const unsigned char*>(l);
-  uint32 b, result;
+  uint32_t b, result;
  if (ptr >= limit) return NULL;
  b = *(ptr++); result = b & 127;          if (b < 128) goto done;
  if (ptr >= limit) return NULL;
@ -549,30 +434,30 @@ inline const char* Varint::Parse32WithLimit(const char* p,
  return reinterpret_cast<const char*>(ptr);
 }

-inline char* Varint::Encode32(char* sptr, uint32 v) {
+inline char* Varint::Encode32(char* sptr, uint32_t v) {
  // Operate on characters as unsigneds
-  unsigned char* ptr = reinterpret_cast<unsigned char*>(sptr);
-  static const int B = 128;
-  if (v < (1<<7)) {
-    *(ptr++) = v;
-  } else if (v < (1<<14)) {
-    *(ptr++) = v | B;
-    *(ptr++) = v>>7;
-  } else if (v < (1<<21)) {
-    *(ptr++) = v | B;
-    *(ptr++) = (v>>7) | B;
-    *(ptr++) = v>>14;
-  } else if (v < (1<<28)) {
-    *(ptr++) = v | B;
-    *(ptr++) = (v>>7) | B;
-    *(ptr++) = (v>>14) | B;
-    *(ptr++) = v>>21;
+  uint8_t* ptr = reinterpret_cast<uint8_t*>(sptr);
+  static const uint8_t B = 128;
+  if (v < (1 << 7)) {
+    *(ptr++) = static_cast<uint8_t>(v);
+  } else if (v < (1 << 14)) {
+    *(ptr++) = static_cast<uint8_t>(v | B);
+    *(ptr++) = static_cast<uint8_t>(v >> 7);
+  } else if (v < (1 << 21)) {
+    *(ptr++) = static_cast<uint8_t>(v | B);
+    *(ptr++) = static_cast<uint8_t>((v >> 7) | B);
+    *(ptr++) = static_cast<uint8_t>(v >> 14);
+  } else if (v < (1 << 28)) {
+    *(ptr++) = static_cast<uint8_t>(v | B);
+    *(ptr++) = static_cast<uint8_t>((v >> 7) | B);
+    *(ptr++) = static_cast<uint8_t>((v >> 14) | B);
+    *(ptr++) = static_cast<uint8_t>(v >> 21);
  } else {
-    *(ptr++) = v | B;
-    *(ptr++) = (v>>7) | B;
-    *(ptr++) = (v>>14) | B;
-    *(ptr++) = (v>>21) | B;
-    *(ptr++) = v>>28;
+    *(ptr++) = static_cast<uint8_t>(v | B);
+    *(ptr++) = static_cast<uint8_t>((v>>7) | B);
+    *(ptr++) = static_cast<uint8_t>((v>>14) | B);
+    *(ptr++) = static_cast<uint8_t>((v>>21) | B);
+    *(ptr++) = static_cast<uint8_t>(v >> 28);
  }
  return reinterpret_cast<char*>(ptr);
 }
--- a/ext/snappy/snappy-stubs-public.h
+++ b/ext/snappy/snappy-stubs-public.h
@ -35,31 +35,28 @@
 #ifndef THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_
 #define THIRD_PARTY_SNAPPY_OPENSOURCE_SNAPPY_STUBS_PUBLIC_H_

-#include <string>
+#include <cstddef>
+
+#if 0  // HAVE_SYS_UIO_H
+#include <sys/uio.h>
+#endif  // HAVE_SYS_UIO_H
+
+#define SNAPPY_MAJOR 1
+#define SNAPPY_MINOR 1
+#define SNAPPY_PATCHLEVEL 8
+#define SNAPPY_VERSION \
+    ((SNAPPY_MAJOR << 16) | (SNAPPY_MINOR << 8) | SNAPPY_PATCHLEVEL)

 namespace snappy {

-typedef int8_t int8;
-typedef uint8_t uint8;
-typedef int16_t int16;
-typedef uint16_t uint16;
-typedef int32_t int32;
-typedef uint32_t uint32;
-typedef int64_t int64;
-typedef uint64_t uint64;
-
-typedef std::string string;
-
-#ifndef DISALLOW_COPY_AND_ASSIGN
-#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
-  TypeName(const TypeName&);               \
-  void operator=(const TypeName&)
-#endif
-
+#if !0  // !HAVE_SYS_UIO_H
+// Windows does not have an iovec type, yet the concept is universally useful.
+// It is simple to define it ourselves, so we put it inside our own namespace.
 struct iovec {
 	void* iov_base;
 	size_t iov_len;
 };
+#endif  // !HAVE_SYS_UIO_H

 }  // namespace snappy

--- a/ext/snappy/snappy.cpp
+++ b/ext/snappy/snappy.cpp
--- a/ext/snappy/snappy.h
+++ b/ext/snappy/snappy.h
@ -39,7 +39,9 @@
 #ifndef THIRD_PARTY_SNAPPY_SNAPPY_H__
 #define THIRD_PARTY_SNAPPY_SNAPPY_H__

-#include <cstddef>
+#include <stddef.h>
+#include <stdint.h>
+
 #include <string>

 #include "snappy-stubs-public.h"
@ -63,7 +65,7 @@ namespace snappy {
  // Also note that this leaves "*source" in a state that is unsuitable for
  // further operations, such as RawUncompress(). You will need to rewind
  // or recreate the source yourself before attempting any further calls.
-  bool GetUncompressedLength(Source* source, uint32* result);
+  bool GetUncompressedLength(Source* source, uint32_t* result);

  // ------------------------------------------------------------------------
  // Higher-level string based routines (should be sufficient for most users)