Iteration counts should be `uint64_t` globally. (#817)

This is a shameless rip-off of https://github.com/google/benchmark/pull/646
I did promise to look into why that proposed PR was producing
so much worse assembly, and so i finally did.

The reason is - that diff changes `size_t` (unsigned) to `int64_t` (signed).

There is this nice little `assert`:
https://github.com/google/benchmark/blob/7a1c37028359ca9d386d719a6ad527743cf1b753/include/benchmark/benchmark.h#L744
It ensures that we didn't magically decide to advance our iterator
when we should have finished benchmarking.

When `cached_` was unsigned, the `assert` was `cached_ UGT 0`.
But we only ever get to that `assert` if `cached_ NE 0`,
and naturally if `cached_` is not `0`, then it is bigger than `0`,
so the `assert` is tautological, and gets folded away.

But now that `cached_` became signed, the assert became `cached_ SGT 0`.
And we still only know that `cached_ NE 0`, so the assert can't be
optimized out, or at least it doesn't currently.

Regardless of whether or not that is a bug in itself,
that particular diff would have regressed the normal 64-bit systems,
by halving the maximal iteration space (since we go from unsigned counter
to signed one, of the same bit-width), which seems like a bug.
And just so it happens, fixing *this* bug, fixes the other bug.

This produces fully (bit-by-bit) identical state_assembly_test.s
The filecheck change is actually needed regardless of this patch,
else this test does not pass for me even without this diff.
diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h
index 8329368..6cb96f5 100644
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@@ -56,8 +56,7 @@
   memset(src, 'x', state.range(0));
   for (auto _ : state)
     memcpy(dst, src, state.range(0));
-  state.SetBytesProcessed(int64_t(state.iterations()) *
-                          int64_t(state.range(0)));
+  state.SetBytesProcessed(state.iterations() * state.range(0));
   delete[] src; delete[] dst;
 }
 BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
@@ -122,8 +121,7 @@
       q.Wait(&v);
   }
   // actually messages, not bytes:
-  state.SetBytesProcessed(
-      static_cast<int64_t>(state.iterations())*state.range(0));
+  state.SetBytesProcessed(state.iterations() * state.range(0));
 }
 BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
 
@@ -413,9 +411,11 @@
 // calculated automatically to the best fit.
 enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda };
 
+typedef uint64_t IterationCount;
+
 // BigOFunc is passed to a benchmark in order to specify the asymptotic
 // computational complexity for the benchmark.
-typedef double(BigOFunc)(int64_t);
+typedef double(BigOFunc)(IterationCount);
 
 // StatisticsFunc is passed to a benchmark in order to compute some descriptive
 // statistics over all the measurements of some type
@@ -488,7 +488,7 @@
   //   while (state.KeepRunningBatch(1000)) {
   //     // process 1000 elements
   //   }
-  bool KeepRunningBatch(size_t n);
+  bool KeepRunningBatch(IterationCount n);
 
   // REQUIRES: timer is running and 'SkipWithError(...)' has not been called
   //           by the current thread.
@@ -627,7 +627,7 @@
   int64_t range_y() const { return range(1); }
 
   BENCHMARK_ALWAYS_INLINE
-  size_t iterations() const {
+  IterationCount iterations() const {
     if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
       return 0;
     }
@@ -638,15 +638,15 @@
      :  // items we expect on the first cache line (ie 64 bytes of the struct)
   // When total_iterations_ is 0, KeepRunning() and friends will return false.
   // May be larger than max_iterations.
-  size_t total_iterations_;
+  IterationCount total_iterations_;
 
   // When using KeepRunningBatch(), batch_leftover_ holds the number of
   // iterations beyond max_iters that were run. Used to track
   // completed_iterations_ accurately.
-  size_t batch_leftover_;
+  IterationCount batch_leftover_;
 
  public:
-  const size_t max_iterations;
+  const IterationCount max_iterations;
 
  private:
   bool started_;
@@ -667,14 +667,14 @@
   const int threads;
 
  private:
-  State(size_t max_iters, const std::vector<int64_t>& ranges, int thread_i,
-        int n_threads, internal::ThreadTimer* timer,
+  State(IterationCount max_iters, const std::vector<int64_t>& ranges,
+        int thread_i, int n_threads, internal::ThreadTimer* timer,
         internal::ThreadManager* manager);
 
   void StartKeepRunning();
   // Implementation of KeepRunning() and KeepRunningBatch().
   // is_batch must be true unless n is 1.
-  bool KeepRunningInternal(size_t n, bool is_batch);
+  bool KeepRunningInternal(IterationCount n, bool is_batch);
   void FinishKeepRunning();
   internal::ThreadTimer* timer_;
   internal::ThreadManager* manager_;
@@ -686,11 +686,11 @@
   return KeepRunningInternal(1, /*is_batch=*/false);
 }
 
-inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningBatch(size_t n) {
+inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningBatch(IterationCount n) {
   return KeepRunningInternal(n, /*is_batch=*/true);
 }
 
-inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningInternal(size_t n,
+inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningInternal(IterationCount n,
                                                                bool is_batch) {
   // total_iterations_ is set to 0 by the constructor, and always set to a
   // nonzero value by StartKepRunning().
@@ -754,7 +754,7 @@
   }
 
  private:
-  size_t cached_;
+  IterationCount cached_;
   State* const parent_;
 };
 
@@ -858,7 +858,7 @@
   // NOTE: This function should only be used when *exact* iteration control is
   //   needed and never to control or limit how long a benchmark runs, where
   // `--benchmark_min_time=N` or `MinTime(...)` should be used instead.
-  Benchmark* Iterations(size_t n);
+  Benchmark* Iterations(IterationCount n);
 
   // Specify the amount of times to repeat this benchmark. This option overrides
   // the `benchmark_repetitions` flag.
@@ -957,7 +957,7 @@
   TimeUnit time_unit_;
   int range_multiplier_;
   double min_time_;
-  size_t iterations_;
+  IterationCount iterations_;
   int repetitions_;
   bool measure_process_cpu_time_;
   bool use_real_time_;
@@ -1375,7 +1375,7 @@
     bool error_occurred;
     std::string error_message;
 
-    int64_t iterations;
+    IterationCount iterations;
     int64_t threads;
     int64_t repetition_index;
     int64_t repetitions;
diff --git a/src/benchmark.cc b/src/benchmark.cc
index c76346c..29bfa35 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -121,8 +121,8 @@
 
 }  // namespace internal
 
-State::State(size_t max_iters, const std::vector<int64_t>& ranges, int thread_i,
-             int n_threads, internal::ThreadTimer* timer,
+State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
+             int thread_i, int n_threads, internal::ThreadTimer* timer,
              internal::ThreadManager* manager)
     : total_iterations_(0),
       batch_leftover_(0),
diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc
index 8d31083..d468a25 100644
--- a/src/benchmark_api_internal.cc
+++ b/src/benchmark_api_internal.cc
@@ -3,9 +3,9 @@
 namespace benchmark {
 namespace internal {
 
-State BenchmarkInstance::Run(
-    size_t iters, int thread_id, internal::ThreadTimer* timer,
-    internal::ThreadManager* manager) const {
+State BenchmarkInstance::Run(IterationCount iters, int thread_id,
+                             internal::ThreadTimer* timer,
+                             internal::ThreadManager* manager) const {
   State st(iters, arg, thread_id, threads, timer, manager);
   benchmark->Run(st);
   return st;
diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h
index 19c6d4f..264eff9 100644
--- a/src/benchmark_api_internal.h
+++ b/src/benchmark_api_internal.h
@@ -32,10 +32,10 @@
   bool last_benchmark_instance;
   int repetitions;
   double min_time;
-  size_t iterations;
+  IterationCount iterations;
   int threads;  // Number of concurrent threads to us
 
-  State Run(size_t iters, int thread_id, internal::ThreadTimer* timer,
+  State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
             internal::ThreadManager* manager) const;
 };
 
diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc
index 3ffd734..6696c38 100644
--- a/src/benchmark_register.cc
+++ b/src/benchmark_register.cc
@@ -376,7 +376,7 @@
   return this;
 }
 
-Benchmark* Benchmark::Iterations(size_t n) {
+Benchmark* Benchmark::Iterations(IterationCount n) {
   CHECK(n > 0);
   CHECK(IsZero(min_time_));
   iterations_ = n;
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
index b1c0b88..0bae6a5 100644
--- a/src/benchmark_runner.cc
+++ b/src/benchmark_runner.cc
@@ -59,11 +59,12 @@
 
 namespace {
 
-static const size_t kMaxIterations = 1000000000;
+static constexpr IterationCount kMaxIterations = 1000000000;
 
 BenchmarkReporter::Run CreateRunReport(
     const benchmark::internal::BenchmarkInstance& b,
-    const internal::ThreadManager::Result& results, size_t memory_iterations,
+    const internal::ThreadManager::Result& results,
+    IterationCount memory_iterations,
     const MemoryManager::Result& memory_result, double seconds,
     int64_t repetition_index) {
   // Create report about this benchmark run.
@@ -109,8 +110,8 @@
 
 // Execute one thread of benchmark b for the specified number of iterations.
 // Adds the stats collected for the thread into *total.
-void RunInThread(const BenchmarkInstance* b, size_t iters, int thread_id,
-                 ThreadManager* manager) {
+void RunInThread(const BenchmarkInstance* b, IterationCount iters,
+                 int thread_id, ThreadManager* manager) {
   internal::ThreadTimer timer(
       b->measure_process_cpu_time
           ? internal::ThreadTimer::CreateProcessCpuTime()
@@ -187,13 +188,13 @@
 
   std::vector<std::thread> pool;
 
-  size_t iters;  // preserved between repetitions!
+  IterationCount iters;  // preserved between repetitions!
   // So only the first repetition has to find/calculate it,
   // the other repetitions will just use that precomputed iteration count.
 
   struct IterationResults {
     internal::ThreadManager::Result results;
-    size_t iters;
+    IterationCount iters;
     double seconds;
   };
   IterationResults DoNIterations() {
@@ -248,7 +249,7 @@
     return i;
   }
 
-  size_t PredictNumItersNeeded(const IterationResults& i) const {
+  IterationCount PredictNumItersNeeded(const IterationResults& i) const {
     // See how much iterations should be increased by.
     // Note: Avoid division by zero with max(seconds, 1ns).
     double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
@@ -262,10 +263,10 @@
     if (multiplier <= 1.0) multiplier = 2.0;
 
     // So what seems to be the sufficiently-large iteration count? Round up.
-    const size_t max_next_iters =
+    const IterationCount max_next_iters =
         0.5 + std::max(multiplier * i.iters, i.iters + 1.0);
     // But we do have *some* sanity limits though..
-    const size_t next_iters = std::min(max_next_iters, kMaxIterations);
+    const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
 
     VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
     return next_iters;  // round up before conversion to integer.
@@ -319,11 +320,11 @@
 
     // Oh, one last thing, we need to also produce the 'memory measurements'..
     MemoryManager::Result memory_result;
-    size_t memory_iterations = 0;
+    IterationCount memory_iterations = 0;
     if (memory_manager != nullptr) {
       // Only run a few iterations to reduce the impact of one-time
       // allocations in benchmarks that are not properly managed.
-      memory_iterations = std::min<size_t>(16, iters);
+      memory_iterations = std::min<IterationCount>(16, iters);
       memory_manager->Start();
       std::unique_ptr<internal::ThreadManager> manager;
       manager.reset(new internal::ThreadManager(1));
diff --git a/src/complexity.cc b/src/complexity.cc
index e65bd2e..aeed67f 100644
--- a/src/complexity.cc
+++ b/src/complexity.cc
@@ -29,20 +29,23 @@
   static const double kLog2E = 1.44269504088896340736;
   switch (complexity) {
     case oN:
-      return [](int64_t n) -> double { return static_cast<double>(n); };
+      return [](IterationCount n) -> double { return static_cast<double>(n); };
     case oNSquared:
-      return [](int64_t n) -> double { return std::pow(n, 2); };
+      return [](IterationCount n) -> double { return std::pow(n, 2); };
     case oNCubed:
-      return [](int64_t n) -> double { return std::pow(n, 3); };
+      return [](IterationCount n) -> double { return std::pow(n, 3); };
     case oLogN:
       /* Note: can't use log2 because Android's GNU STL lacks it */
-      return [](int64_t n) { return kLog2E * log(static_cast<double>(n)); };
+      return
+          [](IterationCount n) { return kLog2E * log(static_cast<double>(n)); };
     case oNLogN:
       /* Note: can't use log2 because Android's GNU STL lacks it */
-      return [](int64_t n) { return kLog2E * n * log(static_cast<double>(n)); };
+      return [](IterationCount n) {
+        return kLog2E * n * log(static_cast<double>(n));
+      };
     case o1:
     default:
-      return [](int64_t) { return 1.0; };
+      return [](IterationCount) { return 1.0; };
   }
 }
 
diff --git a/src/counter.cc b/src/counter.cc
index cb604e0..c248ea1 100644
--- a/src/counter.cc
+++ b/src/counter.cc
@@ -17,7 +17,7 @@
 namespace benchmark {
 namespace internal {
 
-double Finish(Counter const& c, int64_t iterations, double cpu_time,
+double Finish(Counter const& c, IterationCount iterations, double cpu_time,
               double num_threads) {
   double v = c.value;
   if (c.flags & Counter::kIsRate) {
@@ -35,7 +35,8 @@
   return v;
 }
 
-void Finish(UserCounters* l, int64_t iterations, double cpu_time, double num_threads) {
+void Finish(UserCounters* l, IterationCount iterations, double cpu_time,
+            double num_threads) {
   for (auto& c : *l) {
     c.second.value = Finish(c.second, iterations, cpu_time, num_threads);
   }
diff --git a/src/counter.h b/src/counter.h
index d884e50..1ad46d4 100644
--- a/src/counter.h
+++ b/src/counter.h
@@ -18,7 +18,8 @@
 
 // these counter-related functions are hidden to reduce API surface.
 namespace internal {
-void Finish(UserCounters* l, int64_t iterations, double time, double num_threads);
+void Finish(UserCounters* l, IterationCount iterations, double time,
+            double num_threads);
 void Increment(UserCounters* l, UserCounters const& r);
 bool SameNames(UserCounters const& l, UserCounters const& r);
 }  // end namespace internal
diff --git a/src/json_reporter.cc b/src/json_reporter.cc
index cf1de25..11db2b9 100644
--- a/src/json_reporter.cc
+++ b/src/json_reporter.cc
@@ -68,6 +68,12 @@
   return ss.str();
 }
 
+std::string FormatKV(std::string const& key, IterationCount value) {
+  std::stringstream ss;
+  ss << '"' << StrEscape(key) << "\": " << value;
+  return ss.str();
+}
+
 std::string FormatKV(std::string const& key, double value) {
   std::stringstream ss;
   ss << '"' << StrEscape(key) << "\": ";
diff --git a/src/thread_manager.h b/src/thread_manager.h
index 6e274c7..1720281 100644
--- a/src/thread_manager.h
+++ b/src/thread_manager.h
@@ -38,7 +38,7 @@
 
  public:
   struct Result {
-    int64_t iterations = 0;
+    IterationCount iterations = 0;
     double real_time_used = 0;
     double cpu_time_used = 0;
     double manual_time_used = 0;
diff --git a/test/basic_test.cc b/test/basic_test.cc
index d07fbc0..5f3dd1a 100644
--- a/test/basic_test.cc
+++ b/test/basic_test.cc
@@ -98,7 +98,7 @@
 
 
 void BM_KeepRunning(benchmark::State& state) {
-  size_t iter_count = 0;
+  benchmark::IterationCount iter_count = 0;
   assert(iter_count == state.iterations());
   while (state.KeepRunning()) {
     ++iter_count;
@@ -109,8 +109,8 @@
 
 void BM_KeepRunningBatch(benchmark::State& state) {
   // Choose a prime batch size to avoid evenly dividing max_iterations.
-  const size_t batch_size = 101;
-  size_t iter_count = 0;
+  const benchmark::IterationCount batch_size = 101;
+  benchmark::IterationCount iter_count = 0;
   while (state.KeepRunningBatch(batch_size)) {
     iter_count += batch_size;
   }
@@ -119,7 +119,7 @@
 BENCHMARK(BM_KeepRunningBatch);
 
 void BM_RangedFor(benchmark::State& state) {
-  size_t iter_count = 0;
+  benchmark::IterationCount iter_count = 0;
   for (auto _ : state) {
     ++iter_count;
   }
diff --git a/test/complexity_test.cc b/test/complexity_test.cc
index 4a62869..d4febbb 100644
--- a/test/complexity_test.cc
+++ b/test/complexity_test.cc
@@ -66,9 +66,9 @@
 }
 BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1);
 BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity();
-BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity([](int64_t) {
-  return 1.0;
-});
+BENCHMARK(BM_Complexity_O1)
+    ->Range(1, 1 << 18)
+    ->Complexity([](benchmark::IterationCount) { return 1.0; });
 
 const char *one_test_name = "BM_Complexity_O1";
 const char *big_o_1_test_name = "BM_Complexity_O1_BigO";
@@ -121,7 +121,9 @@
 BENCHMARK(BM_Complexity_O_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
-    ->Complexity([](int64_t n) -> double { return static_cast<double>(n); });
+    ->Complexity([](benchmark::IterationCount n) -> double {
+      return static_cast<double>(n);
+    });
 BENCHMARK(BM_Complexity_O_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
@@ -160,7 +162,7 @@
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
-    ->Complexity([](int64_t n) {
+    ->Complexity([](benchmark::IterationCount n) {
       return kLog2E * n * log(static_cast<double>(n));
     });
 BENCHMARK(BM_Complexity_O_N_log_N)
diff --git a/test/cxx03_test.cc b/test/cxx03_test.cc
index baa9ed9..c4c9a52 100644
--- a/test/cxx03_test.cc
+++ b/test/cxx03_test.cc
@@ -14,7 +14,7 @@
 
 void BM_empty(benchmark::State& state) {
   while (state.KeepRunning()) {
-    volatile std::size_t x = state.iterations();
+    volatile benchmark::IterationCount x = state.iterations();
     ((void)x);
   }
 }
diff --git a/test/state_assembly_test.cc b/test/state_assembly_test.cc
index abe9a4d..7ddbb3b 100644
--- a/test/state_assembly_test.cc
+++ b/test/state_assembly_test.cc
@@ -25,7 +25,7 @@
   for (auto _ : S) {
     // CHECK: .L[[LOOP_HEAD:[a-zA-Z0-9_]+]]:
     // CHECK-GNU-NEXT: subq $1, %rbx
-    // CHECK-CLANG-NEXT: {{(addq \$1,|incq)}} %rax
+    // CHECK-CLANG-NEXT: {{(addq \$1, %rax|incq %rax|addq \$-1, %rbx)}}
     // CHECK-NEXT: jne .L[[LOOP_HEAD]]
     benchmark::DoNotOptimize(x);
   }