blob: 972c5a0b0203d608135f46d243b7643722b674b0 [file] [log] [blame]
// Copyright 2017 The Fuchsia Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "algorithms/rappor/bloom_bit_counter.h"
#include <glog/logging.h>
#include <cmath>
#include <utility>
#include <vector>
#include "util/log_based_metrics.h"
namespace cobalt {
namespace rappor {
// Stackdriver metric constants
namespace {
const char kBloomBitCounterConstructorFailure[] =
"bloom-bit-counter-constructor-failure";
const char kAddObservationFailure[] =
"bloom-bin-counter-add-observation-failure";
} // namespace
BloomBitCounter::BloomBitCounter(const RapporConfig& config)
: config_(new RapporConfigValidator(config)), num_bloom_bytes_(0) {
if (!config_->valid()) {
LOG_STACKDRIVER_COUNT_METRIC(ERROR, kBloomBitCounterConstructorFailure)
<< "RapporConfig is invalid";
return;
}
estimated_bloom_counts_.reserve(config_->num_cohorts());
const auto num_bits = config_->num_bits();
for (size_t cohort = 0; cohort < config_->num_cohorts(); cohort++) {
estimated_bloom_counts_.emplace_back(cohort, num_bits);
}
num_bloom_bytes_ = (num_bits + 7) / 8;
}
bool BloomBitCounter::AddObservation(const RapporObservation& obs) {
if (!config_->valid()) {
LOG_STACKDRIVER_COUNT_METRIC(ERROR, kAddObservationFailure)
<< "RapporConfig is invalid";
observation_errors_++;
return false;
}
if (obs.data().size() != num_bloom_bytes_) {
LOG_STACKDRIVER_COUNT_METRIC(ERROR, kAddObservationFailure)
<< "RapporObservation has the wrong number of bytes: "
<< obs.data().size() << ". Expecting " << num_bloom_bytes_;
observation_errors_++;
return false;
}
auto cohort = obs.cohort();
if (cohort >= config_->num_cohorts()) {
LOG_STACKDRIVER_COUNT_METRIC(ERROR, kAddObservationFailure)
<< "RapporObservation has an invalid cohort index: " << cohort
<< ". num_cohorts= " << config_->num_cohorts();
observation_errors_++;
return false;
}
// We have a good observation.
num_observations_++;
estimated_bloom_counts_[cohort].num_observations++;
// We iterate through the bits of the observation "from right to left",
// i.e. from the least-significant bit of the last byte to the
// most-significant bit of the first byte. If the ith bit is set we increment
// bit_sums[i] for the appropriate cohort.
//
// NOTE(rudominer) Possible performance optimizations: Consider using x86
// vector operations or the find-first-bit-set instruction or simply checking
// for zero bytes.
std::vector<size_t>& bit_sums = estimated_bloom_counts_[cohort].bit_sums;
size_t bit_index = 0;
for (int byte_index = num_bloom_bytes_ - 1; byte_index >= 0; byte_index--) {
uint8_t bit_mask = 1;
for (int bit_in_byte_index = 0; bit_in_byte_index < 8;
bit_in_byte_index++) {
if (bit_index >= bit_sums.size()) {
return true;
}
if (bit_mask & obs.data()[byte_index]) {
bit_sums[bit_index]++;
}
bit_index++;
bit_mask <<= 1;
}
}
return true;
}
const std::vector<CohortCounts>& BloomBitCounter::EstimateCounts() {
double q = config_->prob_1_stays_1();
double p = config_->prob_0_becomes_1();
double one_minus_q_plus_p = 1.0 - (q + p);
double divisor = q - p; // divisor != 0 because we don't allow q == p.
double abs_divisor = std::abs(divisor);
// Compute the estimated counts and std_error for each cohort.
for (auto& cohort_counts : estimated_bloom_counts_) {
double N = cohort_counts.num_observations;
double Npq = N * p * q;
double correction = p * N;
// Note(rudominer) When we support PRR then we need to modify the above
// formulas as follows. Let f = prob_rr. Then let
// p11 = q * (1 - f/2) + p * f / 2;
// p01 = p * (1 - f/2) + q * f / 2;
// correction = p01 * N;
// divisor = p11 - p01; // (1 - f) * (q - p)
std::vector<size_t>& bit_sums = cohort_counts.bit_sums;
std::vector<double>& count_estimates = cohort_counts.count_estimates;
std::vector<double>& std_errors = cohort_counts.std_errors;
count_estimates.resize(bit_sums.size());
std_errors.resize(bit_sums.size());
// Compute the estimate count and std_error for each bloom bit for the
// current cohort.
for (size_t bit_index = 0; bit_index < bit_sums.size(); bit_index++) {
double Y = bit_sums[bit_index];
// See go/cobalt-basic-rappor-analysis for an explanation of the
// formulas we use for count_estimate and std_error.
count_estimates[bit_index] = (Y - correction) / divisor;
std_errors[bit_index] = sqrt(Y * one_minus_q_plus_p + Npq) / abs_divisor;
}
}
return estimated_bloom_counts_;
}
} // namespace rappor
} // namespace cobalt