blob: e49c34e00616bd83c202eef194ab56387cd001d3 [file] [log] [blame]
/*
* Copyright 2011 Avery Pennarun. All rights reserved.
*
* (This license applies to bupsplit.c and bupsplit.h only.)
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY AVERY PENNARUN ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef PERIDOT_BIN_LEDGER_THIRD_PARTY_BUP_BUPSPLIT_H_
#define PERIDOT_BIN_LEDGER_THIRD_PARTY_BUP_BUPSPLIT_H_
#include <stdint.h>
#include <lib/fxl/macros.h>
#include <lib/fxl/strings/string_view.h>
namespace bup {
constexpr uint8_t kBlobBits = 13;
constexpr uint16_t kBlobSize = 1 << kBlobBits;
constexpr uint8_t kWindowBits = 7;
constexpr uint16_t kWindowSize = 1 << (kWindowBits - 1);
// Splits data into chunks between |min_length| and |max_length| of size that
// are "good" for de-duplication.
//
// It achieves that by calculating a rolling hash of the window of
// |kWindowBits|. The split points are selected when the last |kBlobBits| of the
// current hash are all 1s. This ensures that if data is removed or inserted in
// the middle of data, only the 2 split points following the change are
// modified, and all others stay identical.
class RollSumSplit {
public:
// |min_length| is the minimal size of a chunk.
// |max_length| is the maximal size of a chunk.
RollSumSplit(size_t min_length, size_t max_length);
// Copy constructor.
RollSumSplit(const RollSumSplit& other);
// Reset the state of the rolling hash.
void Reset();
// Returns a non-zero value indicating the size of the prefix that is the next
// cut, or 0 if all data was consumed without finding the next cut.
// If |bits| is not null, and a cut has been found, |*bits| will be the number
// of trailing 1s in the current hash. It will always be greater of equals to
// |kBlobBits|.
size_t Feed(fxl::StringView buffer, size_t* bits);
private:
void Add(uint8_t drop, uint8_t add);
void Roll(uint8_t c);
uint32_t Digest();
const size_t min_length_;
const size_t max_length_;
size_t current_length_;
uint64_t s1_, s2_;
uint8_t window_[kWindowSize];
size_t window_index_;
};
} // namespace bup
#endif // PERIDOT_BIN_LEDGER_THIRD_PARTY_BUP_BUPSPLIT_H_