blob: a63299ee816642316f070adf49c90b393dadb61e [file] [log] [blame]
// Copyright (c) 2016, Istvan Soos. All rights reserved. Use of this source code
// is governed by a BSD-style license that can be found in the LICENSE file.
import './base.dart';
/// The Jaccard coefficient measures similarity between finite sample sets, and
/// is defined as the size of the intersection divided by the size of the union
/// of the sample sets.
///
/// The strings are divided into N-grams (N-character sequences), and the
/// Jaccard distance is calculated based on these sets.
/// https://en.wikipedia.org/wiki/Jaccard_index
class Jaccard implements StringDistance, NormalizedStringDistance {
/// The size of the N-character sequences.
final int ngram;
/// Whether to introduce extra padding before and after the text.
final bool usePadding;
Jaccard({this.ngram = 2, this.usePadding = false});
@override
int distance(String s1, String s2) {
if (usePadding) {
s1 = _pad(s1);
s2 = _pad(s2);
}
Set<String> set1 = _split(s1);
Set<String> set2 = _split(s2);
int intersection = set1.where((item) => set2.contains(item)).length;
int union = set1.length + set2.length - intersection;
return union - intersection;
}
@override
double normalizedDistance(String s1, String s2) {
if (usePadding) {
s1 = _pad(s1);
s2 = _pad(s2);
}
Set<String> set1 = _split(s1);
Set<String> set2 = _split(s2);
int intersection = set1.where((item) => set2.contains(item)).length;
int union = set1.length + set2.length - intersection;
return (union - intersection) / union;
}
String _pad(String s) {
String pad = ''.padLeft(ngram - 1);
return '$pad${s.trim()}$pad';
}
Set<String> _split(String s) {
Set<String> set = new Set();
if (s.length <= ngram) {
set.add(s);
} else {
for (int i = 0; i <= s.length - ngram; i++) {
set.add(s.substring(i, i + ngram));
}
}
return set;
}
}
/// Combines multiple Jaccard normalized edit distance of N=1, N=2, N=3, ...
/// The individual distances are weighed by N^2.
class CombinedJaccard implements NormalizedStringDistance {
List<Jaccard> _list;
List<int> _weights;
int _sumWeights = 0;
CombinedJaccard({int ngram = 5, bool usePadding = false}) {
_list = new List.generate(
ngram, (i) => new Jaccard(ngram: i + 1, usePadding: usePadding));
_weights = new List.generate(ngram, (i) => (i + 1) * (i + 1));
_sumWeights = _weights.fold(0, (a, b) => a + b);
}
@override
double normalizedDistance(String s1, String s2) {
double distance = 0.0;
for (int i = 0; i < _list.length; i++) {
distance += _list[i].normalizedDistance(s1, s2) * _weights[i];
}
return distance / _sumWeights;
}
}