bts classifier

This commit is contained in:
Patrick Brosi 2022-01-13 21:41:52 +01:00
parent 749044ce97
commit 516f04b9c5
9 changed files with 219 additions and 13 deletions

View file

@ -6,17 +6,19 @@
#define UTIL_STRING_H_
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cmath>
#include <codecvt>
#include <cstring>
#include <exception>
#include <iomanip>
#include <iostream>
#include <locale>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#include <set>
namespace util {
@ -273,6 +275,12 @@ inline std::string implode(Iter begin, const Iter& end, const char* del) {
return ss.str();
}
// _____________________________________________________________________________
// Convenience overload: joins every element of `vec` by forwarding the
// vector's full range and the delimiter `del` to the iterator-based implode().
template <class T>
inline std::string implode(const std::vector<T>& vec, const char* del) {
  return implode(vec.cbegin(), vec.cend(), del);
}
// _____________________________________________________________________________
inline std::string normalizeWhiteSpace(const std::string& input) {
std::string ret;
@ -324,8 +332,9 @@ inline std::vector<std::string> tokenize(const std::string& str) {
}
// _____________________________________________________________________________
inline double jaccardSimi(const std::string& a,
const std::string& b) {
inline double jaccardSimi(const std::string& a, const std::string& b) {
if (a == b) return 1;
std::set<std::string> sa, sb;
auto toksA = tokenize(a);
@ -349,9 +358,83 @@ inline double jaccardSimi(const std::string& a,
}
// _____________________________________________________________________________
template <class T>
inline std::string implode(const std::vector<T>& vec, const char* del) {
return implode(vec.begin(), vec.end(), del);
// Helper of btsSimi(). Builds every non-empty subset of the unique tokens in
// `toks`, joins each subset with single spaces in every possible order, and
// compares the resulting string against `b`.
//
//   toks  tokens of one input string (duplicates are collapsed)
//   b     string each candidate is compared against
//   best  best similarity known so far; used as pruning bound and as the
//         lower bound of the returned value
//
// The similarity of a candidate c is 1 - editDist(c, b) / max(|c|, |b|).
// Returns 1 as soon as a candidate is within 0.0001 of a perfect match,
// otherwise the maximum of `best` and all candidate similarities.
inline double btsSimiInner(const std::vector<std::string>& toks,
                           const std::string& b, double best) {
  // unique tokens in sorted order, so that every subset built below starts
  // out as its lexicographically smallest permutation
  const std::set<std::string> toksSet(toks.begin(), toks.end());
  const std::vector<std::string> toksUniqSorted(toksSet.begin(),
                                                toksSet.end());

  // subsets are encoded as bit masks in a std::bitset<8>
  assert(toksUniqSorted.size() <= 8);

  // valid masks are 1 .. 2^n - 1. The bound is exclusive: mask 2^n selects no
  // token at all. The counter is wider than 8 bits so the loop also
  // terminates for n == 8.
  const unsigned numSubsets = 1u << toksUniqSorted.size();

  for (unsigned v = 1; v < numSubsets; v++) {
    const std::bitset<8> bs(v);

    std::vector<std::string> cur;
    cur.reserve(bs.count());
    for (size_t j = 0; j < toksUniqSorted.size(); j++) {
      if (bs[j]) cur.push_back(toksUniqSorted[j]);
    }

    const double curLen = util::implode(cur, " ").size();

    // the edit distance between two strings is always at least their length
    // difference - if this bound cannot beat the best value, skip the subset
    const double dt =
        1 - std::fabs(curLen - b.size()) / std::fmax(curLen, b.size());
    if (dt <= best) continue;

    // cur is sorted here, so next_permutation visits every ordering
    do {
      const std::string comb = util::implode(cur, " ");
      const double d =
          1 - (editDist(comb, b) * 1.0) / std::fmax(comb.size(), b.size());

      if (std::fabs(d - 1) < 0.0001) return 1;
      if (d > best) best = d;
    } while (std::next_permutation(cur.begin(), cur.end()));
  }

  return best;
}
// _____________________________________________________________________________
// BTS similarity of two strings, a value in which 1 means "equal".
//
// Starts from the plain edit-distance similarity of `a` and `b` and then
// tries to improve it via btsSimiInner(), first with the tokens of the string
// that has fewer tokens, then with the tokens of the other one.
// If either string has more than 6 tokens, jaccardSimi(a, b) is returned
// instead.
//
// `a` and `b` are taken by value because they may be swapped locally.
inline double btsSimi(std::string a, std::string b) {
  // this follows the implementation for the station similarity paper in
  // https://github.com/ad-freiburg/statsimi/
  if (a == b) return 1;

  auto toksA = tokenize(a);
  auto toksB = tokenize(b);

  // fallback to jaccard if the token set is too large
  if (toksA.size() > 6 || toksB.size() > 6) {
    return jaccardSimi(a, b);
  }

  // make sure (a, toksA) is the side with fewer tokens
  if (toksA.size() > toksB.size()) {
    std::swap(a, b);
    std::swap(toksA, toksB);
  }

  // this is already our best known value - simply the edit
  // distance similarity between the strings
  double best = 1 - (editDist(a, b) * 1.0) / std::fmax(a.size(), b.size());

  // a baseline of (almost) 0 is reported as 0 without searching further
  if (std::fabs(best) < 0.0001) return 0;

  best = btsSimiInner(toksA, b, best);

  // (almost) perfect match found, no need to try the other direction
  if (std::fabs(best - 1) < 0.0001) return 1;

  return btsSimiInner(toksB, a, best);
}
} // namespace util

View file

@ -55,6 +55,28 @@ int main(int argc, char** argv) {
{0, 5}}, 0.1) == approx(5));
}
// ___________________________________________________________________________
{
// util::btsSimi: expected similarity scores for hand-picked string pairs
// equal inputs (including two empty strings) are expected to score 1; the
// "Hallo"/"Test" pair is expected to score 0 in either argument order
TEST(util::btsSimi("", ""), ==, approx(1));
TEST(util::btsSimi("Hallo", "Test"), ==, approx(0));
TEST(util::btsSimi("Test", "Hallo"), ==, approx(0));
TEST(util::btsSimi("Test", "Test"), ==, approx(1));
// the second string reappears verbatim at the end of the first one
TEST(util::btsSimi("Milner Road / Wandlee Road", "Wandlee Road"), ==, approx(1));
TEST(util::btsSimi("bla blubb blob", "blubb blib"), ==, approx(0.9));
TEST(util::btsSimi("St Pancras International", "London St Pancras"), ==, approx(0.588235));
// the first string is the leading word of the second one
TEST(util::btsSimi("Reiterstraße", "Reiterstraße Freiburg im Breisgau"), ==, approx(1));
TEST(util::btsSimi("Reiterstraße", "Reiter Freiburg im Breisgau"), ==, approx(.466666666));
TEST(util::btsSimi("AA", "Reiterstraße, Freiburg im Breisgau"), ==, approx(0));
// longer inputs; the first and third case below are the same pair with the
// arguments swapped and expect the same score
TEST(util::btsSimi("blibb blabbel bla blubb blob", "blubb blib blabb"), ==, approx(0.875));
TEST(util::btsSimi("blibb blabbel bla blubb blobo", "blubb blib blabb blabo"), ==, approx(0.84));
TEST(util::btsSimi("blubb blib blabb", "blibb blabbel bla blubb blob"), ==, approx(0.875));
TEST(util::btsSimi("blubbb blib blabb blobo", "blibb blabbel bla blubb blobo"), ==, approx(0.84));
// same letters, but some words of the second string are split by spaces
TEST(util::btsSimi("Reiter Freiburg im Breisgau", "Reiter Frei burg im Brei sgau"), ==, approx(0.931034));
// fallback to jaccard
TEST(util::btsSimi("Freiburg im Breisgau, Germany, Main Railway Station", "Main Railway Station Freiburg im Breisgau, Germany"), ==, approx(1));
}
// ___________________________________________________________________________
{
std::string test = u8"Zürich, Hauptbahnhof (Nord)";