bts classifier
This commit is contained in:
parent
749044ce97
commit
516f04b9c5
9 changed files with 219 additions and 13 deletions
|
|
@ -6,17 +6,19 @@
|
|||
#define UTIL_STRING_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <bitset>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <codecvt>
|
||||
#include <cstring>
|
||||
#include <exception>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <locale>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
|
||||
namespace util {
|
||||
|
||||
|
|
@ -273,6 +275,12 @@ inline std::string implode(Iter begin, const Iter& end, const char* del) {
|
|||
return ss.str();
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
// Convenience overload: joins all elements of `vec` into a single string,
// separated by `del`, by forwarding the vector's full range to the
// iterator-based implode().
template <class T>
inline std::string implode(const std::vector<T>& vec, const char* del) {
  const auto first = vec.begin();
  const auto last = vec.end();
  return implode(first, last, del);
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
inline std::string normalizeWhiteSpace(const std::string& input) {
|
||||
std::string ret;
|
||||
|
|
@ -324,8 +332,9 @@ inline std::vector<std::string> tokenize(const std::string& str) {
|
|||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
inline double jaccardSimi(const std::string& a,
|
||||
const std::string& b) {
|
||||
inline double jaccardSimi(const std::string& a, const std::string& b) {
|
||||
if (a == b) return 1;
|
||||
|
||||
std::set<std::string> sa, sb;
|
||||
|
||||
auto toksA = tokenize(a);
|
||||
|
|
@ -349,9 +358,83 @@ inline double jaccardSimi(const std::string& a,
|
|||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
template <class T>
|
||||
inline std::string implode(const std::vector<T>& vec, const char* del) {
|
||||
return implode(vec.begin(), vec.end(), del);
|
||||
inline double btsSimiInner(const std::vector<std::string>& toks,
|
||||
const std::string& b, double best) {
|
||||
std::set<std::string> toksSet;
|
||||
toksSet.insert(toks.begin(), toks.end());
|
||||
std::vector<std::string> toksUniqSorted;
|
||||
toksUniqSorted.insert(toksUniqSorted.begin(), toksSet.begin(), toksSet.end());
|
||||
|
||||
assert(toksUniqSorted.size() <= 8);
|
||||
|
||||
for (uint8_t v = 1; v <= pow(2, toksUniqSorted.size()); v++) {
|
||||
std::bitset<8> bs(v);
|
||||
std::vector<std::string> cur(bs.count());
|
||||
|
||||
size_t i = 0;
|
||||
for (size_t j = 0; j < toksUniqSorted.size(); j++) {
|
||||
if (bs[j]) {
|
||||
cur[i] = toksUniqSorted[j];
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
double tmp = util::implode(cur, " ").size();
|
||||
|
||||
// ed between the two string will always be at least their length
|
||||
// difference - if this is already too big, skip it right now
|
||||
double dt = 1 - (fabs(tmp - b.size()) * 1.0) / (fmax(tmp, b.size()) * 1.0);
|
||||
|
||||
if (dt <= best) continue;
|
||||
|
||||
// cur is guaranteed to be sorted now
|
||||
do {
|
||||
const auto& comb = util::implode(cur, " ");
|
||||
|
||||
double d =
|
||||
1 - ((editDist(comb, b) * 1.0) / (fmax(comb.size(), b.size()) * 1.0));
|
||||
|
||||
if (fabs(d - 1) < 0.0001) return 1;
|
||||
|
||||
if (d > best) best = d;
|
||||
} while (std::next_permutation(cur.begin(), cur.end()));
|
||||
}
|
||||
|
||||
return best;
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
inline double btsSimi(std::string a, std::string b) {
|
||||
// this follows the implementation for the station similarity paper in
|
||||
// https://github.com/ad-freiburg/statsimi/
|
||||
if (a == b) return 1;
|
||||
|
||||
std::set<std::string> sa, sb;
|
||||
|
||||
auto toksA = tokenize(a);
|
||||
auto toksB = tokenize(b);
|
||||
|
||||
// fallback to jaccard if the token set is too large
|
||||
if (toksA.size() > 6 || toksB.size() > 6) {
|
||||
return jaccardSimi(a, b);
|
||||
}
|
||||
|
||||
if (toksA.size() > toksB.size()) {
|
||||
std::swap(a, b);
|
||||
std::swap(toksA, toksB);
|
||||
}
|
||||
|
||||
// this is already our best known value - simply the edit
|
||||
// distance similarity between the strings
|
||||
double best = 1 - (editDist(a, b) * 1.0) / std::fmax(a.size(), b.size());
|
||||
|
||||
if (fabs(best) < 0.0001) return 0;
|
||||
|
||||
best = btsSimiInner(toksA, b, best);
|
||||
|
||||
if (fabs(best - 1) < 0.0001) return 1;
|
||||
|
||||
return btsSimiInner(toksB, a, best);
|
||||
}
|
||||
} // namespace util
|
||||
|
||||
|
|
|
|||
|
|
@ -55,6 +55,28 @@ int main(int argc, char** argv) {
|
|||
{0, 5}}, 0.1) == approx(5));
|
||||
}
|
||||
|
||||
// ___________________________________________________________________________
|
||||
{
|
||||
TEST(util::btsSimi("", ""), ==, approx(1));
|
||||
TEST(util::btsSimi("Hallo", "Test"), ==, approx(0));
|
||||
TEST(util::btsSimi("Test", "Hallo"), ==, approx(0));
|
||||
TEST(util::btsSimi("Test", "Test"), ==, approx(1));
|
||||
TEST(util::btsSimi("Milner Road / Wandlee Road", "Wandlee Road"), ==, approx(1));
|
||||
TEST(util::btsSimi("bla blubb blob", "blubb blib"), ==, approx(0.9));
|
||||
TEST(util::btsSimi("St Pancras International", "London St Pancras"), ==, approx(0.588235));
|
||||
TEST(util::btsSimi("Reiterstraße", "Reiterstraße Freiburg im Breisgau"), ==, approx(1));
|
||||
TEST(util::btsSimi("Reiterstraße", "Reiter Freiburg im Breisgau"), ==, approx(.466666666));
|
||||
TEST(util::btsSimi("AA", "Reiterstraße, Freiburg im Breisgau"), ==, approx(0));
|
||||
TEST(util::btsSimi("blibb blabbel bla blubb blob", "blubb blib blabb"), ==, approx(0.875));
|
||||
TEST(util::btsSimi("blibb blabbel bla blubb blobo", "blubb blib blabb blabo"), ==, approx(0.84));
|
||||
TEST(util::btsSimi("blubb blib blabb", "blibb blabbel bla blubb blob"), ==, approx(0.875));
|
||||
TEST(util::btsSimi("blubbb blib blabb blobo", "blibb blabbel bla blubb blobo"), ==, approx(0.84));
|
||||
TEST(util::btsSimi("Reiter Freiburg im Breisgau", "Reiter Frei burg im Brei sgau"), ==, approx(0.931034));
|
||||
// fallback to jaccard
|
||||
TEST(util::btsSimi("Freiburg im Breisgau, Germany, Main Railway Station", "Main Railway Station Freiburg im Breisgau, Germany"), ==, approx(1));
|
||||
|
||||
}
|
||||
|
||||
// ___________________________________________________________________________
|
||||
{
|
||||
std::string test = u8"Zürich, Hauptbahnhof (Nord)";
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue