From 516f04b9c5242e18eacbd7f6bf299240a216f681 Mon Sep 17 00:00:00 2001 From: Patrick Brosi Date: Thu, 13 Jan 2022 21:41:52 +0100 Subject: [PATCH] bts classifier --- pfaedle.cfg | 2 + src/pfaedle/PfaedleMain.cpp | 23 ++++- src/pfaedle/config/MotConfigReader.cpp | 7 ++ src/pfaedle/router/Misc.h | 2 + src/pfaedle/router/ShapeBuilder.cpp | 1 - .../StatsimiClassifier.cpp | 55 ++++++++++- .../statsimi-classifier/StatsimiClassifier.h | 25 +++++ src/util/String.h | 95 +++++++++++++++++-- src/util/tests/TestMain.cpp | 22 +++++ 9 files changed, 219 insertions(+), 13 deletions(-) diff --git a/pfaedle.cfg b/pfaedle.cfg index 711bc6d..8a8d952 100644 --- a/pfaedle.cfg +++ b/pfaedle.cfg @@ -7,6 +7,8 @@ routing_transition_penalty_fac: 0.0083 routing_station_move_penalty_fac: 0.002 +station_similarity_classification_method: jaccard + # Regular expressions and station comparision is # always case insensitive! station_normalize_chain: diff --git a/src/pfaedle/PfaedleMain.cpp b/src/pfaedle/PfaedleMain.cpp index 4b4b872..ecba826 100644 --- a/src/pfaedle/PfaedleMain.cpp +++ b/src/pfaedle/PfaedleMain.cpp @@ -61,6 +61,10 @@ using pfaedle::router::RouterImpl; using pfaedle::router::ShapeBuilder; using pfaedle::router::Stats; using pfaedle::statsimiclassifier::JaccardClassifier; +using pfaedle::statsimiclassifier::StatsimiClassifier; +using pfaedle::statsimiclassifier::BTSClassifier; +using pfaedle::statsimiclassifier::EDClassifier; +using pfaedle::statsimiclassifier::PEDClassifier; enum class RetCode { SUCCESS = 0, @@ -277,7 +281,21 @@ int main(int argc, char** argv) { graphDimensions[filePost].second += nd->getAdjListOut().size(); } - JaccardClassifier statsimiClassifier; + StatsimiClassifier* statsimiClassifier; + + if (motCfg.routingOpts.statsimiMethod == "bts") { + statsimiClassifier = new BTSClassifier(); + } else if (motCfg.routingOpts.statsimiMethod == "jaccard") { + statsimiClassifier = new JaccardClassifier(); + } else if (motCfg.routingOpts.statsimiMethod == "ed") { + 
statsimiClassifier = new EDClassifier(); + } else if (motCfg.routingOpts.statsimiMethod == "ped") { + statsimiClassifier = new PEDClassifier(); + } else { + LOG(ERROR) << "Unknown station similarity classifier " + << motCfg.routingOpts.statsimiMethod; + exit(1); + } Router* router = 0; @@ -303,7 +321,7 @@ int main(int argc, char** argv) { } ShapeBuilder shapeBuilder(>fs[0], usedMots, motCfg, &graph, &fStops, - &restr, &statsimiClassifier, router, cfg); + &restr, statsimiClassifier, router, cfg); pfaedle::netgraph::Graph ng; @@ -326,6 +344,7 @@ int main(int argc, char** argv) { } if (router) delete router; + if (statsimiClassifier) delete statsimiClassifier; if (cfg.writeGraph) { LOG(INFO) << "Outputting graph.json..."; diff --git a/src/pfaedle/config/MotConfigReader.cpp b/src/pfaedle/config/MotConfigReader.cpp index e9f3905..7ad16ff 100644 --- a/src/pfaedle/config/MotConfigReader.cpp +++ b/src/pfaedle/config/MotConfigReader.cpp @@ -61,6 +61,13 @@ void MotConfigReader::parse(const std::vector& paths, cfg.routingOpts.transPenMethod = "exp"; } + if (p.hasKey(secStr, "station_similarity_classification_method")) { + cfg.routingOpts.statsimiMethod = + p.getStr(secStr, "station_similarity_classification_method"); + } else { + cfg.routingOpts.statsimiMethod = "bts"; + } + if (p.hasKey(secStr, "routing_use_stations")) { cfg.routingOpts.useStations = p.getBool(secStr, "routing_use_stations"); } else { diff --git a/src/pfaedle/router/Misc.h b/src/pfaedle/router/Misc.h index d96cb34..34b1cfe 100644 --- a/src/pfaedle/router/Misc.h +++ b/src/pfaedle/router/Misc.h @@ -62,6 +62,7 @@ struct RoutingOpts { double transitionPen; std::string transPenMethod; std::string emPenMethod; + std::string statsimiMethod; }; // _____________________________________________________________________________ @@ -82,6 +83,7 @@ inline bool operator==(const RoutingOpts& a, const RoutingOpts& b) { fabs(a.nonStationPen - b.nonStationPen) < 0.01 && a.transPenMethod == b.transPenMethod && a.emPenMethod == 
b.emPenMethod && + a.statsimiMethod == b.statsimiMethod && a.useStations == b.useStations && a.popReachEdge == b.popReachEdge && a.noSelfHops == b.noSelfHops; } diff --git a/src/pfaedle/router/ShapeBuilder.cpp b/src/pfaedle/router/ShapeBuilder.cpp index abd1af5..57e4bca 100644 --- a/src/pfaedle/router/ShapeBuilder.cpp +++ b/src/pfaedle/router/ShapeBuilder.cpp @@ -47,7 +47,6 @@ using pfaedle::router::ShapeBuilder; using pfaedle::router::Stats; using pfaedle::router::TripForests; using pfaedle::router::TripTrie; -using pfaedle::statsimiclassifier::JaccardClassifier; using pfaedle::trgraph::EdgeGrid; using pfaedle::trgraph::NodeGrid; using util::geo::latLngToWebMerc; diff --git a/src/pfaedle/statsimi-classifier/StatsimiClassifier.cpp b/src/pfaedle/statsimi-classifier/StatsimiClassifier.cpp index f354311..7b98532 100644 --- a/src/pfaedle/statsimi-classifier/StatsimiClassifier.cpp +++ b/src/pfaedle/statsimi-classifier/StatsimiClassifier.cpp @@ -9,7 +9,10 @@ #include "pfaedle/statsimi-classifier/StatsimiClassifier.h" #include "util/geo/Geo.h" +using pfaedle::statsimiclassifier::BTSClassifier; +using pfaedle::statsimiclassifier::EDClassifier; using pfaedle::statsimiclassifier::JaccardClassifier; +using pfaedle::statsimiclassifier::PEDClassifier; // _____________________________________________________________________________ bool JaccardClassifier::similar(const std::string& nameA, const POINT& posA, @@ -23,8 +26,52 @@ bool JaccardClassifier::similar(const std::string& nameA, const POINT& posA, // _____________________________________________________________________________ bool JaccardClassifier::similar(const std::string& nameA, const std::string& nameB) const { - // hard similarity - if (nameA == nameB) return true; - - return util::jaccardSimi(nameA, nameB) > 0.45; // 0.45 from paper + return util::jaccardSimi(nameA, nameB) > 0.45; // 0.45 from statsimi paper +} + +// _____________________________________________________________________________ +bool 
BTSClassifier::similar(const std::string& nameA, const POINT& posA, + const std::string& nameB, const POINT& posB) const { + UNUSED(posA); + UNUSED(posB); + return similar(nameA, nameB); +} + +// _____________________________________________________________________________ +bool BTSClassifier::similar(const std::string& nameA, + const std::string& nameB) const { + return util::btsSimi(nameA, nameB) > 0.85; // 0.85 from statsimi paper +} + +// _____________________________________________________________________________ +bool EDClassifier::similar(const std::string& nameA, const POINT& posA, + const std::string& nameB, const POINT& posB) const { + UNUSED(posA); + UNUSED(posB); + return similar(nameA, nameB); +} + +// _____________________________________________________________________________ +bool EDClassifier::similar(const std::string& nameA, + const std::string& nameB) const { + if (nameA == nameB) return true; // also avoids 0/0 for two empty names + double maxLen = fmax(nameA.size(), nameB.size()); // > 0: names differ + return 1.0 - util::editDist(nameA, nameB) / maxLen > 0.85; // from paper +} + +// _____________________________________________________________________________ +bool PEDClassifier::similar(const std::string& nameA, const POINT& posA, + const std::string& nameB, const POINT& posB) const { + UNUSED(posA); + UNUSED(posB); + return similar(nameA, nameB); +} + +// _____________________________________________________________________________ +bool PEDClassifier::similar(const std::string& nameA, + const std::string& nameB) const { + if (nameA == nameB) return true; // avoids NaN result for two empty names + double a = (util::prefixEditDist(nameA, nameB) * 1.0) / (nameA.size() * 1.0); + double b = (util::prefixEditDist(nameB, nameA) * 1.0) / (nameB.size() * 1.0); + return 1.0 - fmin(a, b) > 0.875; // 0.875: avg. of statsimi paper values } diff --git a/src/pfaedle/statsimi-classifier/StatsimiClassifier.h b/src/pfaedle/statsimi-classifier/StatsimiClassifier.h index dac508b..7890ff3 100644 --- a/src/pfaedle/statsimi-classifier/StatsimiClassifier.h +++
b/src/pfaedle/statsimi-classifier/StatsimiClassifier.h @@ -14,6 +14,7 @@ namespace statsimiclassifier { class StatsimiClassifier { public: + virtual ~StatsimiClassifier() {} virtual bool similar(const std::string& nameA, const POINT& posA, const std::string& nameB, const POINT& posB) const = 0; @@ -29,6 +30,30 @@ class JaccardClassifier : public StatsimiClassifier { const std::string& nameB) const; }; +class BTSClassifier : public StatsimiClassifier { + public: + virtual bool similar(const std::string& nameA, const POINT& posA, + const std::string& nameB, const POINT& posB) const; + virtual bool similar(const std::string& nameA, + const std::string& nameB) const; +}; + +class EDClassifier : public StatsimiClassifier { + public: + virtual bool similar(const std::string& nameA, const POINT& posA, + const std::string& nameB, const POINT& posB) const; + virtual bool similar(const std::string& nameA, + const std::string& nameB) const; +}; + +class PEDClassifier : public StatsimiClassifier { + public: + virtual bool similar(const std::string& nameA, const POINT& posA, + const std::string& nameB, const POINT& posB) const; + virtual bool similar(const std::string& nameA, + const std::string& nameB) const; +}; + } // namespace statsimiclassifier } // namespace pfaedle diff --git a/src/util/String.h b/src/util/String.h index 379785f..0bd30d9 100644 --- a/src/util/String.h +++ b/src/util/String.h @@ -6,17 +6,19 @@ #define UTIL_STRING_H_ #include +#include #include +#include #include #include #include #include #include #include +#include #include #include #include -#include namespace util { @@ -273,6 +275,12 @@ inline std::string implode(Iter begin, const Iter& end, const char* del) { return ss.str(); } +// _____________________________________________________________________________ +template +inline std::string implode(const std::vector& vec, const char* del) { + return implode(vec.begin(), vec.end(), del); +} + // 
_____________________________________________________________________________ inline std::string normalizeWhiteSpace(const std::string& input) { std::string ret; @@ -324,8 +332,9 @@ inline std::vector tokenize(const std::string& str) { } // _____________________________________________________________________________ -inline double jaccardSimi(const std::string& a, - const std::string& b) { +inline double jaccardSimi(const std::string& a, const std::string& b) { + if (a == b) return 1; + std::set sa, sb; auto toksA = tokenize(a); @@ -349,9 +358,83 @@ inline double jaccardSimi(const std::string& a, } // _____________________________________________________________________________ -template -inline std::string implode(const std::vector& vec, const char* del) { - return implode(vec.begin(), vec.end(), del); +inline double btsSimiInner(const std::vector& toks, + const std::string& b, double best) { + std::set toksSet; + toksSet.insert(toks.begin(), toks.end()); + std::vector toksUniqSorted; + toksUniqSorted.insert(toksUniqSorted.begin(), toksSet.begin(), toksSet.end()); + + assert(toksUniqSorted.size() <= 8); // subsets are encoded as 8-bit masks + // v runs over all non-empty token subsets: bit j of v selects token j + for (size_t v = 1; v < (size_t(1) << toksUniqSorted.size()); v++) { + std::bitset<8> bs(v); + std::vector cur(bs.count()); + + size_t i = 0; + for (size_t j = 0; j < toksUniqSorted.size(); j++) { + if (bs[j]) { + cur[i] = toksUniqSorted[j]; + i++; + } + } + + double tmp = util::implode(cur, " ").size(); + + // ed between the two strings will always be at least their length + // difference - if this is already too big, skip it right now + double dt = 1 - (fabs(tmp - b.size()) * 1.0) / (fmax(tmp, b.size()) * 1.0); + + if (dt <= best) continue; + + // cur is guaranteed to be sorted now + do { + const auto& comb = util::implode(cur, " "); + + double d = + 1 - ((editDist(comb, b) * 1.0) / (fmax(comb.size(), b.size()) * 1.0)); + + if (fabs(d - 1) < 0.0001) return 1; + + if (d > best) best = d; + } while (std::next_permutation(cur.begin(), cur.end())); + } + + return best;
+} + +// _____________________________________________________________________________ +inline double btsSimi(std::string a, std::string b) { + // this follows the implementation for the station similarity paper in + // https://github.com/ad-freiburg/statsimi/ + if (a == b) return 1; + + auto toksA = tokenize(a); + auto toksB = tokenize(b); + + // fallback to jaccard if the token set is too large + if (toksA.size() > 6 || toksB.size() > 6) { + return jaccardSimi(a, b); + } + + // make sure a is the string with at most as many tokens as b - its + // token subsets are tried first below + if (toksA.size() > toksB.size()) { + std::swap(a, b); + std::swap(toksA, toksB); + } + + // this is already our best known value - simply the edit + // distance similarity between the strings + double best = 1 - (editDist(a, b) * 1.0) / std::fmax(a.size(), b.size()); + + if (fabs(best) < 0.0001) return 0; + + best = btsSimiInner(toksA, b, best); + + if (fabs(best - 1) < 0.0001) return 1; + + return btsSimiInner(toksB, a, best); } } // namespace util diff --git a/src/util/tests/TestMain.cpp b/src/util/tests/TestMain.cpp index 63ac3ad..97cfb1c 100644 --- a/src/util/tests/TestMain.cpp +++ b/src/util/tests/TestMain.cpp @@ -55,6 +55,28 @@ int main(int argc, char** argv) { {0, 5}}, 0.1) == approx(5)); } + // ___________________________________________________________________________ + { + TEST(util::btsSimi("", ""), ==, approx(1)); + TEST(util::btsSimi("Hallo", "Test"), ==, approx(0)); + TEST(util::btsSimi("Test", "Hallo"), ==, approx(0)); + TEST(util::btsSimi("Test", "Test"), ==, approx(1)); + TEST(util::btsSimi("Milner Road / Wandlee Road", "Wandlee Road"), ==, approx(1)); + TEST(util::btsSimi("bla blubb blob", "blubb blib"), ==, approx(0.9)); + TEST(util::btsSimi("St Pancras International", "London St Pancras"), ==, approx(0.588235)); + TEST(util::btsSimi("Reiterstraße", "Reiterstraße Freiburg im Breisgau"), ==, approx(1)); + TEST(util::btsSimi("Reiterstraße", "Reiter Freiburg im Breisgau"), ==, approx(.466666666)); + TEST(util::btsSimi("AA", "Reiterstraße, Freiburg im
Breisgau"), ==, approx(0)); + TEST(util::btsSimi("blibb blabbel bla blubb blob", "blubb blib blabb"), ==, approx(0.875)); + TEST(util::btsSimi("blibb blabbel bla blubb blobo", "blubb blib blabb blabo"), ==, approx(0.84)); + TEST(util::btsSimi("blubb blib blabb", "blibb blabbel bla blubb blob"), ==, approx(0.875)); + TEST(util::btsSimi("blubbb blib blabb blobo", "blibb blabbel bla blubb blobo"), ==, approx(0.84)); + TEST(util::btsSimi("Reiter Freiburg im Breisgau", "Reiter Frei burg im Brei sgau"), ==, approx(0.931034)); + // fallback to jaccard + TEST(util::btsSimi("Freiburg im Breisgau, Germany, Main Railway Station", "Main Railway Station Freiburg im Breisgau, Germany"), ==, approx(1)); + + } + // ___________________________________________________________________________ { std::string test = u8"Zürich, Hauptbahnhof (Nord)";