bts classifier

This commit is contained in:
Patrick Brosi 2022-01-13 21:41:52 +01:00
parent 749044ce97
commit 516f04b9c5
9 changed files with 219 additions and 13 deletions

View file

@ -7,6 +7,8 @@
routing_transition_penalty_fac: 0.0083
routing_station_move_penalty_fac: 0.002
station_similarity_classification_method: jaccard
# Regular expressions and station comparison are
# always case insensitive!
station_normalize_chain:

View file

@ -61,6 +61,10 @@ using pfaedle::router::RouterImpl;
using pfaedle::router::ShapeBuilder;
using pfaedle::router::Stats;
using pfaedle::statsimiclassifier::JaccardClassifier;
using pfaedle::statsimiclassifier::StatsimiClassifier;
using pfaedle::statsimiclassifier::BTSClassifier;
using pfaedle::statsimiclassifier::EDClassifier;
using pfaedle::statsimiclassifier::PEDClassifier;
enum class RetCode {
SUCCESS = 0,
@ -277,7 +281,21 @@ int main(int argc, char** argv) {
graphDimensions[filePost].second += nd->getAdjListOut().size();
}
JaccardClassifier statsimiClassifier;
StatsimiClassifier* statsimiClassifier;
if (motCfg.routingOpts.statsimiMethod == "bts") {
statsimiClassifier = new BTSClassifier();
} else if (motCfg.routingOpts.statsimiMethod == "jaccard") {
statsimiClassifier = new JaccardClassifier();
} else if (motCfg.routingOpts.statsimiMethod == "ed") {
statsimiClassifier = new EDClassifier();
} else if (motCfg.routingOpts.statsimiMethod == "ped") {
statsimiClassifier = new PEDClassifier();
} else {
LOG(ERROR) << "Unknown station similarity classifier "
<< motCfg.routingOpts.statsimiMethod;
exit(1);
}
Router* router = 0;
@ -303,7 +321,7 @@ int main(int argc, char** argv) {
}
ShapeBuilder shapeBuilder(&gtfs[0], usedMots, motCfg, &graph, &fStops,
&restr, &statsimiClassifier, router, cfg);
&restr, statsimiClassifier, router, cfg);
pfaedle::netgraph::Graph ng;
@ -326,6 +344,7 @@ int main(int argc, char** argv) {
}
if (router) delete router;
if (statsimiClassifier) delete statsimiClassifier;
if (cfg.writeGraph) {
LOG(INFO) << "Outputting graph.json...";

View file

@ -61,6 +61,13 @@ void MotConfigReader::parse(const std::vector<std::string>& paths,
cfg.routingOpts.transPenMethod = "exp";
}
if (p.hasKey(secStr, "station_similarity_classification_method")) {
cfg.routingOpts.statsimiMethod =
p.getStr(secStr, "station_similarity_classification_method");
} else {
cfg.routingOpts.statsimiMethod = "bts";
}
if (p.hasKey(secStr, "routing_use_stations")) {
cfg.routingOpts.useStations = p.getBool(secStr, "routing_use_stations");
} else {

View file

@ -62,6 +62,7 @@ struct RoutingOpts {
double transitionPen;
std::string transPenMethod;
std::string emPenMethod;
std::string statsimiMethod;
};
// _____________________________________________________________________________
@ -82,6 +83,7 @@ inline bool operator==(const RoutingOpts& a, const RoutingOpts& b) {
fabs(a.nonStationPen - b.nonStationPen) < 0.01 &&
a.transPenMethod == b.transPenMethod &&
a.emPenMethod == b.emPenMethod &&
a.statsimiMethod == b.statsimiMethod &&
a.useStations == b.useStations && a.popReachEdge == b.popReachEdge &&
a.noSelfHops == b.noSelfHops;
}

View file

@ -47,7 +47,6 @@ using pfaedle::router::ShapeBuilder;
using pfaedle::router::Stats;
using pfaedle::router::TripForests;
using pfaedle::router::TripTrie;
using pfaedle::statsimiclassifier::JaccardClassifier;
using pfaedle::trgraph::EdgeGrid;
using pfaedle::trgraph::NodeGrid;
using util::geo::latLngToWebMerc;

View file

@ -9,7 +9,10 @@
#include "pfaedle/statsimi-classifier/StatsimiClassifier.h"
#include "util/geo/Geo.h"
using pfaedle::statsimiclassifier::BTSClassifier;
using pfaedle::statsimiclassifier::EDClassifier;
using pfaedle::statsimiclassifier::JaccardClassifier;
using pfaedle::statsimiclassifier::PEDClassifier;
// _____________________________________________________________________________
bool JaccardClassifier::similar(const std::string& nameA, const POINT& posA,
@ -23,8 +26,52 @@ bool JaccardClassifier::similar(const std::string& nameA, const POINT& posA,
// _____________________________________________________________________________
// Name-only similarity check: identical names are always similar, otherwise
// the names are similar if their util::jaccardSimi() value exceeds 0.45.
bool JaccardClassifier::similar(const std::string& nameA,
                                const std::string& nameB) const {
  // hard similarity
  if (nameA == nameB) return true;

  return util::jaccardSimi(nameA, nameB) > 0.45;  // 0.45 from statsimi paper
}
// _____________________________________________________________________________
bool BTSClassifier::similar(const std::string& nameA, const POINT& posA,
const std::string& nameB, const POINT& posB) const {
UNUSED(posA);
UNUSED(posB);
return similar(nameA, nameB);
}
// _____________________________________________________________________________
// Name-only similarity check: the names are similar if their util::btsSimi()
// value exceeds 0.85.
bool BTSClassifier::similar(const std::string& nameA,
                            const std::string& nameB) const {
  // 0.85 from statsimi paper
  if (util::btsSimi(nameA, nameB) > 0.85) {
    return true;
  }
  return false;
}
// _____________________________________________________________________________
bool EDClassifier::similar(const std::string& nameA, const POINT& posA,
const std::string& nameB, const POINT& posB) const {
UNUSED(posA);
UNUSED(posB);
return similar(nameA, nameB);
}
// _____________________________________________________________________________
// Name-only similarity check based on the edit distance, normalised by the
// length of the longer name: similar if
//   1 - editDist(nameA, nameB) / max(|nameA|, |nameB|)  >  0.85.
// Identical names (including two empty names) are always similar.
bool EDClassifier::similar(const std::string& nameA,
                           const std::string& nameB) const {
  // hard similarity - this also covers two empty names, for which the
  // normalisation below would divide by zero and yield NaN
  if (nameA == nameB) return true;

  // the names differ here, so at least one of them is non-empty and the
  // divisor is > 0
  const double maxLen = fmax(nameA.size(), nameB.size());
  const double edSimi = 1.0 - (util::editDist(nameA, nameB) * 1.0) / maxLen;

  return edSimi > 0.85;  // 0.85 from statsimi paper
}
// _____________________________________________________________________________
bool PEDClassifier::similar(const std::string& nameA, const POINT& posA,
const std::string& nameB, const POINT& posB) const {
UNUSED(posA);
UNUSED(posB);
return similar(nameA, nameB);
}
// _____________________________________________________________________________
// Name-only similarity check based on the prefix edit distance. The distance
// is computed in both directions, each normalised by the length of its first
// argument; the smaller ratio is used. Similar if 1 - min(ratio) > 0.875.
// Identical names (including two empty names) are always similar; an empty
// name is never similar to a non-empty one.
bool PEDClassifier::similar(const std::string& nameA,
                            const std::string& nameB) const {
  // hard similarity - this also covers two empty names, for which both ratios
  // below would be 0 / 0
  if (nameA == nameB) return true;

  // the names differ here, so at most one of them is empty; an empty name
  // cannot be normalised by its own length (division by zero)
  // NOTE(review): assumes the original NaN-based result for this case was
  // "not similar" - confirm against util::prefixEditDist
  if (nameA.empty() || nameB.empty()) return false;

  const double a =
      (util::prefixEditDist(nameA, nameB) * 1.0) / (nameA.size() * 1.0);
  const double b =
      (util::prefixEditDist(nameB, nameA) * 1.0) / (nameB.size() * 1.0);

  const double pedSimi = 1.0 - fmin(a, b);
  return pedSimi > 0.875;  // 0.875 average of values from statsimi paper
}

View file

@ -14,6 +14,7 @@ namespace statsimiclassifier {
class StatsimiClassifier {
public:
virtual ~StatsimiClassifier() {}
virtual bool similar(const std::string& nameA, const POINT& posA,
const std::string& nameB, const POINT& posB) const = 0;
@ -29,6 +30,30 @@ class JaccardClassifier : public StatsimiClassifier {
const std::string& nameB) const;
};
// StatsimiClassifier implementation which decides on station similarity via
// util::btsSimi() (threshold 0.85 in the name-only overload).
class BTSClassifier : public StatsimiClassifier {
public:
// Position-aware check; the implementation ignores posA and posB and
// delegates to the name-only overload.
virtual bool similar(const std::string& nameA, const POINT& posA,
const std::string& nameB, const POINT& posB) const;
// Name-only check.
virtual bool similar(const std::string& nameA,
const std::string& nameB) const;
};
// StatsimiClassifier implementation which decides on station similarity via
// the edit distance, normalised by the longer name's length (threshold 0.85 in
// the name-only overload).
class EDClassifier : public StatsimiClassifier {
public:
// Position-aware check; the implementation ignores posA and posB and
// delegates to the name-only overload.
virtual bool similar(const std::string& nameA, const POINT& posA,
const std::string& nameB, const POINT& posB) const;
// Name-only check.
virtual bool similar(const std::string& nameA,
const std::string& nameB) const;
};
// StatsimiClassifier implementation which decides on station similarity via
// the prefix edit distance, evaluated in both directions (threshold 0.875 in
// the name-only overload).
class PEDClassifier : public StatsimiClassifier {
public:
// Position-aware check; the implementation ignores posA and posB and
// delegates to the name-only overload.
virtual bool similar(const std::string& nameA, const POINT& posA,
const std::string& nameB, const POINT& posB) const;
// Name-only check.
virtual bool similar(const std::string& nameA,
const std::string& nameB) const;
};
} // namespace statsimiclassifier
} // namespace pfaedle

View file

@ -6,17 +6,19 @@
#define UTIL_STRING_H_
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cmath>
#include <codecvt>
#include <cstring>
#include <exception>
#include <iomanip>
#include <iostream>
#include <locale>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#include <set>
namespace util {
@ -273,6 +275,12 @@ inline std::string implode(Iter begin, const Iter& end, const char* del) {
return ss.str();
}
// _____________________________________________________________________________
// Convenience overload: forwards the complete range of vec, together with the
// delimiter del, to the iterator-based implode().
template <class T>
inline std::string implode(const std::vector<T>& vec, const char* del) {
  const auto first = vec.begin();
  const auto last = vec.end();
  return implode(first, last, del);
}
// _____________________________________________________________________________
inline std::string normalizeWhiteSpace(const std::string& input) {
std::string ret;
@ -324,8 +332,9 @@ inline std::vector<std::string> tokenize(const std::string& str) {
}
// _____________________________________________________________________________
inline double jaccardSimi(const std::string& a,
const std::string& b) {
inline double jaccardSimi(const std::string& a, const std::string& b) {
if (a == b) return 1;
std::set<std::string> sa, sb;
auto toksA = tokenize(a);
@ -349,9 +358,83 @@ inline double jaccardSimi(const std::string& a,
}
// _____________________________________________________________________________
template <class T>
inline std::string implode(const std::vector<T>& vec, const char* del) {
return implode(vec.begin(), vec.end(), del);
// Helper for btsSimi(). Enumerates every non-empty subset of the distinct
// tokens in toks and, for each subset, every ordering of its tokens joined by
// a single space. Each candidate is compared to b via the edit distance
// similarity 1 - editDist / max(len).
//
// toks: tokens to combine; must contain at most 8 distinct tokens (asserted)
// b:    string the combinations are compared against
// best: best similarity known so far
//
// Returns the maximum of best and the best candidate similarity, or 1 as soon
// as a (near) perfect match is found.
inline double btsSimiInner(const std::vector<std::string>& toks,
                           const std::string& b, double best) {
  // deduplicate and sort the tokens (std::set iterates in sorted order)
  const std::set<std::string> toksSet(toks.begin(), toks.end());
  const std::vector<std::string> toksUniqSorted(toksSet.begin(), toksSet.end());

  assert(toksUniqSorted.size() <= 8);

  // Every bit mask in [1, 2^n) selects one non-empty token subset. The counter
  // is wider than 8 bits so that n == 8 terminates (an 8-bit counter can never
  // exceed 2^8), and the bound is exclusive because the mask 2^n selects no
  // token at all.
  const unsigned int numMasks = 1u << toksUniqSorted.size();

  for (unsigned int v = 1; v < numMasks; v++) {
    const std::bitset<8> bs(v);
    std::vector<std::string> cur(bs.count());

    size_t i = 0;
    for (size_t j = 0; j < toksUniqSorted.size(); j++) {
      if (bs[j]) {
        cur[i] = toksUniqSorted[j];
        i++;
      }
    }

    // the joined length is the same for every ordering of cur
    const double tmp = util::implode(cur, " ").size();

    // ed between the two strings will always be at least their length
    // difference - if this is already too big, skip it right now
    const double dt =
        1 - (fabs(tmp - b.size()) * 1.0) / (fmax(tmp, b.size()) * 1.0);
    if (dt <= best) continue;

    // cur is guaranteed to be sorted now, so next_permutation() visits every
    // ordering exactly once
    do {
      const std::string comb = util::implode(cur, " ");

      const double d =
          1 - ((editDist(comb, b) * 1.0) / (fmax(comb.size(), b.size()) * 1.0));

      if (fabs(d - 1) < 0.0001) return 1;

      if (d > best) best = d;
    } while (std::next_permutation(cur.begin(), cur.end()));
  }

  return best;
}
// _____________________________________________________________________________
// Similarity of a and b in [0, 1]. Starting from the plain edit distance
// similarity of the two strings, the result is improved by comparing
// combinations of the tokens of one string against the other string, in both
// directions (see btsSimiInner()). If either string has more than 6 tokens,
// jaccardSimi() is used instead.
//
// a and b are taken by value because they may be swapped locally.
inline double btsSimi(std::string a, std::string b) {
  // this follows the implementation for the station similarity paper in
  // https://github.com/ad-freiburg/statsimi/
  if (a == b) return 1;

  auto toksA = tokenize(a);
  auto toksB = tokenize(b);

  // fallback to jaccard if the token set is too large
  if (toksA.size() > 6 || toksB.size() > 6) {
    return jaccardSimi(a, b);
  }

  // make a the string with the smaller number of tokens
  if (toksA.size() > toksB.size()) {
    std::swap(a, b);
    std::swap(toksA, toksB);
  }

  // this is already our best known value - simply the edit
  // distance similarity between the strings (a != b here, so at least one of
  // them is non-empty and the divisor is > 0)
  double best = 1 - (editDist(a, b) * 1.0) / std::fmax(a.size(), b.size());

  if (fabs(best) < 0.0001) return 0;

  best = btsSimiInner(toksA, b, best);

  if (fabs(best - 1) < 0.0001) return 1;

  return btsSimiInner(toksB, a, best);
}
} // namespace util

View file

@ -55,6 +55,28 @@ int main(int argc, char** argv) {
{0, 5}}, 0.1) == approx(5));
}
// ___________________________________________________________________________
{
TEST(util::btsSimi("", ""), ==, approx(1));
TEST(util::btsSimi("Hallo", "Test"), ==, approx(0));
TEST(util::btsSimi("Test", "Hallo"), ==, approx(0));
TEST(util::btsSimi("Test", "Test"), ==, approx(1));
TEST(util::btsSimi("Milner Road / Wandlee Road", "Wandlee Road"), ==, approx(1));
TEST(util::btsSimi("bla blubb blob", "blubb blib"), ==, approx(0.9));
TEST(util::btsSimi("St Pancras International", "London St Pancras"), ==, approx(0.588235));
TEST(util::btsSimi("Reiterstraße", "Reiterstraße Freiburg im Breisgau"), ==, approx(1));
TEST(util::btsSimi("Reiterstraße", "Reiter Freiburg im Breisgau"), ==, approx(.466666666));
TEST(util::btsSimi("AA", "Reiterstraße, Freiburg im Breisgau"), ==, approx(0));
TEST(util::btsSimi("blibb blabbel bla blubb blob", "blubb blib blabb"), ==, approx(0.875));
TEST(util::btsSimi("blibb blabbel bla blubb blobo", "blubb blib blabb blabo"), ==, approx(0.84));
TEST(util::btsSimi("blubb blib blabb", "blibb blabbel bla blubb blob"), ==, approx(0.875));
TEST(util::btsSimi("blubbb blib blabb blobo", "blibb blabbel bla blubb blobo"), ==, approx(0.84));
TEST(util::btsSimi("Reiter Freiburg im Breisgau", "Reiter Frei burg im Brei sgau"), ==, approx(0.931034));
// fallback to jaccard
TEST(util::btsSimi("Freiburg im Breisgau, Germany, Main Railway Station", "Main Railway Station Freiburg im Breisgau, Germany"), ==, approx(1));
}
// ___________________________________________________________________________
{
std::string test = u8"Zürich, Hauptbahnhof (Nord)";