bts classifier
This commit is contained in:
parent
749044ce97
commit
516f04b9c5
9 changed files with 219 additions and 13 deletions
|
@ -7,6 +7,8 @@
|
||||||
routing_transition_penalty_fac: 0.0083
|
routing_transition_penalty_fac: 0.0083
|
||||||
routing_station_move_penalty_fac: 0.002
|
routing_station_move_penalty_fac: 0.002
|
||||||
|
|
||||||
|
station_similarity_classification_method: jaccard
|
||||||
|
|
||||||
# Regular expressions and station comparison are
|
# Regular expressions and station comparison are
|
||||||
# always case insensitive!
|
# always case insensitive!
|
||||||
station_normalize_chain:
|
station_normalize_chain:
|
||||||
|
|
|
@ -61,6 +61,10 @@ using pfaedle::router::RouterImpl;
|
||||||
using pfaedle::router::ShapeBuilder;
|
using pfaedle::router::ShapeBuilder;
|
||||||
using pfaedle::router::Stats;
|
using pfaedle::router::Stats;
|
||||||
using pfaedle::statsimiclassifier::JaccardClassifier;
|
using pfaedle::statsimiclassifier::JaccardClassifier;
|
||||||
|
using pfaedle::statsimiclassifier::StatsimiClassifier;
|
||||||
|
using pfaedle::statsimiclassifier::BTSClassifier;
|
||||||
|
using pfaedle::statsimiclassifier::EDClassifier;
|
||||||
|
using pfaedle::statsimiclassifier::PEDClassifier;
|
||||||
|
|
||||||
enum class RetCode {
|
enum class RetCode {
|
||||||
SUCCESS = 0,
|
SUCCESS = 0,
|
||||||
|
@ -277,7 +281,21 @@ int main(int argc, char** argv) {
|
||||||
graphDimensions[filePost].second += nd->getAdjListOut().size();
|
graphDimensions[filePost].second += nd->getAdjListOut().size();
|
||||||
}
|
}
|
||||||
|
|
||||||
JaccardClassifier statsimiClassifier;
|
StatsimiClassifier* statsimiClassifier;
|
||||||
|
|
||||||
|
if (motCfg.routingOpts.statsimiMethod == "bts") {
|
||||||
|
statsimiClassifier = new BTSClassifier();
|
||||||
|
} else if (motCfg.routingOpts.statsimiMethod == "jaccard") {
|
||||||
|
statsimiClassifier = new JaccardClassifier();
|
||||||
|
} else if (motCfg.routingOpts.statsimiMethod == "ed") {
|
||||||
|
statsimiClassifier = new EDClassifier();
|
||||||
|
} else if (motCfg.routingOpts.statsimiMethod == "ped") {
|
||||||
|
statsimiClassifier = new PEDClassifier();
|
||||||
|
} else {
|
||||||
|
LOG(ERROR) << "Unknown station similarity classifier "
|
||||||
|
<< motCfg.routingOpts.statsimiMethod;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
Router* router = 0;
|
Router* router = 0;
|
||||||
|
|
||||||
|
@ -303,7 +321,7 @@ int main(int argc, char** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
ShapeBuilder shapeBuilder(>fs[0], usedMots, motCfg, &graph, &fStops,
|
ShapeBuilder shapeBuilder(>fs[0], usedMots, motCfg, &graph, &fStops,
|
||||||
&restr, &statsimiClassifier, router, cfg);
|
&restr, statsimiClassifier, router, cfg);
|
||||||
|
|
||||||
pfaedle::netgraph::Graph ng;
|
pfaedle::netgraph::Graph ng;
|
||||||
|
|
||||||
|
@ -326,6 +344,7 @@ int main(int argc, char** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (router) delete router;
|
if (router) delete router;
|
||||||
|
if (statsimiClassifier) delete statsimiClassifier;
|
||||||
|
|
||||||
if (cfg.writeGraph) {
|
if (cfg.writeGraph) {
|
||||||
LOG(INFO) << "Outputting graph.json...";
|
LOG(INFO) << "Outputting graph.json...";
|
||||||
|
|
|
@ -61,6 +61,13 @@ void MotConfigReader::parse(const std::vector<std::string>& paths,
|
||||||
cfg.routingOpts.transPenMethod = "exp";
|
cfg.routingOpts.transPenMethod = "exp";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (p.hasKey(secStr, "station_similarity_classification_method")) {
|
||||||
|
cfg.routingOpts.statsimiMethod =
|
||||||
|
p.getStr(secStr, "station_similarity_classification_method");
|
||||||
|
} else {
|
||||||
|
cfg.routingOpts.statsimiMethod = "bts";
|
||||||
|
}
|
||||||
|
|
||||||
if (p.hasKey(secStr, "routing_use_stations")) {
|
if (p.hasKey(secStr, "routing_use_stations")) {
|
||||||
cfg.routingOpts.useStations = p.getBool(secStr, "routing_use_stations");
|
cfg.routingOpts.useStations = p.getBool(secStr, "routing_use_stations");
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -62,6 +62,7 @@ struct RoutingOpts {
|
||||||
double transitionPen;
|
double transitionPen;
|
||||||
std::string transPenMethod;
|
std::string transPenMethod;
|
||||||
std::string emPenMethod;
|
std::string emPenMethod;
|
||||||
|
std::string statsimiMethod;
|
||||||
};
|
};
|
||||||
|
|
||||||
// _____________________________________________________________________________
|
// _____________________________________________________________________________
|
||||||
|
@ -82,6 +83,7 @@ inline bool operator==(const RoutingOpts& a, const RoutingOpts& b) {
|
||||||
fabs(a.nonStationPen - b.nonStationPen) < 0.01 &&
|
fabs(a.nonStationPen - b.nonStationPen) < 0.01 &&
|
||||||
a.transPenMethod == b.transPenMethod &&
|
a.transPenMethod == b.transPenMethod &&
|
||||||
a.emPenMethod == b.emPenMethod &&
|
a.emPenMethod == b.emPenMethod &&
|
||||||
|
a.statsimiMethod == b.statsimiMethod &&
|
||||||
a.useStations == b.useStations && a.popReachEdge == b.popReachEdge &&
|
a.useStations == b.useStations && a.popReachEdge == b.popReachEdge &&
|
||||||
a.noSelfHops == b.noSelfHops;
|
a.noSelfHops == b.noSelfHops;
|
||||||
}
|
}
|
||||||
|
|
|
@ -47,7 +47,6 @@ using pfaedle::router::ShapeBuilder;
|
||||||
using pfaedle::router::Stats;
|
using pfaedle::router::Stats;
|
||||||
using pfaedle::router::TripForests;
|
using pfaedle::router::TripForests;
|
||||||
using pfaedle::router::TripTrie;
|
using pfaedle::router::TripTrie;
|
||||||
using pfaedle::statsimiclassifier::JaccardClassifier;
|
|
||||||
using pfaedle::trgraph::EdgeGrid;
|
using pfaedle::trgraph::EdgeGrid;
|
||||||
using pfaedle::trgraph::NodeGrid;
|
using pfaedle::trgraph::NodeGrid;
|
||||||
using util::geo::latLngToWebMerc;
|
using util::geo::latLngToWebMerc;
|
||||||
|
|
|
@ -9,7 +9,10 @@
|
||||||
#include "pfaedle/statsimi-classifier/StatsimiClassifier.h"
|
#include "pfaedle/statsimi-classifier/StatsimiClassifier.h"
|
||||||
#include "util/geo/Geo.h"
|
#include "util/geo/Geo.h"
|
||||||
|
|
||||||
|
using pfaedle::statsimiclassifier::BTSClassifier;
|
||||||
|
using pfaedle::statsimiclassifier::EDClassifier;
|
||||||
using pfaedle::statsimiclassifier::JaccardClassifier;
|
using pfaedle::statsimiclassifier::JaccardClassifier;
|
||||||
|
using pfaedle::statsimiclassifier::PEDClassifier;
|
||||||
|
|
||||||
// _____________________________________________________________________________
|
// _____________________________________________________________________________
|
||||||
bool JaccardClassifier::similar(const std::string& nameA, const POINT& posA,
|
bool JaccardClassifier::similar(const std::string& nameA, const POINT& posA,
|
||||||
|
@ -23,8 +26,52 @@ bool JaccardClassifier::similar(const std::string& nameA, const POINT& posA,
|
||||||
// _____________________________________________________________________________
|
// _____________________________________________________________________________
|
||||||
bool JaccardClassifier::similar(const std::string& nameA,
|
bool JaccardClassifier::similar(const std::string& nameA,
|
||||||
const std::string& nameB) const {
|
const std::string& nameB) const {
|
||||||
// hard similarity
|
return util::jaccardSimi(nameA, nameB) > 0.45; // 0.45 from statsimi paper
|
||||||
if (nameA == nameB) return true;
|
}
|
||||||
|
|
||||||
return util::jaccardSimi(nameA, nameB) > 0.45; // 0.45 from paper
|
// _____________________________________________________________________________
|
||||||
|
bool BTSClassifier::similar(const std::string& nameA, const POINT& posA,
|
||||||
|
const std::string& nameB, const POINT& posB) const {
|
||||||
|
UNUSED(posA);
|
||||||
|
UNUSED(posB);
|
||||||
|
return similar(nameA, nameB);
|
||||||
|
}
|
||||||
|
|
||||||
|
// _____________________________________________________________________________
|
||||||
|
bool BTSClassifier::similar(const std::string& nameA,
|
||||||
|
const std::string& nameB) const {
|
||||||
|
return util::btsSimi(nameA, nameB) > 0.85; // 0.85 from statsimi paper
|
||||||
|
}
|
||||||
|
|
||||||
|
// _____________________________________________________________________________
|
||||||
|
bool EDClassifier::similar(const std::string& nameA, const POINT& posA,
|
||||||
|
const std::string& nameB, const POINT& posB) const {
|
||||||
|
UNUSED(posA);
|
||||||
|
UNUSED(posB);
|
||||||
|
return similar(nameA, nameB);
|
||||||
|
}
|
||||||
|
|
||||||
|
// _____________________________________________________________________________
|
||||||
|
bool EDClassifier::similar(const std::string& nameA,
|
||||||
|
const std::string& nameB) const {
|
||||||
|
double edSimi = 1.0 - ((util::editDist(nameA, nameB) * 1.0) /
|
||||||
|
fmax(nameA.size(), nameB.size()));
|
||||||
|
return edSimi > 0.85; // 0.85 from statsimi paper
|
||||||
|
}
|
||||||
|
|
||||||
|
// _____________________________________________________________________________
|
||||||
|
bool PEDClassifier::similar(const std::string& nameA, const POINT& posA,
|
||||||
|
const std::string& nameB, const POINT& posB) const {
|
||||||
|
UNUSED(posA);
|
||||||
|
UNUSED(posB);
|
||||||
|
return similar(nameA, nameB);
|
||||||
|
}
|
||||||
|
|
||||||
|
// _____________________________________________________________________________
|
||||||
|
bool PEDClassifier::similar(const std::string& nameA,
|
||||||
|
const std::string& nameB) const {
|
||||||
|
double a = (util::prefixEditDist(nameA, nameB) * 1.0) / (nameA.size() * 1.0);
|
||||||
|
double b = (util::prefixEditDist(nameB, nameA) * 1.0) / (nameB.size() * 1.0);
|
||||||
|
double pedSimi = 1.0 - fmin(a, b);
|
||||||
|
return pedSimi > 0.875; // 0.875 average of values from statsimi paper
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,6 +14,7 @@ namespace statsimiclassifier {
|
||||||
|
|
||||||
class StatsimiClassifier {
|
class StatsimiClassifier {
|
||||||
public:
|
public:
|
||||||
|
virtual ~StatsimiClassifier() {}
|
||||||
virtual bool similar(const std::string& nameA, const POINT& posA,
|
virtual bool similar(const std::string& nameA, const POINT& posA,
|
||||||
const std::string& nameB, const POINT& posB) const = 0;
|
const std::string& nameB, const POINT& posB) const = 0;
|
||||||
|
|
||||||
|
@ -29,6 +30,30 @@ class JaccardClassifier : public StatsimiClassifier {
|
||||||
const std::string& nameB) const;
|
const std::string& nameB) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class BTSClassifier : public StatsimiClassifier {
|
||||||
|
public:
|
||||||
|
virtual bool similar(const std::string& nameA, const POINT& posA,
|
||||||
|
const std::string& nameB, const POINT& posB) const;
|
||||||
|
virtual bool similar(const std::string& nameA,
|
||||||
|
const std::string& nameB) const;
|
||||||
|
};
|
||||||
|
|
||||||
|
class EDClassifier : public StatsimiClassifier {
|
||||||
|
public:
|
||||||
|
virtual bool similar(const std::string& nameA, const POINT& posA,
|
||||||
|
const std::string& nameB, const POINT& posB) const;
|
||||||
|
virtual bool similar(const std::string& nameA,
|
||||||
|
const std::string& nameB) const;
|
||||||
|
};
|
||||||
|
|
||||||
|
class PEDClassifier : public StatsimiClassifier {
|
||||||
|
public:
|
||||||
|
virtual bool similar(const std::string& nameA, const POINT& posA,
|
||||||
|
const std::string& nameB, const POINT& posB) const;
|
||||||
|
virtual bool similar(const std::string& nameA,
|
||||||
|
const std::string& nameB) const;
|
||||||
|
};
|
||||||
|
|
||||||
} // namespace statsimiclassifier
|
} // namespace statsimiclassifier
|
||||||
} // namespace pfaedle
|
} // namespace pfaedle
|
||||||
|
|
||||||
|
|
|
@ -6,17 +6,19 @@
|
||||||
#define UTIL_STRING_H_
|
#define UTIL_STRING_H_
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <bitset>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <cmath>
|
||||||
#include <codecvt>
|
#include <codecvt>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <exception>
|
#include <exception>
|
||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <locale>
|
#include <locale>
|
||||||
|
#include <set>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <set>
|
|
||||||
|
|
||||||
namespace util {
|
namespace util {
|
||||||
|
|
||||||
|
@ -273,6 +275,12 @@ inline std::string implode(Iter begin, const Iter& end, const char* del) {
|
||||||
return ss.str();
|
return ss.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// _____________________________________________________________________________
|
||||||
|
// Convenience overload: joins all elements of vec, separated by del, by
// forwarding the vector's full range to the iterator-based implode().
template <class T>
inline std::string implode(const std::vector<T>& vec, const char* del) {
  const auto first = vec.begin();
  const auto last = vec.end();
  return implode(first, last, del);
}
|
||||||
|
|
||||||
// _____________________________________________________________________________
|
// _____________________________________________________________________________
|
||||||
inline std::string normalizeWhiteSpace(const std::string& input) {
|
inline std::string normalizeWhiteSpace(const std::string& input) {
|
||||||
std::string ret;
|
std::string ret;
|
||||||
|
@ -324,8 +332,9 @@ inline std::vector<std::string> tokenize(const std::string& str) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// _____________________________________________________________________________
|
// _____________________________________________________________________________
|
||||||
inline double jaccardSimi(const std::string& a,
|
inline double jaccardSimi(const std::string& a, const std::string& b) {
|
||||||
const std::string& b) {
|
if (a == b) return 1;
|
||||||
|
|
||||||
std::set<std::string> sa, sb;
|
std::set<std::string> sa, sb;
|
||||||
|
|
||||||
auto toksA = tokenize(a);
|
auto toksA = tokenize(a);
|
||||||
|
@ -349,9 +358,83 @@ inline double jaccardSimi(const std::string& a,
|
||||||
}
|
}
|
||||||
|
|
||||||
// _____________________________________________________________________________
|
// _____________________________________________________________________________
|
||||||
template <class T>
|
inline double btsSimiInner(const std::vector<std::string>& toks,
|
||||||
inline std::string implode(const std::vector<T>& vec, const char* del) {
|
const std::string& b, double best) {
|
||||||
return implode(vec.begin(), vec.end(), del);
|
std::set<std::string> toksSet;
|
||||||
|
toksSet.insert(toks.begin(), toks.end());
|
||||||
|
std::vector<std::string> toksUniqSorted;
|
||||||
|
toksUniqSorted.insert(toksUniqSorted.begin(), toksSet.begin(), toksSet.end());
|
||||||
|
|
||||||
|
assert(toksUniqSorted.size() <= 8);
|
||||||
|
|
||||||
|
for (uint8_t v = 1; v <= pow(2, toksUniqSorted.size()); v++) {
|
||||||
|
std::bitset<8> bs(v);
|
||||||
|
std::vector<std::string> cur(bs.count());
|
||||||
|
|
||||||
|
size_t i = 0;
|
||||||
|
for (size_t j = 0; j < toksUniqSorted.size(); j++) {
|
||||||
|
if (bs[j]) {
|
||||||
|
cur[i] = toksUniqSorted[j];
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double tmp = util::implode(cur, " ").size();
|
||||||
|
|
||||||
|
// ed between the two string will always be at least their length
|
||||||
|
// difference - if this is already too big, skip it right now
|
||||||
|
double dt = 1 - (fabs(tmp - b.size()) * 1.0) / (fmax(tmp, b.size()) * 1.0);
|
||||||
|
|
||||||
|
if (dt <= best) continue;
|
||||||
|
|
||||||
|
// cur is guaranteed to be sorted now
|
||||||
|
do {
|
||||||
|
const auto& comb = util::implode(cur, " ");
|
||||||
|
|
||||||
|
double d =
|
||||||
|
1 - ((editDist(comb, b) * 1.0) / (fmax(comb.size(), b.size()) * 1.0));
|
||||||
|
|
||||||
|
if (fabs(d - 1) < 0.0001) return 1;
|
||||||
|
|
||||||
|
if (d > best) best = d;
|
||||||
|
} while (std::next_permutation(cur.begin(), cur.end()));
|
||||||
|
}
|
||||||
|
|
||||||
|
return best;
|
||||||
|
}
|
||||||
|
|
||||||
|
// _____________________________________________________________________________
|
||||||
|
inline double btsSimi(std::string a, std::string b) {
|
||||||
|
// this follows the implementation for the station similarity paper in
|
||||||
|
// https://github.com/ad-freiburg/statsimi/
|
||||||
|
if (a == b) return 1;
|
||||||
|
|
||||||
|
std::set<std::string> sa, sb;
|
||||||
|
|
||||||
|
auto toksA = tokenize(a);
|
||||||
|
auto toksB = tokenize(b);
|
||||||
|
|
||||||
|
// fallback to jaccard if the token set is too large
|
||||||
|
if (toksA.size() > 6 || toksB.size() > 6) {
|
||||||
|
return jaccardSimi(a, b);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (toksA.size() > toksB.size()) {
|
||||||
|
std::swap(a, b);
|
||||||
|
std::swap(toksA, toksB);
|
||||||
|
}
|
||||||
|
|
||||||
|
// this is already our best known value - simply the edit
|
||||||
|
// distance similarity between the strings
|
||||||
|
double best = 1 - (editDist(a, b) * 1.0) / std::fmax(a.size(), b.size());
|
||||||
|
|
||||||
|
if (fabs(best) < 0.0001) return 0;
|
||||||
|
|
||||||
|
best = btsSimiInner(toksA, b, best);
|
||||||
|
|
||||||
|
if (fabs(best - 1) < 0.0001) return 1;
|
||||||
|
|
||||||
|
return btsSimiInner(toksB, a, best);
|
||||||
}
|
}
|
||||||
} // namespace util
|
} // namespace util
|
||||||
|
|
||||||
|
|
|
@ -55,6 +55,28 @@ int main(int argc, char** argv) {
|
||||||
{0, 5}}, 0.1) == approx(5));
|
{0, 5}}, 0.1) == approx(5));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ___________________________________________________________________________
|
||||||
|
{
|
||||||
|
TEST(util::btsSimi("", ""), ==, approx(1));
|
||||||
|
TEST(util::btsSimi("Hallo", "Test"), ==, approx(0));
|
||||||
|
TEST(util::btsSimi("Test", "Hallo"), ==, approx(0));
|
||||||
|
TEST(util::btsSimi("Test", "Test"), ==, approx(1));
|
||||||
|
TEST(util::btsSimi("Milner Road / Wandlee Road", "Wandlee Road"), ==, approx(1));
|
||||||
|
TEST(util::btsSimi("bla blubb blob", "blubb blib"), ==, approx(0.9));
|
||||||
|
TEST(util::btsSimi("St Pancras International", "London St Pancras"), ==, approx(0.588235));
|
||||||
|
TEST(util::btsSimi("Reiterstraße", "Reiterstraße Freiburg im Breisgau"), ==, approx(1));
|
||||||
|
TEST(util::btsSimi("Reiterstraße", "Reiter Freiburg im Breisgau"), ==, approx(.466666666));
|
||||||
|
TEST(util::btsSimi("AA", "Reiterstraße, Freiburg im Breisgau"), ==, approx(0));
|
||||||
|
TEST(util::btsSimi("blibb blabbel bla blubb blob", "blubb blib blabb"), ==, approx(0.875));
|
||||||
|
TEST(util::btsSimi("blibb blabbel bla blubb blobo", "blubb blib blabb blabo"), ==, approx(0.84));
|
||||||
|
TEST(util::btsSimi("blubb blib blabb", "blibb blabbel bla blubb blob"), ==, approx(0.875));
|
||||||
|
TEST(util::btsSimi("blubbb blib blabb blobo", "blibb blabbel bla blubb blobo"), ==, approx(0.84));
|
||||||
|
TEST(util::btsSimi("Reiter Freiburg im Breisgau", "Reiter Frei burg im Brei sgau"), ==, approx(0.931034));
|
||||||
|
// fallback to jaccard
|
||||||
|
TEST(util::btsSimi("Freiburg im Breisgau, Germany, Main Railway Station", "Main Railway Station Freiburg im Breisgau, Germany"), ==, approx(1));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
// ___________________________________________________________________________
|
// ___________________________________________________________________________
|
||||||
{
|
{
|
||||||
std::string test = u8"Zürich, Hauptbahnhof (Nord)";
|
std::string test = u8"Zürich, Hauptbahnhof (Nord)";
|
||||||
|
|
Loading…
Reference in a new issue