initial commit
This commit is contained in:
commit
efcd3e1892
106 changed files with 27000 additions and 0 deletions
302
src/pfaedle/osm/OsmIdSet.cpp
Normal file
302
src/pfaedle/osm/OsmIdSet.cpp
Normal file
|
|
@ -0,0 +1,302 @@
|
|||
// Copyright 2018, University of Freiburg,
|
||||
// Chair of Algorithms and Data Structures.
|
||||
// Authors: Patrick Brosi <brosi@informatik.uni-freiburg.de>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <climits>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <exception>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include "pfaedle/osm/OsmIdSet.h"
|
||||
|
||||
using pfaedle::osm::OsmIdSet;
|
||||
|
||||
size_t OsmIdSet::LOOKUPS = 0;
|
||||
size_t OsmIdSet::FLOOKUPS = 0;
|
||||
|
||||
// _____________________________________________________________________________
|
||||
OsmIdSet::OsmIdSet()
|
||||
: _closed(false),
|
||||
_sorted(true),
|
||||
_last(0),
|
||||
_smallest(-1),
|
||||
_biggest(0),
|
||||
_obufpos(0),
|
||||
_curBlock(-1),
|
||||
_fsize(0) {
|
||||
_bitset = new std::bitset<BLOOMF_BITS>();
|
||||
_file = openTmpFile();
|
||||
|
||||
_buffer = new unsigned char[BUFFER_S];
|
||||
_outBuffer = new unsigned char[OBUFFER_S];
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
OsmIdSet::~OsmIdSet() {
|
||||
delete _bitset;
|
||||
delete[] _buffer;
|
||||
if (!_closed) delete[] _outBuffer;
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
void OsmIdSet::add(osmid id) {
|
||||
if (_closed) throw std::exception();
|
||||
diskAdd(id);
|
||||
// _set.insert(id);
|
||||
|
||||
if (_last > id) _sorted = false;
|
||||
_last = id;
|
||||
if (id < _smallest) _smallest = id;
|
||||
if (id > _biggest) _biggest = id;
|
||||
|
||||
for (int i = 0; i < 10; i++) (*_bitset)[hash(id, i)] = 1;
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
void OsmIdSet::diskAdd(osmid id) {
|
||||
memcpy(_outBuffer + _obufpos, &id, 8);
|
||||
|
||||
_obufpos += 8;
|
||||
|
||||
if (_obufpos % BUFFER_S == 0) {
|
||||
// this is the last value in this block
|
||||
_blockEnds.push_back(id);
|
||||
}
|
||||
|
||||
if (_obufpos >= OBUFFER_S) {
|
||||
ssize_t w = cwrite(_file, _outBuffer, OBUFFER_S);
|
||||
_fsize += w;
|
||||
_obufpos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
size_t OsmIdSet::getBlock(osmid id) const {
|
||||
auto it = std::upper_bound(_blockEnds.begin(), _blockEnds.end(), id);
|
||||
return (it - _blockEnds.begin());
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
bool OsmIdSet::diskHas(osmid id) const {
|
||||
assert(_sorted);
|
||||
|
||||
if (std::find(_blockEnds.begin(), _blockEnds.end(), id) != _blockEnds.end()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t block = getBlock(id);
|
||||
|
||||
if (block != _curBlock) {
|
||||
lseek(_file, block * BUFFER_S, SEEK_SET);
|
||||
|
||||
ssize_t n = cread(_file, _buffer, BUFFER_S);
|
||||
_curBlockSize = n;
|
||||
FLOOKUPS++;
|
||||
_curBlock = block;
|
||||
}
|
||||
|
||||
if (_curBlockSize <= 7) return false;
|
||||
if (*(reinterpret_cast<uint64_t*>(_buffer)) > id) return false;
|
||||
|
||||
ssize_t l = 0;
|
||||
ssize_t r = _curBlockSize - 8;
|
||||
|
||||
while (l <= r) {
|
||||
unsigned char* p = _buffer + (l + ((r - l) / 16) * 8);
|
||||
osmid cur = *(reinterpret_cast<uint64_t*>(p));
|
||||
if (cur == id) return true;
|
||||
if (cur < id)
|
||||
l = (p - _buffer) + 8;
|
||||
else
|
||||
r = (p - _buffer) - 8;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
bool OsmIdSet::has(osmid id) const {
|
||||
LOOKUPS++;
|
||||
if (!_closed) close();
|
||||
|
||||
if (id < _smallest || id > _biggest) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
if ((*_bitset)[hash(id, i)] == 0) return false;
|
||||
}
|
||||
|
||||
bool has = diskHas(id);
|
||||
// assert(has == (bool)_set.count(id));
|
||||
return has;
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
void OsmIdSet::close() const {
|
||||
ssize_t w = cwrite(_file, _outBuffer, _obufpos);
|
||||
_fsize += w;
|
||||
_blockEnds.push_back(_biggest);
|
||||
delete[] _outBuffer;
|
||||
_closed = true;
|
||||
|
||||
// if order was not sorted, sort now
|
||||
if (!_sorted) sort();
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
void OsmIdSet::sort() const {
|
||||
// sort file via an external merge sort
|
||||
|
||||
_blockEnds.clear();
|
||||
size_t parts = _fsize / SORT_BUFFER_S + 1;
|
||||
size_t partsBufSize = ((SORT_BUFFER_S / 8) / parts + 1) * 8;
|
||||
|
||||
unsigned char* buf = new unsigned char[SORT_BUFFER_S];
|
||||
unsigned char** partbufs = new unsigned char*[parts];
|
||||
size_t* partpos = new size_t[parts];
|
||||
size_t* partsize = new size_t[parts];
|
||||
|
||||
// sort the 'parts' number of file parts independently
|
||||
for (size_t i = 0; i < parts; i++) {
|
||||
partbufs[i] = new unsigned char[partsBufSize];
|
||||
partpos[i] = 0;
|
||||
partsize[i] = 0;
|
||||
lseek(_file, SORT_BUFFER_S * i, SEEK_SET);
|
||||
ssize_t n = read(_file, buf, SORT_BUFFER_S);
|
||||
if (n < 0) continue;
|
||||
qsort(buf, n / 8, 8, qsortCmp);
|
||||
lseek(_file, SORT_BUFFER_S * i, SEEK_SET);
|
||||
cwrite(_file, buf, n);
|
||||
|
||||
memcpy(partbufs[i], buf, std::min<size_t>(n, partsBufSize));
|
||||
partsize[i] = n;
|
||||
}
|
||||
|
||||
// now the individial parts are sorted
|
||||
int newFile = openTmpFile();
|
||||
|
||||
for (size_t i = 0; i < _fsize; i += 8) {
|
||||
uint64_t smallest = UINT64_MAX;
|
||||
ssize_t smallestP = -1;
|
||||
|
||||
// look for smallest element (not optimal, but running time is not
|
||||
// really critical here)
|
||||
for (size_t j = 0; j < parts; j++) {
|
||||
if (partpos[j] == partsize[j]) continue; // bucket already empty
|
||||
if (*reinterpret_cast<uint64_t*>(
|
||||
&partbufs[j][partpos[j] % partsBufSize]) <= smallest) {
|
||||
smallestP = j;
|
||||
smallest = *reinterpret_cast<uint64_t*>(
|
||||
&partbufs[j][partpos[j] % partsBufSize]);
|
||||
}
|
||||
}
|
||||
|
||||
assert(smallestP > -1);
|
||||
|
||||
memcpy(buf + (i % SORT_BUFFER_S), &smallest, 8);
|
||||
|
||||
if ((i + 8) % BUFFER_S == 0) _blockEnds.push_back(smallest);
|
||||
|
||||
if ((i % SORT_BUFFER_S) == SORT_BUFFER_S - 8 || i == _fsize - 8) {
|
||||
// write to output file
|
||||
cwrite(newFile, buf, i % SORT_BUFFER_S + 8);
|
||||
}
|
||||
|
||||
partpos[smallestP] += 8;
|
||||
|
||||
if (partpos[smallestP] % partsBufSize == 0) {
|
||||
lseek(_file, SORT_BUFFER_S * smallestP + partpos[smallestP], SEEK_SET);
|
||||
cread(_file, partbufs[smallestP], partsBufSize);
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup
|
||||
delete[] buf;
|
||||
for (size_t j = 0; j < parts; j++) delete[] partbufs[j];
|
||||
delete[] partbufs;
|
||||
delete[] partpos;
|
||||
delete[] partsize;
|
||||
|
||||
_file = newFile;
|
||||
_sorted = true;
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
size_t OsmIdSet::cwrite(int f, const void* buf, size_t n) const {
|
||||
ssize_t w = write(f, buf, n);
|
||||
if (w < 0) {
|
||||
throw std::runtime_error("OSMIDSET: could not write to tmp file.\n");
|
||||
}
|
||||
|
||||
return w;
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
size_t OsmIdSet::cread(int f, void* buf, size_t n) const {
|
||||
ssize_t w = read(f, buf, n);
|
||||
if (w < 0) {
|
||||
throw std::runtime_error("OSMIDSET: could not read from tmp file.\n");
|
||||
}
|
||||
|
||||
return w;
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
uint32_t OsmIdSet::knuth(uint32_t in) const {
|
||||
const uint32_t prime = 2654435769;
|
||||
return (in * prime) >> 2;
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
uint32_t OsmIdSet::jenkins(uint32_t in) const {
|
||||
in = (in + 0x7ed55d16) + (in << 12);
|
||||
in = (in ^ 0xc761c23c) ^ (in >> 19);
|
||||
in = (in + 0x165667b1) + (in << 5);
|
||||
in = (in + 0xd3a2646c) ^ (in << 9);
|
||||
in = (in + 0xfd7046c5) + (in << 3);
|
||||
in = (in ^ 0xb55a4f09) ^ (in >> 16);
|
||||
return in >> 2;
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
uint32_t OsmIdSet::hash(uint32_t in, int i) const {
|
||||
return (knuth(in) + jenkins(in) * i) % BLOOMF_BITS;
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
int OsmIdSet::openTmpFile() const {
|
||||
const std::string& fname = getFName();
|
||||
int file = open(fname.c_str(), O_RDWR | O_CREAT, 0666);
|
||||
|
||||
// immediately unlink
|
||||
unlink(fname.c_str());
|
||||
|
||||
if (file < 0) {
|
||||
std::cerr << "Could not open temporary file " << fname << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
posix_fadvise(file, 0, 0, POSIX_FADV_SEQUENTIAL);
|
||||
return file;
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
std::string OsmIdSet::getFName() const {
|
||||
std::string f = ".pfaedle-tmp";
|
||||
|
||||
while (access(f.c_str(), F_OK) != -1) {
|
||||
std::stringstream ss;
|
||||
ss << ".pfaedle-tmp-";
|
||||
ss << std::rand();
|
||||
f = ss.str().c_str();
|
||||
}
|
||||
|
||||
return f;
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue