initial commit

This commit is contained in:
Patrick Brosi 2018-06-09 17:14:08 +02:00
commit efcd3e1892
106 changed files with 27000 additions and 0 deletions

View file

@ -0,0 +1,302 @@
// Copyright 2018, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Authors: Patrick Brosi <brosi@informatik.uni-freiburg.de>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <algorithm>
#include <cassert>
#include <cerrno>
#include <climits>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include "pfaedle/osm/OsmIdSet.h"
using pfaedle::osm::OsmIdSet;
// Class-wide counter, incremented once per call to has().
size_t OsmIdSet::LOOKUPS = 0;
// Class-wide counter, incremented whenever diskHas() has to load a block from
// the temp file because it is not the block currently held in the read buffer.
size_t OsmIdSet::FLOOKUPS = 0;
// _____________________________________________________________________________
// Creates an empty, still open set: allocates the Bloom filter bitset, opens
// the temporary backing file and allocates the read and write buffers.
OsmIdSet::OsmIdSet()
: _closed(false),
_sorted(true),
_last(0),
// NOTE(review): -1 presumably wraps to the largest representable id if osmid
// is unsigned, so that the first add() lowers it - confirm against the header
_smallest(-1),
_biggest(0),
_obufpos(0),
// NOTE(review): -1 presumably means "no block loaded yet" - confirm the
// member's type in the header
_curBlock(-1),
_fsize(0) {
_bitset = new std::bitset<BLOOMF_BITS>();
// openTmpFile() terminates the process if no temp file can be opened
_file = openTmpFile();
// _buffer holds the block read back by diskHas(), _outBuffer collects ids
// written by diskAdd() until they are flushed to the file
_buffer = new unsigned char[BUFFER_S];
_outBuffer = new unsigned char[OBUFFER_S];
}
// _____________________________________________________________________________
// Releases the Bloom filter, the read buffer, the write buffer (unless close()
// has already freed it) and the descriptor of the temporary backing file.
OsmIdSet::~OsmIdSet() {
  delete _bitset;
  delete[] _buffer;
  // close() frees the output buffer itself once the set has been finalized
  if (!_closed) delete[] _outBuffer;
  // The temp file is unlinked right after it is opened, so closing the
  // descriptor is what finally releases it. The leading '::' selects the POSIX
  // call instead of OsmIdSet::close().
  if (_file >= 0) ::close(_file);
}
// _____________________________________________________________________________
// Adds an id to the set. The id is appended to the on-disk list, the
// smallest/biggest bounds are updated and the id is registered in the Bloom
// filter. Throws std::exception if the set has already been closed.
void OsmIdSet::add(osmid id) {
  if (_closed) {
    throw std::exception();
  }

  diskAdd(id);

  // an id smaller than its predecessor means the file has to be sorted later
  if (id < _last) {
    _sorted = false;
  }
  _last = id;

  if (_smallest > id) {
    _smallest = id;
  }
  if (_biggest < id) {
    _biggest = id;
  }

  // set the bit selected by each of the 10 hash functions
  for (int k = 0; k < 10; ++k) {
    (*_bitset)[hash(id, k)] = 1;
  }
}
// _____________________________________________________________________________
// Appends the 8 bytes of an id to the output buffer, records the id as a block
// end when the buffer position reaches a multiple of BUFFER_S, and flushes the
// output buffer to the temp file once it is full.
void OsmIdSet::diskAdd(osmid id) {
  memcpy(&_outBuffer[_obufpos], &id, 8);
  _obufpos += 8;

  const bool blockComplete = (_obufpos % BUFFER_S) == 0;
  if (blockComplete) {
    // this is the last value in this block
    _blockEnds.push_back(id);
  }

  if (_obufpos < OBUFFER_S) return;

  // output buffer is full: write it out and start filling it again
  const ssize_t written = cwrite(_file, _outBuffer, OBUFFER_S);
  _fsize += written;
  _obufpos = 0;
}
// _____________________________________________________________________________
// Returns the index of the first block whose recorded end value is strictly
// greater than the given id (the number of recorded block ends if none is).
size_t OsmIdSet::getBlock(osmid id) const {
  const auto firstGreater =
      std::upper_bound(_blockEnds.begin(), _blockEnds.end(), id);
  return firstGreater - _blockEnds.begin();
}
// _____________________________________________________________________________
// Checks whether the given id is stored in the (sorted) temp file.
//
// The id is first looked up among the recorded block ends. If it is not one of
// them, the block that may contain it is loaded into the read buffer (unless
// it is the block already held there) and searched with a binary search.
// Returns true if the id was found. Throws std::runtime_error (from cread) if
// the temp file cannot be read.
bool OsmIdSet::diskHas(osmid id) const {
  assert(_sorted);

  // The block ends are recorded in ascending order once the file is sorted, so
  // a binary search replaces a linear scan on every lookup.
  if (std::binary_search(_blockEnds.begin(), _blockEnds.end(), id)) {
    return true;
  }

  const size_t block = getBlock(id);

  if (block != _curBlock) {
    // requested block is not the cached one, fetch it from the file
    lseek(_file, block * BUFFER_S, SEEK_SET);
    ssize_t n = cread(_file, _buffer, BUFFER_S);
    _curBlockSize = n;
    FLOOKUPS++;
    _curBlock = block;
  }

  // block holds not even a single complete id
  if (_curBlockSize <= 7) return false;

  // ids are copied out of the byte buffer with memcpy instead of being read
  // through a casted pointer
  uint64_t first;
  memcpy(&first, _buffer, sizeof(first));
  if (first > id) return false;

  // binary search over the complete 8-byte ids held in the buffer
  ssize_t lo = 0;
  ssize_t hi = _curBlockSize / 8 - 1;
  while (lo <= hi) {
    const ssize_t mid = lo + (hi - lo) / 2;
    uint64_t raw;
    memcpy(&raw, _buffer + mid * 8, sizeof(raw));
    const osmid cur = raw;
    if (cur == id) return true;
    if (cur < id)
      lo = mid + 1;
    else
      hi = mid - 1;
  }

  return false;
}
// _____________________________________________________________________________
// Returns true if the id is contained in the set. The first lookup closes the
// set. Ids outside the [smallest, biggest] range or rejected by the Bloom
// filter are answered without touching the file.
bool OsmIdSet::has(osmid id) const {
  ++LOOKUPS;

  if (!_closed) {
    close();
  }

  const bool outOfRange = id < _smallest || id > _biggest;
  if (outOfRange) return false;

  // a single unset bit proves that the id was never added
  for (int k = 0; k < 10; ++k) {
    if (!_bitset->test(hash(id, k))) return false;
  }

  // the filter may give false positives, the file has the final say
  return diskHas(id);
}
// _____________________________________________________________________________
// Finalizes the set: flushes the ids still held in the output buffer to the
// temp file, records the biggest id as the last block end, frees the output
// buffer and sorts the file if the ids were not added in ascending order.
// Calling close() on an already closed set is a no-op.
// Throws std::runtime_error (from cwrite) if the temp file cannot be written.
void OsmIdSet::close() const {
  // a second call would write from, and then free again, a released buffer
  if (_closed) return;

  ssize_t w = cwrite(_file, _outBuffer, _obufpos);
  _fsize += w;
  _blockEnds.push_back(_biggest);
  delete[] _outBuffer;
  _closed = true;

  // if order was not sorted, sort now
  if (!_sorted) sort();
}
// _____________________________________________________________________________
// Sorts the temp file with an external merge sort and rebuilds the block ends.
//
// Phase 1 cuts the file into parts of SORT_BUFFER_S bytes, sorts each part in
// memory and writes it back in place. Phase 2 merges the parts into a new temp
// file, keeping only a window of partsBufSize bytes of every part in memory,
// and records the last id of every BUFFER_S-sized block of the output.
// Afterwards the new file replaces the old one, which is closed.
//
// Throws std::runtime_error if the temp files cannot be read or written, or if
// the parts hold fewer ids than the recorded file size announces.
void OsmIdSet::sort() const {
  _blockEnds.clear();

  const size_t parts = _fsize / SORT_BUFFER_S + 1;
  const size_t partsBufSize = ((SORT_BUFFER_S / 8) / parts + 1) * 8;

  // Owning pointers release the buffers even if an I/O call throws. Plain
  // new[] is kept on purpose: it does not zero-fill the large buffers.
  std::unique_ptr<unsigned char[]> buf(new unsigned char[SORT_BUFFER_S]);
  std::vector<std::unique_ptr<unsigned char[]>> partbufs(parts);
  std::vector<size_t> partpos(parts, 0);
  std::vector<size_t> partsize(parts, 0);

  // sort the 'parts' number of file parts independently
  for (size_t i = 0; i < parts; i++) {
    partbufs[i].reset(new unsigned char[partsBufSize]);

    lseek(_file, SORT_BUFFER_S * i, SEEK_SET);
    // cread reports a failed read instead of silently dropping the part
    const size_t n = cread(_file, buf.get(), SORT_BUFFER_S);

    qsort(buf.get(), n / 8, 8, qsortCmp);

    lseek(_file, SORT_BUFFER_S * i, SEEK_SET);
    cwrite(_file, buf.get(), n);

    // keep the head of the sorted part as its initial merge window
    memcpy(partbufs[i].get(), buf.get(), std::min<size_t>(n, partsBufSize));
    partsize[i] = n;
  }

  // now the individual parts are sorted
  const int newFile = openTmpFile();

  for (size_t i = 0; i < _fsize; i += 8) {
    uint64_t smallest = UINT64_MAX;
    ssize_t smallestP = -1;

    // look for smallest element (not optimal, but running time is not
    // really critical here)
    for (size_t j = 0; j < parts; j++) {
      if (partpos[j] == partsize[j]) continue;  // bucket already empty
      uint64_t head;
      memcpy(&head, &partbufs[j][partpos[j] % partsBufSize], sizeof(head));
      // '<=' so that an id equal to UINT64_MAX is still picked up
      if (head <= smallest) {
        smallestP = j;
        smallest = head;
      }
    }

    // Every part is exhausted although the file size announces more ids.
    // Checked at run time so that release builds do not index out of bounds.
    if (smallestP < 0) {
      throw std::runtime_error("OSMIDSET: tmp file is shorter than expected.\n");
    }
    const size_t part = static_cast<size_t>(smallestP);

    memcpy(buf.get() + (i % SORT_BUFFER_S), &smallest, 8);

    if ((i + 8) % BUFFER_S == 0) _blockEnds.push_back(smallest);

    if ((i % SORT_BUFFER_S) == SORT_BUFFER_S - 8 || i == _fsize - 8) {
      // write to output file
      cwrite(newFile, buf.get(), i % SORT_BUFFER_S + 8);
    }

    partpos[part] += 8;
    if (partpos[part] % partsBufSize == 0) {
      // window of this part is used up, load the next one from the old file
      lseek(_file, SORT_BUFFER_S * part + partpos[part], SEEK_SET);
      cread(_file, partbufs[part].get(), partsBufSize);
    }
  }

  // The unsorted file is no longer needed. Closing it releases its descriptor;
  // '::' selects the POSIX call instead of OsmIdSet::close().
  ::close(_file);
  _file = newFile;
  _sorted = true;
}
// _____________________________________________________________________________
// Writes n bytes from buf to the file descriptor f.
//
// write() may store fewer bytes than requested or be interrupted by a signal;
// the call is repeated until everything has been written, so that callers
// which reuse their buffer afterwards do not lose data.
// Returns the number of bytes written (n, unless the descriptor stops
// accepting data). Throws std::runtime_error if write() fails.
size_t OsmIdSet::cwrite(int f, const void* buf, size_t n) const {
  const unsigned char* bytes = static_cast<const unsigned char*>(buf);
  size_t done = 0;

  while (done < n) {
    const ssize_t w = write(f, bytes + done, n - done);
    if (w < 0) {
      if (errno == EINTR) continue;  // interrupted before writing, try again
      throw std::runtime_error("OSMIDSET: could not write to tmp file.\n");
    }
    if (w == 0) break;  // no progress possible, report the short count
    done += static_cast<size_t>(w);
  }

  return done;
}
// _____________________________________________________________________________
// Reads up to n bytes from the file descriptor f into buf.
//
// read() may return fewer bytes than requested or be interrupted by a signal
// even when more data is available; the call is repeated until n bytes have
// been read or the end of the file is reached.
// Returns the number of bytes read, which is smaller than n only at the end of
// the file. Throws std::runtime_error if read() fails.
size_t OsmIdSet::cread(int f, void* buf, size_t n) const {
  unsigned char* bytes = static_cast<unsigned char*>(buf);
  size_t done = 0;

  while (done < n) {
    const ssize_t r = read(f, bytes + done, n - done);
    if (r < 0) {
      if (errno == EINTR) continue;  // interrupted before reading, try again
      throw std::runtime_error("OSMIDSET: could not read from tmp file.\n");
    }
    if (r == 0) break;  // end of file
    done += static_cast<size_t>(r);
  }

  return done;
}
// _____________________________________________________________________________
// Multiplicative hash: multiplies the input with a fixed 32-bit constant and
// drops the two lowest bits of the product.
uint32_t OsmIdSet::knuth(uint32_t in) const {
  const uint32_t multiplier = 2654435769;
  return (multiplier * in) >> 2;
}
// _____________________________________________________________________________
// Integer mixing hash: runs the input through six add/xor/shift rounds and
// drops the two lowest bits of the result.
uint32_t OsmIdSet::jenkins(uint32_t in) const {
  uint32_t h = in;

  // every right-hand side is evaluated with the value h had before the round
  h += 0x7ed55d16 + (h << 12);
  h ^= 0xc761c23c ^ (h >> 19);
  h += 0x165667b1 + (h << 5);
  h = (h + 0xd3a2646c) ^ (h << 9);
  h += 0xfd7046c5 + (h << 3);
  h ^= 0xb55a4f09 ^ (h >> 16);

  return h >> 2;
}
// _____________________________________________________________________________
// Returns the i-th Bloom filter position for the given value: the two base
// hashes are combined as knuth + i * jenkins and reduced modulo BLOOMF_BITS.
uint32_t OsmIdSet::hash(uint32_t in, int i) const {
  const uint32_t base = knuth(in);
  const uint32_t step = jenkins(in);
  return (base + step * i) % BLOOMF_BITS;
}
// _____________________________________________________________________________
// Creates a fresh temporary file, unlinks it right away and returns its
// descriptor, opened for reading and writing.
//
// The file is created with O_EXCL, so a name that another process claimed
// between the existence check in getFName() and the open() call is never
// shared; a new name is requested instead. The file is only unlinked after it
// was successfully created by this call.
// Prints an error and terminates the process with exit code 1 if no file
// could be opened.
int OsmIdSet::openTmpFile() const {
  // bounds the retries in case a name keeps being reported as free although
  // it cannot be created
  static const int MAX_ATTEMPTS = 100;

  std::string fname;
  int file = -1;

  for (int attempt = 0; attempt < MAX_ATTEMPTS; ++attempt) {
    fname = getFName();
    file = open(fname.c_str(), O_RDWR | O_CREAT | O_EXCL, 0666);
    // only a name collision is worth another try
    if (file >= 0 || errno != EEXIST) break;
  }

  if (file < 0) {
    std::cerr << "Could not open temporary file " << fname << std::endl;
    exit(1);
  }

  // immediately unlink
  unlink(fname.c_str());

  posix_fadvise(file, 0, 0, POSIX_FADV_SEQUENTIAL);
  return file;
}
// _____________________________________________________________________________
// Returns a file name for which access() reports that no such file exists.
// Starts with ".pfaedle-tmp" and, while the candidate exists, tries
// ".pfaedle-tmp-<n>" with n drawn from std::rand().
std::string OsmIdSet::getFName() const {
  std::string candidate = ".pfaedle-tmp";

  for (;;) {
    if (access(candidate.c_str(), F_OK) == -1) {
      return candidate;
    }

    // name is taken, derive a new one with a random suffix
    std::stringstream name;
    name << ".pfaedle-tmp-" << std::rand();
    candidate = name.str().c_str();
  }
}