forked from berthubert/hello-dl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtextsupport.cc
72 lines (60 loc) · 1.78 KB
/
textsupport.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#include "textsupport.hh"
#include <algorithm>
#include <array>
#include <fstream>
#include <iostream>
#include <map>
#include <random>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;
// Reads the whole file `fname` and returns randomly positioned, fixed-length
// excerpts of it.
//
// fname: path of the text file to sample from.
// siz:   length in bytes of every excerpt.
// mult:  oversampling factor; mult * fileSize / siz excerpts (integer division)
//        are drawn independently, so excerpts may overlap or repeat.
//
// Returns the excerpts, each exactly `siz` bytes long, with every newline and
// tab replaced by a space. The result is empty when `siz` is 0, `mult` is not
// positive, or the file holds fewer than `siz` bytes (no full-length excerpt
// exists in those cases).
//
// Throws std::runtime_error if the file cannot be opened.
std::vector<std::string> textChopper(const char* fname, size_t siz, int mult)
{
  std::ifstream ifs(fname);
  if(!ifs)
    throw std::runtime_error(std::string("textChopper: cannot open '") + fname + "'");

  std::string total;
  {
    std::vector<char> buffer(1024000);
    // The final, short read() sets failbit, so gcount() decides whether that
    // last chunk still carries data. Testing eof() alone would spin forever
    // on a stream that failed for any other reason.
    while(ifs.read(buffer.data(), buffer.size()) || ifs.gcount() > 0)
      total.append(buffer.data(), static_cast<size_t>(ifs.gcount()));
  } // the read buffer is released here, before the excerpts are allocated

  // Guard every case in which the arithmetic below would divide by zero or
  // wrap around an unsigned subtraction.
  if(siz == 0 || mult <= 0 || total.size() < siz)
    return {};

  const size_t pieces = static_cast<size_t>(mult) * total.size() / siz;
  std::vector<std::string> ret;
  ret.reserve(pieces);

  std::random_device rd;  // Will be used to obtain a seed for the random number engine
  std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd()
  // Bounds are inclusive: total.size() - siz is the last offset from which a
  // full `siz`-byte excerpt can still be taken.
  std::uniform_int_distribution<size_t> distrib(0, total.size() - siz);

  for(size_t n = 0; n < pieces; ++n) {
    ret.push_back(total.substr(distrib(gen), siz));
    for(auto& c : ret.back())
      if(c == '\n' || c == '\t')
        c = ' ';
  }
  return ret;
}
// Builds the character <-> index tables from the byte frequencies of a file.
//
// Every byte of `fname` whose value is below 127 is counted. The distinct
// bytes are then ranked from most to least frequent; equally frequent bytes
// are ordered by ascending byte value so the mapping is the same on every
// run. Rank n is recorded as d_c2i[byte] = n and d_i2c[n] = byte.
//
// fname: path of the file to scan.
// lim:   when non-negative, only the `lim` most frequent bytes are kept;
//        a negative value means no limit.
//
// Throws std::runtime_error if the file cannot be opened.
// Prints the number of assigned mappings to standard output.
BiMapper::BiMapper(const char* fname, int lim)
{
  std::ifstream ifs(fname);
  if(!ifs)
    throw std::runtime_error(std::string("BiMapper: cannot open '") + fname + "'");

  // One counter per byte value that is eligible for a mapping (0..126).
  std::array<std::size_t, 127> counts{};
  std::array<char, 4096> chunk;
  // The final, short read() sets failbit, so gcount() decides whether that
  // last chunk still carries data.
  while(ifs.read(chunk.data(), chunk.size()) || ifs.gcount() > 0) {
    // Only the bytes actually read are counted; the tail of `chunk` still
    // holds leftovers from an earlier read (or nothing meaningful at all).
    const auto got = static_cast<std::size_t>(ifs.gcount());
    for(std::size_t i = 0; i < got; ++i) {
      const auto c = static_cast<unsigned char>(chunk[i]);
      if(c < counts.size())
        ++counts[c];
    }
  }

  // (byte value, occurrences) for every byte that was seen at least once.
  std::vector<std::pair<int, std::size_t>> revcount;
  for(std::size_t c = 0; c < counts.size(); ++c)
    if(counts[c] != 0)
      revcount.emplace_back(static_cast<int>(c), counts[c]);

  std::sort(revcount.begin(), revcount.end(), [](const auto& a, const auto& b) {
    if(a.second != b.second)
      return b.second < a.second; // most frequent first
    return a.first < b.first;     // deterministic order among ties
  });

  if(lim >= 0 && revcount.size() > static_cast<std::size_t>(lim))
    revcount.resize(static_cast<std::size_t>(lim));

  for(unsigned int n = 0; n < revcount.size(); ++n) {
    d_c2i[revcount[n].first] = n;
    d_i2c[n] = revcount[n].first;
  }
  std::cout<<"Assigned "<<d_c2i.size()<<" mappings"<<std::endl;
}