-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransform_raw_data_into_graph.py
50 lines (41 loc) · 1.24 KB
/
transform_raw_data_into_graph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import pandas as pd
import numpy as np
from multiprocessing import Manager, Process
import collections
import matplotlib.pyplot as plt
# Load the raw transaction table. The path is hard-coded to the original
# author's environment.
file_name = '/users/kezhon0/trans_3000p2_list.txt'
data = pd.read_csv(file_name)
data_len = len(data)

# Each endpoint of a transaction is identified by combining two columns with
# `+`: columns 1 and 2 for the source, columns 3 and 4 for the destination.
# NOTE(review): presumably these columns hold strings, so `+` concatenates
# them into a single node key -- confirm against the input file.
src_nodes = data.iloc[:, 1] + data.iloc[:, 2]
dst_nodes = data.iloc[:, 3] + data.iloc[:, 4]

# Keep only rows that describe a real edge: both endpoints present and
# non-empty, and the source different from the destination (no self-loops).
# A single shared mask is applied to both series so they cannot fall out of
# step with each other; the previous approach filtered each side on its own
# and relied on an assert to notice a mismatch.
valid_trans = (
    (src_nodes != dst_nodes)
    & src_nodes.notna()
    & dst_nodes.notna()
    & src_nodes.ne('')
    & dst_nodes.ne('')
)
src_nodes = src_nodes[valid_trans]
dst_nodes = dst_nodes[valid_trans]
print('total src nodes', len(src_nodes))

# Sorted array of every distinct node key that appears on either side.
unique_nodes = np.unique(np.append(src_nodes.unique(), dst_nodes.unique()))

# Two-column edge table: position 0 is the source, position 1 the destination.
graph = pd.concat([src_nodes, dst_nodes], axis=1)
print(graph)
# Map every distinct node key to a 1-based integer id, numbered in the order
# the keys appear in `unique_nodes`.
idx_dict = {node: node_id for node_id, node in enumerate(unique_nodes, start=1)}
# Drop repeated (source, destination) pairs and write the remaining edges to
# `outfile`, one per line, as "<src_id> <dst_id>" using the ids in `idx_dict`.
outfile = 'raw_graph.txt'
graph_len = len(graph)
print("before dedup", graph_len)
graph = graph.drop_duplicates()
graph_len = len(graph)
print("after dedup", graph_len)

# The context manager guarantees the file is flushed and closed even if an id
# lookup raises part-way through. Iterating the two columns in lockstep avoids
# a pair of scalar `.iloc` lookups for every row.
with open(outfile, "w") as fout:
    fout.writelines(
        f"{idx_dict[src]} {idx_dict[dst]}\n"
        for src, dst in zip(graph.iloc[:, 0], graph.iloc[:, 1])
    )