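"""Load raw log files into a pandas DataFrame, splitting each line into
header fields and message content; multi-line records are supported."""
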
import re
import time

import pandas as pd


class LogLoader(object):

    def __init__(self, headLength, isMulti, headRegex, maxLength):
        self.headLength = headLength    # number of header fields per line
        self.isMulti = isMulti          # whether records may span multiple lines
        # headRegex identifies the first token of a new record; it is required
        # when isMulti is True, otherwise continuation lines cannot be detected.
        self.headRegex = re.compile(headRegex) if headRegex else None
        self.maxLength = maxLength      # records longer than this are dropped
        # Header fields are separated by runs of whitespace or '|'; the
        # capturing group makes re.split() keep the delimiters.
        self.splitregex = re.compile(r'(\s+|\|)')
    def formalize_message(self, lines):
        def get_content(line):
            """Return the message content that follows the first
            self.headLength header fields of a line."""
            count = 0
            in_head = False
            for idx, ch in enumerate(line):
                if not self.splitregex.search(ch):
                    if not in_head:
                        count += 1
                        in_head = True
                else:
                    in_head = False
                if self.headLength + 1 == count:
                    # idx points at the first character after the headers.
                    return line[idx:].strip()
            return line.strip()

        def get_head(line_seg, headers, delimer):
            """Collect the first self.headLength header fields and their
            trailing delimiters. line_seg alternates between tokens (even
            indices) and delimiters (odd indices)."""
            head_count = 0
            for idx, se in enumerate(line_seg):
                if head_count >= self.headLength:
                    break
                if idx % 2 == 0:
                    headers[head_count].append(se)
                else:
                    delimer[head_count].append(se)
                    head_count += 1

        def get_segment(line):
            """Currently unused: collect the run of delimiters preceding
            each token on the line."""
            temp_seg = []
            spliter = ""
            for piece in self.splitregex.split(line):
                if piece == "":
                    continue
                if self.splitregex.search(piece):
                    spliter += piece
                else:
                    temp_seg.append(spliter)
                    spliter = ""
            return temp_seg

        log_messages = []
        fail_count = 0
        headers = dict()
        header_delimer = dict()
        for i in range(self.headLength):
            headers[i] = []
            header_delimer[i] = []
        if self.isMulti:
            start = True
            now_res = ""
            for line in lines:
                if not line.strip():
                    fail_count += 1
                    continue
                line_seg = self.splitregex.split(line.strip())
                match = self.headRegex.search(line_seg[0])
                content_line = get_content(line)
                if match:  # This line starts a new record
                    get_head(line_seg, headers, header_delimer)
                    if start:
                        start = False
                    else:
                        if len(now_res) > self.maxLength:
                            fail_count += 1
                        else:
                            log_messages.append(now_res)
                    now_res = content_line
                else:  # Continuation of the previous record
                    if start:
                        # Continuation line before any record has started.
                        fail_count += 1
                        continue
                    now_res += "\n" + line.strip()
            # Flush the final record, which the loop above never appends.
            if not start:
                if len(now_res) > self.maxLength:
                    fail_count += 1
                else:
                    log_messages.append(now_res)
        else:
            for line in lines:
                if not line.strip():
                    fail_count += 1
                    continue
                line_seg = self.splitregex.split(line.strip())
                get_head(line_seg, headers, header_delimer)
                content_line = get_content(line)
                if len(content_line) > self.maxLength:
                    fail_count += 1
                    continue
                log_messages.append(content_line)
        return log_messages, fail_count, headers, header_delimer

    def load_to_dataframe(self, log_filepath):
        """Transform a log file into a pandas DataFrame."""
        print('Loading log messages to dataframe...')
        t1 = time.time()
        with open(log_filepath, 'r', encoding="utf-8", errors="ignore") as fid:
            lines = fid.readlines()
        print("Total lines: {}".format(len(lines)))
        log_messages, failed_size, headers, head_delimer = self.formalize_message(lines)
        log_dataframe = pd.DataFrame(log_messages, columns=['Content'])
        print("Loaded {} log messages, failed on {} lines".format(len(log_messages), failed_size))
        t2 = time.time()
        print('Time taken: {:.2f}s'.format(t2 - t1))
        return log_dataframe, headers, head_delimer
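
# --- Usage sketch (illustrative, not part of the original file) ---
# A minimal example of driving LogLoader. The file name 'sample.log' and the
# constructor arguments are assumptions: headLength=3 supposes three header
# fields per line (e.g. date, time, level), and headRegex supposes each new
# record starts with an ISO-style date token.
if __name__ == '__main__':
    loader = LogLoader(headLength=3,
                       isMulti=True,
                       headRegex=r'^\d{4}-\d{2}-\d{2}$',
                       maxLength=10000)
    df, headers, delimiters = loader.load_to_dataframe('sample.log')
    print(df.head())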