clean_outliers_step1.py
"""Refine the hydrated step-1 commit CSVs by removing outlier commits."""
from pathlib import Path

import pandas as pd

from analyzer.utils import (
    get_change_id_from_commit_msg,
    parse_commit_hash_by_project,
)
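
# Assumed behavior, inferred from usage below: parse_commit_hash_by_project
# normalizes a commit hash for the given project, and
# get_change_id_from_commit_msg extracts the Gerrit Change-Id from a message.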
IO_DIR = "io/artifacts"

# Known outlier commits to exclude, keyed by project.
commits_to_ignore = {
    "commons-lang": [
        "4f3b6e55f86c8b59ea9b3991ca055c3905eb05a1",
    ],
    "joda-time": ["72b22654962284bac59c2777131b42a2d1f53228"],
}
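
# For each project: load the hydrated step-1 CSV, drop outlier commits,
# and write a *_refined.csv alongside the input.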
for project in ["commons-lang", "joda-time", "cts"]:
    full_input_file_path = Path(f"{IO_DIR}/{project}/hydrated_{project}-step1.csv")
    full_output_file_path = Path(
        f"{IO_DIR}/{project}/hydrated_{project}-step1_refined.csv"
    )
    if not full_input_file_path.exists():
        print(f"File does not exist: {full_input_file_path}")
        continue

    df = pd.read_csv(full_input_file_path)
    print("Start Size:", df.shape)
    # Keep only the first 11 columns; any trailing columns are discarded.
    df = df.iloc[:, 0:11]
    if project != "cts":
        # Normalize each ignore-list hash with the project-specific parser
        # before matching against the dataframe.
        to_ignore_commits = [
            parse_commit_hash_by_project(project, each)
            for each in commits_to_ignore[project]
        ]
        print(to_ignore_commits)
        df = df[~df["Hash"].isin(to_ignore_commits)]
    else:
        # Step 1 (cts only): drop commits that reuse an already-seen Gerrit
        # Change-Id, keeping the first commit recorded for each Change-Id.
        print("Step: 1")
        unique_changeid = []
        unique_changeid_hash = []
        to_be_dropped_commit_hash = []
        for _, row in df.iterrows():
            if "Change-Id: " not in row["Commit Msg"]:
                continue
            change_id = get_change_id_from_commit_msg(row["Commit Msg"])
            if change_id not in unique_changeid:
                unique_changeid.append(change_id)
                if row["Hash"] not in unique_changeid_hash:
                    unique_changeid_hash.append(row["Hash"])
            elif (
                row["Hash"] not in unique_changeid_hash
                and row["Hash"] not in to_be_dropped_commit_hash
            ):
                # Duplicate Change-Id on a hash we have not kept yet: drop it.
                to_be_dropped_commit_hash.append(row["Hash"])
        print("Commits to be dropped: ", len(to_be_dropped_commit_hash))
        df = df[~df["Hash"].isin(to_be_dropped_commit_hash)]
        print("Step 1 Size:", df.shape)
        # Step 2 (cts only): drop commits back-merged from another branch,
        # flagged by "merged from:" in the commit message.
        print("Step: 2")
        to_be_dropped_commit_hash = (
            df.loc[
                df["Commit Msg"].str.contains("merged from:", regex=False, na=False),
                "Hash",
            ]
            .unique()
            .tolist()
        )
        print("Commits to be dropped: ", len(to_be_dropped_commit_hash))
        df = df[~df["Hash"].isin(to_be_dropped_commit_hash)]
        print("Step 2 Size:", df.shape)
    # Finally, export the refined dataframe.
    print("Final Size:", df.shape)
    df.to_csv(full_output_file_path, index=False)
    print(f"Generated {full_output_file_path}")