Skip to content

Commit c1b12df

Browse files
authored
Merge pull request #5 from peopledoc/setup_project_tuto
Setup project tuto [WIP]
2 parents af59f51 + 732fbd9 commit c1b12df

File tree

110 files changed

+2874
-192
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

110 files changed

+2874
-192
lines changed

.gitignore

+3-1
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,6 @@
22
**/.ipynb_checkpoints/*
33
__pycache__/
44
data/*
5-
5+
*.idea
6+
*.pytest_cache
7+
venv
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"epoch": 20,
3+
"learning_rate": 0.7
4+
}

resources/setup_project/data/input/trip_advisor.json

+1
Large diffs are not rendered by default.
+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
FROM python:3.6

# Editors and tooling used during the tutorial; clean the apt cache
# afterwards in the same layer to keep the image small.
RUN apt-get update && apt-get install -y tree \
    nano \
    vim \
    virtualenv \
    python3-dev \
    && rm -rf /var/lib/apt/lists/*

# A git identity is required for the commit exercises run inside the container.
RUN git config --global user.name tuto_user
RUN git config --global user.email tuto_user@example.com

WORKDIR /tuto

resources/setup_project/docker/run.sh

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
# Build the tutorial image and open an interactive shell inside it,
# with the repository root mounted at /tuto.
set -euo pipefail

IMG_NAME="setup_project_tuto"

# Build from the Dockerfile that sits next to this script.
# Quoting protects against paths containing spaces.
docker build -t "$IMG_NAME" "$(dirname "$0")"

docker run -v "$(git rev-parse --show-toplevel):/tuto" -it "$IMG_NAME" bash
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
venv
2+
*.egg-info
3+
__pycache__/
4+
**.ipynb_checkpoints
5+
*.pytest_cache
6+
*.idea
+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Developer entry points for the tutorial project.
# Targets are documented with '#:' markers that the 'help' target scrapes.

.PHONY: help setup

#: help - Display callable targets.
help:
	@echo "Reference card for usual actions in development environment."
	@echo "Here are available targets:"
	@egrep -o "^#: (.+)" [Mm]akefile | sed 's/#: /* /'


#: setup - Install dependencies.
setup:
	pip install cython
	pip install -e . -r ./requirements.txt

resources/setup_project/project/classifier/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import json
import logging
from typing import List, Tuple
4+
5+
6+
def get_json(json_file_path: str) -> dict:
    """
    Load and return the JSON content stored at ``json_file_path``.

    On failure — unreadable file or malformed JSON — the error is
    logged with its traceback and ``None`` is returned instead of
    letting the exception propagate.
    """
    try:
        with open(json_file_path, 'r') as fd:
            content = json.load(fd)
    except json.JSONDecodeError:
        logging.exception(f'Invalid JSON format for pipeline input: {json_file_path}')
    except IOError:
        logging.exception(f'Can not open pipeline input: {json_file_path}')
    else:
        return content
17+
18+
19+
def extract_data_from_inputs(json_input_file: str) -> List[Tuple[int, str]]:
    """
    Read the pipeline input file and extract its data.

    :param json_input_file: path to a JSON file holding a list of reviews,
        each one a mapping with 'ratingOverall' and 'segments' entries
    :return: one (rating, segments) tuple per review
    """
    json_content = get_json(json_input_file)

    # get_json logs and returns None when the file is missing or malformed;
    # treat that as an empty input instead of crashing with a TypeError.
    if not json_content:
        return []

    return [(review['ratingOverall'], review['segments']) for review in json_content]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import json
2+
from os import makedirs
3+
from os.path import dirname
4+
from typing import List
5+
6+
7+
def write_json(json_file: str, data: dict):
    """
    Write ``data`` as JSON to ``json_file``.

    Parent directories are created first when they do not exist yet.

    :param json_file: destination path (may be a bare file name)
    :param data: JSON-serialisable content to dump
    """
    parent_dir = dirname(json_file)
    # dirname() is '' for a bare file name; makedirs('') would raise.
    if parent_dir:
        makedirs(parent_dir, exist_ok=True)
    with open(json_file, 'w') as fd:
        json.dump(data, fd)
15+
16+
17+
def write_lines_file(file_path: str, data_list: List[str]):
    """
    Write ``data_list`` to ``file_path``, one item per line.

    Parent directories are created first when they do not exist yet; a
    trailing newline is appended to every item that lacks one.

    :param file_path: destination path (may be a bare file name)
    :param data_list: lines to write
    """
    parent_dir = dirname(file_path)
    # dirname() is '' for a bare file name; makedirs('') would raise.
    if parent_dir:
        makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w') as fd:
        fd.writelines([line if line.endswith('\n') else line + '\n' for line in data_list])
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from typing import Tuple, List
2+
3+
from nltk import wordpunct_tokenize
4+
5+
6+
def tokenize_and_clean_text(text: str) -> str:
    """
    Tokenize ``text``, keep only alphabetic tokens lower-cased, and
    join them back into a single space-separated string.
    """
    # NOTE(review): the original also tested ``and token.lower()``, which is
    # always truthy for a non-empty alphabetic token — dropped as redundant.
    return ' '.join(token.lower() for token in wordpunct_tokenize(text)
                    if token.isalpha())
9+
10+
11+
def clean_formatting(text: List[str]) -> str:
    """Join the text fragments and return their cleaned token string."""
    joined = ' '.join(text)
    return tokenize_and_clean_text(joined)
13+
14+
15+
def preprocess_data(extracted_data: List[Tuple[str, str]]) -> List[str]:
    """
    Transform data to get compliant with fasttext expected
    format: __label__[label] [text]
    """
    return [
        f'__label__{label} {clean_formatting(segments)}'
        for label, segments in extracted_data
    ]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import random
2+
from typing import List, Tuple
3+
4+
5+
def split_dataset(fasttext_data_set: List[str], test_percent: float) -> Tuple[List[str], List[str]]:
    """
    Shuffle and split the input data set into a train and a test set
    according to the test_percent.

    The caller's list is left untouched: shuffling happens on a copy
    (the original shuffled the argument in place as a side effect).

    :param fasttext_data_set: data set on fast text format
    :param test_percent: percent of test data (ex: 0.10)
    :return: test fasttext data set, train fasttext data set
    """
    # Work on a copy so the caller's list is not reordered as a side effect.
    shuffled = list(fasttext_data_set)
    random.shuffle(shuffled)
    split_idx = round(test_percent * len(shuffled))
    return shuffled[:split_idx], shuffled[split_idx:]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"model_path = '../data/model/classifier.bin'\n",
10+
"dataset_path = '../data/intermediate/test_dataset.txt'"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": null,
16+
"metadata": {},
17+
"outputs": [],
18+
"source": [
19+
"with open(dataset_path, 'r') as fd:\n",
20+
" test_data_lines = fd.readlines()\n",
21+
"test_data_lines"
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": null,
27+
"metadata": {},
28+
"outputs": [],
29+
"source": [
30+
"import fasttext as ft\n",
31+
"\n",
32+
"model = ft.load_model(model_path)\n",
33+
"result = model.test(dataset_path)"
34+
]
35+
},
36+
{
37+
"cell_type": "code",
38+
"execution_count": null,
39+
"metadata": {},
40+
"outputs": [],
41+
"source": [
42+
"metrics = [\n",
43+
" f'Precision@1: {result.precision}',\n",
44+
" f'Recall@1: {result.recall}',\n",
45+
" f'Nb review: {result.nexamples}'\n",
46+
"]"
47+
]
48+
},
49+
{
50+
"cell_type": "code",
51+
"execution_count": null,
52+
"metadata": {},
53+
"outputs": [],
54+
"source": [
55+
"for line in metrics:\n",
56+
" print(line)"
57+
]
58+
},
59+
{
60+
"cell_type": "code",
61+
"execution_count": null,
62+
"metadata": {},
63+
"outputs": [],
64+
"source": [
65+
"from classifier.helper import write_lines_file\n",
66+
"\n",
67+
"write_lines_file('../data/result/metrics.txt', metrics)"
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"execution_count": null,
73+
"metadata": {},
74+
"outputs": [],
75+
"source": []
76+
}
77+
],
78+
"metadata": {
79+
"kernelspec": {
80+
"display_name": "Python 3",
81+
"language": "python",
82+
"name": "python3"
83+
},
84+
"language_info": {
85+
"codemirror_mode": {
86+
"name": "ipython",
87+
"version": 3
88+
},
89+
"file_extension": ".py",
90+
"mimetype": "text/x-python",
91+
"name": "python",
92+
"nbconvert_exporter": "python",
93+
"pygments_lexer": "ipython3",
94+
"version": "3.6.5"
95+
}
96+
},
97+
"nbformat": 4,
98+
"nbformat_minor": 2
99+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"reviews_path = '../data/input/trip_advisor.json'"
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": null,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"import json\n",
19+
"with open(reviews_path) as fd:\n",
20+
" data = json.load(fd)\n",
21+
"data"
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": null,
27+
"metadata": {},
28+
"outputs": [],
29+
"source": [
30+
"len(data)"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": null,
36+
"metadata": {},
37+
"outputs": [],
38+
"source": [
39+
"from classifier.extract import extract_data_from_inputs"
40+
]
41+
},
42+
{
43+
"cell_type": "code",
44+
"execution_count": null,
45+
"metadata": {},
46+
"outputs": [],
47+
"source": [
48+
"extracted_data = extract_data_from_inputs(reviews_path)"
49+
]
50+
},
51+
{
52+
"cell_type": "code",
53+
"execution_count": null,
54+
"metadata": {},
55+
"outputs": [],
56+
"source": [
57+
"extracted_data"
58+
]
59+
},
60+
{
61+
"cell_type": "code",
62+
"execution_count": null,
63+
"metadata": {},
64+
"outputs": [],
65+
"source": [
66+
"len(extracted_data)"
67+
]
68+
},
69+
{
70+
"cell_type": "code",
71+
"execution_count": null,
72+
"metadata": {},
73+
"outputs": [],
74+
"source": [
75+
"from classifier.helper import write_json\n",
76+
"write_json('../data/intermediate/extracted_data.json', extracted_data)"
77+
]
78+
},
79+
{
80+
"cell_type": "code",
81+
"execution_count": null,
82+
"metadata": {},
83+
"outputs": [],
84+
"source": []
85+
}
86+
],
87+
"metadata": {
88+
"kernelspec": {
89+
"display_name": "Python 3",
90+
"language": "python",
91+
"name": "python3"
92+
},
93+
"language_info": {
94+
"codemirror_mode": {
95+
"name": "ipython",
96+
"version": 3
97+
},
98+
"file_extension": ".py",
99+
"mimetype": "text/x-python",
100+
"name": "python",
101+
"nbconvert_exporter": "python",
102+
"pygments_lexer": "ipython3",
103+
"version": "3.6.5"
104+
}
105+
},
106+
"nbformat": 4,
107+
"nbformat_minor": 2
108+
}

0 commit comments

Comments
 (0)