Skip to content

Commit c1b12df

Browse files
authored
Merge pull request #5 from peopledoc/setup_project_tuto
Setup project tuto [WIP]
2 parents af59f51 + 732fbd9 commit c1b12df

File tree

110 files changed

+2874
-192
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

110 files changed

+2874
-192
lines changed

.gitignore

+3-1
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,6 @@
22
**/.ipynb_checkpoints/*
33
__pycache__/
44
data/*
5-
5+
*.idea
6+
*.pytest_cache
7+
venv
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"epoch": 20,
3+
"learning_rate": 0.7
4+
}

resources/setup_project/data/input/trip_advisor.json

+1
Large diffs are not rendered by default.
+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
FROM python:3.6

# Editors and tooling used during the tutorial; clean the apt cache
# afterwards in the same layer to keep the image small.
RUN apt-get update && apt-get install -y tree \
    nano \
    vim \
    virtualenv \
    python3-dev \
    && rm -rf /var/lib/apt/lists/*

# A git identity is required for the commit exercises run inside the container.
RUN git config --global user.name tuto_user
RUN git config --global user.email tuto_user@example.com

WORKDIR /tuto

resources/setup_project/docker/run.sh

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
# Build the tutorial image and open an interactive shell inside it,
# with the repository root mounted at /tuto.
set -euo pipefail

IMG_NAME="setup_project_tuto"

# Build from the Dockerfile that sits next to this script.
# Quoting protects against paths containing spaces.
docker build -t "$IMG_NAME" "$(dirname "$0")"

docker run -v "$(git rev-parse --show-toplevel):/tuto" -it "$IMG_NAME" bash
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
venv
2+
*.egg-info
3+
__pycache__/
4+
**.ipynb_checkpoints
5+
*.pytest_cache
6+
*.idea
+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Developer entry points for the tutorial project.
# Targets are documented with '#:' markers that the 'help' target scrapes.

.PHONY: help setup

#: help - Display callable targets.
help:
	@echo "Reference card for usual actions in development environment."
	@echo "Here are available targets:"
	@egrep -o "^#: (.+)" [Mm]akefile | sed 's/#: /* /'


#: setup - Install dependencies.
setup:
	pip install cython
	pip install -e . -r ./requirements.txt

resources/setup_project/project/classifier/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import json
import logging
from typing import List, Tuple
4+
5+
6+
def get_json(json_file_path: str) -> dict:
    """
    Load and return the JSON content stored at ``json_file_path``.

    On failure — unreadable file or malformed JSON — the error is
    logged with its traceback and ``None`` is returned instead of
    letting the exception propagate.
    """
    try:
        with open(json_file_path, 'r') as fd:
            content = json.load(fd)
    except json.JSONDecodeError:
        logging.exception(f'Invalid JSON format for pipeline input: {json_file_path}')
    except IOError:
        logging.exception(f'Can not open pipeline input: {json_file_path}')
    else:
        return content
17+
18+
19+
def extract_data_from_inputs(json_input_file: str) -> List[Tuple[int, str]]:
    """
    Read the pipeline input file and extract its data.

    :param json_input_file: path to a JSON file holding a list of reviews,
        each one a mapping with 'ratingOverall' and 'segments' entries
    :return: one (rating, segments) tuple per review
    """
    json_content = get_json(json_input_file)

    # get_json logs and returns None when the file is missing or malformed;
    # treat that as an empty input instead of crashing with a TypeError.
    if not json_content:
        return []

    return [(review['ratingOverall'], review['segments']) for review in json_content]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import json
2+
from os import makedirs
3+
from os.path import dirname
4+
from typing import List
5+
6+
7+
def write_json(json_file: str, data: dict):
    """
    Write ``data`` as JSON to ``json_file``.

    Parent directories are created first when they do not exist yet.

    :param json_file: destination path (may be a bare file name)
    :param data: JSON-serialisable content to dump
    """
    parent_dir = dirname(json_file)
    # dirname() is '' for a bare file name; makedirs('') would raise.
    if parent_dir:
        makedirs(parent_dir, exist_ok=True)
    with open(json_file, 'w') as fd:
        json.dump(data, fd)
15+
16+
17+
def write_lines_file(file_path: str, data_list: List[str]):
    """
    Write ``data_list`` to ``file_path``, one item per line.

    Parent directories are created first when they do not exist yet; a
    trailing newline is appended to every item that lacks one.

    :param file_path: destination path (may be a bare file name)
    :param data_list: lines to write
    """
    parent_dir = dirname(file_path)
    # dirname() is '' for a bare file name; makedirs('') would raise.
    if parent_dir:
        makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w') as fd:
        fd.writelines([line if line.endswith('\n') else line + '\n' for line in data_list])
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from typing import Tuple, List
2+
3+
from nltk import wordpunct_tokenize
4+
5+
6+
def tokenize_and_clean_text(text: str) -> str:
    """
    Tokenize ``text``, keep only alphabetic tokens lower-cased, and
    join them back into a single space-separated string.
    """
    # NOTE(review): the original also tested ``and token.lower()``, which is
    # always truthy for a non-empty alphabetic token — dropped as redundant.
    return ' '.join(token.lower() for token in wordpunct_tokenize(text)
                    if token.isalpha())
9+
10+
11+
def clean_formatting(text: List[str]) -> str:
    """Join the text fragments and return their cleaned token string."""
    joined = ' '.join(text)
    return tokenize_and_clean_text(joined)
13+
14+
15+
def preprocess_data(extracted_data: List[Tuple[str, str]]) -> List[str]:
    """
    Transform data to get compliant with fasttext expected
    format: __label__[label] [text]
    """
    return [
        f'__label__{label} {clean_formatting(segments)}'
        for label, segments in extracted_data
    ]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import random
2+
from typing import List, Tuple
3+
4+
5+
def split_dataset(fasttext_data_set: List[str], test_percent: float) -> Tuple[List[str], List[str]]:
    """
    Shuffle and split the input data set into a train and a test set
    according to the test_percent.

    The caller's list is left untouched: shuffling happens on a copy
    (the original shuffled the argument in place as a side effect).

    :param fasttext_data_set: data set on fast text format
    :param test_percent: percent of test data (ex: 0.10)
    :return: test fasttext data set, train fasttext data set
    """
    # Work on a copy so the caller's list is not reordered as a side effect.
    shuffled = list(fasttext_data_set)
    random.shuffle(shuffled)
    split_idx = round(test_percent * len(shuffled))
    return shuffled[:split_idx], shuffled[split_idx:]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"model_path = '../data/model/classifier.bin'\n",
10+
"dataset_path = '../data/intermediate/test_dataset.txt'"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": null,
16+
"metadata": {},
17+
"outputs": [],
18+
"source": [
19+
"with open(dataset_path, 'r') as fd:\n",
20+
" test_data_lines = fd.readlines()\n",
21+
"test_data_lines"
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": null,
27+
"metadata": {},
28+
"outputs": [],
29+
"source": [
30+
"import fasttext as ft\n",
31+
"\n",
32+
"model = ft.load_model(model_path)\n",
33+
"result = model.test(dataset_path)"
34+
]
35+
},
36+
{
37+
"cell_type": "code",
38+
"execution_count": null,
39+
"metadata": {},
40+
"outputs": [],
41+
"source": [
42+
"metrics = [\n",
43+
" f'Precision@1: {result.precision}',\n",
44+
" f'Recall@1: {result.recall}',\n",
45+
" f'Nb review: {result.nexamples}'\n",
46+
"]"
47+
]
48+
},
49+
{
50+
"cell_type": "code",
51+
"execution_count": null,
52+
"metadata": {},
53+
"outputs": [],
54+
"source": [
55+
"for line in metrics:\n",
56+
" print(line)"
57+
]
58+
},
59+
{
60+
"cell_type": "code",
61+
"execution_count": null,
62+
"metadata": {},
63+
"outputs": [],
64+
"source": [
65+
"from classifier.helper import write_lines_file\n",
66+
"\n",
67+
"write_lines_file('../data/result/metrics.txt', metrics)"
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"execution_count": null,
73+
"metadata": {},
74+
"outputs": [],
75+
"source": []
76+
}
77+
],
78+
"metadata": {
79+
"kernelspec": {
80+
"display_name": "Python 3",
81+
"language": "python",
82+
"name": "python3"
83+
},
84+
"language_info": {
85+
"codemirror_mode": {
86+
"name": "ipython",
87+
"version": 3
88+
},
89+
"file_extension": ".py",
90+
"mimetype": "text/x-python",
91+
"name": "python",
92+
"nbconvert_exporter": "python",
93+
"pygments_lexer": "ipython3",
94+
"version": "3.6.5"
95+
}
96+
},
97+
"nbformat": 4,
98+
"nbformat_minor": 2
99+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"reviews_path = '../data/input/trip_advisor.json'"
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": null,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"import json\n",
19+
"with open(reviews_path) as fd:\n",
20+
" data = json.load(fd)\n",
21+
"data"
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": null,
27+
"metadata": {},
28+
"outputs": [],
29+
"source": [
30+
"len(data)"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": null,
36+
"metadata": {},
37+
"outputs": [],
38+
"source": [
39+
"from classifier.extract import extract_data_from_inputs"
40+
]
41+
},
42+
{
43+
"cell_type": "code",
44+
"execution_count": null,
45+
"metadata": {},
46+
"outputs": [],
47+
"source": [
48+
"extracted_data = extract_data_from_inputs(reviews_path)"
49+
]
50+
},
51+
{
52+
"cell_type": "code",
53+
"execution_count": null,
54+
"metadata": {},
55+
"outputs": [],
56+
"source": [
57+
"extracted_data"
58+
]
59+
},
60+
{
61+
"cell_type": "code",
62+
"execution_count": null,
63+
"metadata": {},
64+
"outputs": [],
65+
"source": [
66+
"len(extracted_data)"
67+
]
68+
},
69+
{
70+
"cell_type": "code",
71+
"execution_count": null,
72+
"metadata": {},
73+
"outputs": [],
74+
"source": [
75+
"from classifier.helper import write_json\n",
76+
"write_json('../data/intermediate/extracted_data.json', extracted_data)"
77+
]
78+
},
79+
{
80+
"cell_type": "code",
81+
"execution_count": null,
82+
"metadata": {},
83+
"outputs": [],
84+
"source": []
85+
}
86+
],
87+
"metadata": {
88+
"kernelspec": {
89+
"display_name": "Python 3",
90+
"language": "python",
91+
"name": "python3"
92+
},
93+
"language_info": {
94+
"codemirror_mode": {
95+
"name": "ipython",
96+
"version": 3
97+
},
98+
"file_extension": ".py",
99+
"mimetype": "text/x-python",
100+
"name": "python",
101+
"nbconvert_exporter": "python",
102+
"pygments_lexer": "ipython3",
103+
"version": "3.6.5"
104+
}
105+
},
106+
"nbformat": 4,
107+
"nbformat_minor": 2
108+
}

0 commit comments

Comments
 (0)