diff --git a/data/datasets/bart_searchgpt_wiki_nlp_augment/3_10k_bart_trial.py b/data/datasets/bart_searchgpt_wiki_nlp_augment/3_10k_bart_trial.py
index 12d539a2b7..d8afe37968 100644
--- a/data/datasets/bart_searchgpt_wiki_nlp_augment/3_10k_bart_trial.py
+++ b/data/datasets/bart_searchgpt_wiki_nlp_augment/3_10k_bart_trial.py
@@ -12,6 +12,7 @@ def num_tokens_from_string(string: str) -> int:
 
 
 if __name__ == "__main__":
+    # Use the `dtype` parameter of `pd.read_csv`.
     sampled_df = pd.read_csv("wiki_qa_bart_10000row_input.csv")
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(device)
diff --git a/data/datasets/bart_searchgpt_wiki_nlp_augment/4_convert_to_oa_format.py b/data/datasets/bart_searchgpt_wiki_nlp_augment/4_convert_to_oa_format.py
index b59bde8091..a517bef579 100644
--- a/data/datasets/bart_searchgpt_wiki_nlp_augment/4_convert_to_oa_format.py
+++ b/data/datasets/bart_searchgpt_wiki_nlp_augment/4_convert_to_oa_format.py
@@ -3,6 +3,7 @@ import pandas as pd
 
 
 if __name__ == "__main__":
+    # Use the `dtype` parameter of `pd.read_csv`.
     raw_df = pd.read_csv(r"...\wiki_qa_bart_10000row.csv")
     # print(raw_df.iloc[0])
     # print(raw_df.columns)
diff --git a/data/datasets/mt_note_generation/prepare.py b/data/datasets/mt_note_generation/prepare.py
index ff5fcdc5a1..4021c587f9 100644
--- a/data/datasets/mt_note_generation/prepare.py
+++ b/data/datasets/mt_note_generation/prepare.py
@@ -63,6 +63,7 @@ def main(output_dir: str = "data"):
     """Download and prepare the dataset for use."""
     os.makedirs(output_dir, exist_ok=True)
     kaggle.api.dataset_download_files("tboyle10/medicaltranscriptions", "data", unzip=True)
+    # Use the `dtype` parameter of `pd.read_csv`.
     mt_samples = preprocess(pd.read_csv("data/mtsamples.csv"))
     conversations = get_conversations(mt_samples)
     random.shuffle(conversations)
diff --git a/data/datasets/poetry_instruction/prepare.py b/data/datasets/poetry_instruction/prepare.py
index 9a4718e2da..5667e4ba88 100644
--- a/data/datasets/poetry_instruction/prepare.py
+++ b/data/datasets/poetry_instruction/prepare.py
@@ -14,6 +14,7 @@
 
 # Read the CSV file into a pandas dataframe
 csv_file = os.path.join(download_path, "PoetryFoundationData.csv")
+# Use the `dtype` parameter of `pd.read_csv`.
 df = pd.read_csv(csv_file)
 
 # The data in the CSV file is not formatted correctly, so we need to clean it up.
diff --git a/data/datasets/zhihu-kol/convert_parquet.py b/data/datasets/zhihu-kol/convert_parquet.py
index 5d8c7b6971..a89354a7a9 100644
--- a/data/datasets/zhihu-kol/convert_parquet.py
+++ b/data/datasets/zhihu-kol/convert_parquet.py
@@ -43,6 +43,7 @@ def reformat_csv_to_openassistant(df: pd.DataFrame) -> pd.DataFrame:
 if __name__ == "__main__":
     input_csv = "zhihu.csv"
     # Create a pandas dataframe from your dataset file(s)
+    # Use the `dtype` parameter of `pd.read_csv`.
     df = pd.read_csv(input_csv)  # or any other way
     df = reformat_csv_to_openassistant(df)
     # Save the file in the Parquet format
diff --git a/scripts/data_augment/data_augment.py b/scripts/data_augment/data_augment.py
index 79072006ce..1e9e575fc2 100644
--- a/scripts/data_augment/data_augment.py
+++ b/scripts/data_augment/data_augment.py
@@ -458,6 +458,7 @@ def parse_arguments():
 
 
 def read_data(args):
+    # Use the `dtype` parameter of `pd.read_csv`.
     files = pd.read_csv(args.dataset, sep=",", header=None, names=["file"])
     files = files["file"].tolist()
     data = []
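The comment added at each call site above is the same recommendation. As a minimal sketch of what it might look like in practice, assuming hypothetical column names (the real CSV headers are not shown in this diff):

    import pandas as pd

    # An explicit dtype mapping keeps pandas from inferring types chunk by
    # chunk, which avoids mixed-type object columns and DtypeWarning noise
    # on large files. "question" and "answer" are placeholder column names.
    df = pd.read_csv(
        "wiki_qa_bart_10000row_input.csv",
        dtype={"question": "string", "answer": "string"},
    )

Columns left out of the mapping still fall back to normal type inference, so the mapping only needs to cover the columns whose types matter downstream.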