-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexamples.py
115 lines (91 loc) · 4.63 KB
/
examples.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# %%
from dhs_scraper import DhsArticle, stream_to_jsonl
# %%
# Get an article from its unique triple identifier: (language, DHS id, version).
# If no version is given, it is assumed to be the latest one.
# Here, this corresponds to the page: https://hls-dhs-dss.ch/fr/articles/029462/2016-11-23/
schneckenbundgericht = DhsArticle("fr", "029462", "2016-11-23")
# Bare attribute accesses below echo the identifier triple when this cell is
# run interactively (Jupyter / VS Code "# %%" cells).
schneckenbundgericht.language  # "fr"
schneckenbundgericht.id  # "029462"
schneckenbundgericht.version  # "2016-11-23"
# %%
# It is also possible to create an article directly from its URL; the
# language/id/version triple is then extracted from the URL itself.
georges = DhsArticle(url="https://hls-dhs-dss.ch/fr/articles/044820/2011-12-08/")
# %%
# A DhsArticle initially contains only its language, id and version.
# To download and parse the actual content of the article, use parse_article().
schneckenbundgericht.parse_article()
schneckenbundgericht.page_content # whole html page content obtained by a request to the article's url, can be dropped immediately by adding drop_page=True argument to parse_article()
schneckenbundgericht.title # title of the article
schneckenbundgericht.text_blocks # text blocks of the article, with their corresponding html tag
schneckenbundgericht.text # text of the article, text blocks concatenated with "\n\n"
schneckenbundgericht.text_links # links contained in the article text, organized per text element, see parse_text_links() doc
schneckenbundgericht.bref # list of elements in the "En bref"/"Kurzinformationen"/"Scheda informativa" section of an article
schneckenbundgericht.authors_translators # authors/translators of the article
schneckenbundgericht.sources # sources from "Sources et bibliographie"/"Quellen und Literatur"/"Riferimenti bibliografici" section
schneckenbundgericht.metagrid_id # id on the metagrid network, see https://metagrid.ch/
schneckenbundgericht.metagrid_links # links from metagrid to other databases
schneckenbundgericht.notice_links # links from section "Notices d'autorité"/"Normdateien"/"Controllo di autorità"
schneckenbundgericht.tags # internal DHS links from section "Indexation thématique"/"Systematik"/"Classificazione"
schneckenbundgericht.initial # initial of the article subject used in the article text (e.g. "Zurich" is referred to as "Z." in text), can be None
# %%
# Articles about people have extra fields that get parsed:
# - given name and family name (from the title)
# - birth and death date (from the "bref" section)
georges.parse_article()
if georges.is_person():
    georges.given_name
    georges.family_name
    georges.birth_date
    georges.death_date
# Things that aren't parsed yet but we would like to add:
# - in-text links
# - italic text
# - data tables
# - images/media and their captions
# - section titles
# %%
# Do a naïve search in the DHS, here for "bronschhofen".
# You can give a single string or a list of strings.
# search_for_articles() returns a generator, hence the list() call.
bronschhofen_articles_search = list(DhsArticle.search_for_articles(
    # Fixed typo: was "bronschofen" (missing an "h"), inconsistent with the
    # comment above and the variable name — the municipality is Bronschhofen.
    "bronschhofen"
))
# %%
# Load 13 articles from a search url (here: all ecclesiastic entries).
# Do not forget the trailing "&firstIndex=" ending for the url: the scraper
# appends page offsets to it to paginate through the results.
# max_nb_articles caps how many articles are taken from the result stream.
ecclesiastic_entries = list(DhsArticle.scrape_articles_from_search_url(
    "https://hls-dhs-dss.ch/fr/search/category?text=*&sort=score&sortOrder=desc&collapsed=true&r=1&rows=20&firstIndex=0&f_hls.lexicofacet_string=1%2F006800.009500.&firstIndex=",
    max_nb_articles=13
))
# %%
# Download and parse every element of each bronschhofen search result.
for article in bronschhofen_articles_search:
    article.parse_article()
# %%
# Stream search results to a jsonl file (one JSON article per line),
# writing articles as they are scraped instead of holding them all in memory.
instruments_craftsmen_file = "instruments_craftsmen.jsonl"
instruments_craftsmen = stream_to_jsonl(instruments_craftsmen_file,DhsArticle.scrape_articles_from_search_url(
    "https://hls-dhs-dss.ch/fr/search/category?text=*&sort=score&sortOrder=desc&collapsed=true&r=1&rows=20&f_hls.lexicofacet_string=3%2F000100.132500.134600.135000.&firstIndex="
))
# Load articles back from a jsonl file.
# NOTE(review): "jazzpeople.jsonl" is not created anywhere in this script —
# presumably produced by an earlier run; it must exist for this call to work.
jazzpeople_file = "jazzpeople.jsonl"
jazzpeople = DhsArticle.load_articles_from_jsonl(jazzpeople_file)
# %%
# Scrape the whole french DHS and stream the articles on-the-fly to a jsonl file.
# If the output jsonl file already contains some articles, make sure no
# duplicates are taken by seeding the scraper with the ids already on disk.
# The output jsonl file must already exist.
# Guarded by `if False:` on purpose — this is a long-running full scrape;
# flip the condition to actually run it.
if False:
    lng = "fr"
    all_articles_file = f"dhs_all_articles_{lng}.jsonl"
    # Ids already present in the output file, so they are skipped this run.
    known_ids = set(DhsArticle.get_articles_ids(all_articles_file))
    stream_to_jsonl(
        all_articles_file,
        DhsArticle.scrape_all_articles(
            language=lng,
            parse_articles=False,
            force_language=lng,
            skip_duplicates=True,
            already_visited_ids=known_ids
        ),
        buffer_size=100
    )