forked from iSchool-597PR/2021Fall_finals
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfinal_project.py
1141 lines (985 loc) · 55.2 KB
/
final_project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
IS597 Spring 2021 Final Project
Group members: Kangyang Wang, Wendy Zhu, and Kay Avila
Reads in event data about pandemics and wars, as well as data from the United States Consumer Price Index (CPI),
Gross Domestic Product (GPD), and stock market S&P500 and Dow Jones. Then creates a series of plots combining
these data sets.
"""
import numpy as np
import pandas as pd
from datetime import date
from typing import Union, Literal
import matplotlib.pyplot as plt
def min_max_year_checking(min_year: int = None, min_year_possible: int = None, max_year: Union[int, None] = None,
max_year_possible: int = None) -> None:
""" Throws an error if a given minimum year is less than a minimum year possible, maxinum year is greater than a
maximum year possible (or the current year, if none given), or if the minimum year is greater than the maximum year.
:param min_year: The minimum year value to check
:param min_year_possible: The lowest possible year
:param max_year: The maximum year value to check
:param max_year_possible: The maximum possible year
:return: None but raises a ValueError if one of the criteria isn't met
>>> min_max_year_checking(min_year=2000, max_year=1980) # doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: Invalid years: Minimum year must be less than maximum year.
>>> min_max_year_checking(min_year=1910, min_year_possible=1960) # doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: Invalid minimum year: Minimum year cannot be less than 1960.
>>> min_max_year_checking(min_year=3000) # doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: Invalid minimum year: Minimum year cannot be greater than 2021.
>>> min_max_year_checking(max_year=3000) # doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: Invalid maximum year: Maximum year cannot be greater than 2021.
>>> min_max_year_checking(min_year=1950, max_year=1930) # doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: Invalid years: Minimum year must be less than maximum year.
"""
curr_year = date.today().year
if min_year:
if min_year_possible and min_year < min_year_possible:
raise ValueError('Invalid minimum year: Minimum year cannot be less than {}.'.format(min_year_possible))
elif min_year > curr_year:
raise ValueError('Invalid minimum year: Minimum year cannot be greater than {}.'.format(curr_year))
if max_year:
# Set maximum year to the current year if none given
if not max_year_possible:
max_year_possible = curr_year
if max_year > max_year_possible:
raise ValueError('Invalid maximum year: Maximum year cannot be greater than {}.'.format(max_year_possible))
# Double-check we haven't been given something invalid for years
if (min_year and max_year) and (min_year > max_year):
raise ValueError('Invalid years: Minimum year must be less than maximum year.')
def read_event_facts(filename: str, types: Union[str, list] = None, ranges: Union[str, list] = None,
min_start_year: Union[int, None] = None, max_start_year: Union[int, None] = None,
min_end_year: Union[int, None] = None, max_end_year: Union[int, None] = None) -> pd.DataFrame:
""" Reads in a csv file of events with the below format and optionally filters it by years, types, and ranges.
Converts it into a Pandas dataframe and returns it after calculating a Duration column
Event_Name Type Range Start_Year End_Year Fatalities
Spanish Flu Pandemics Affect United States 1918 1920 >100m
Asian Flu Pandemics Affect United States 1957 1958 1-10m
Hong Kong Flu Pandemics Affect United States 1968 1970 1-10m
London flu Pandemics Affect United States 1972 1973 <10,000
:param filename: The csv file to read in
:param types: The types of events
:param ranges: The geographical range impacted
:param min_start_year: Minimum start year to filter by
:param max_start_year: Maximum start year to filter by
:param min_end_year: Minimum end year to filter by
:param max_end_year: Maximum end year to filter by
:return: A pandas dataframe from the events file
>>> read_event_facts('test.txt') # doctest: +ELLIPSIS
Traceback (most recent call last):
...
FileNotFoundError: [Errno 2] No such file or directory: 'test.txt'
>>> read_event_facts('data/event_facts.csv', min_start_year=2022) # doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: Invalid start year value(s): Invalid minimum year: Minimum year cannot be greater than 2021.
>>> read_event_facts('data/event_facts.csv', min_end_year=1950, max_end_year=1930) # doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: Invalid end year value(s): Invalid years: Minimum year must be less than maximum year.
>>> read_event_facts('data/event_facts.csv', types='Historical Events')
Traceback (most recent call last):
...
ValueError: Invalid type(s) given: Historical Events. Valid type(s): Pandemics, War
>>> read_event_facts('data/event_facts.csv', ranges='South America')
Traceback (most recent call last):
...
ValueError: Invalid range(s) given: South America. Valid ranges(s): Affect United States
>>> df = read_event_facts('data/event_facts.csv', ranges='Affect United States', types=['War'])
>>> df
Event_Name Type ... Fatalities Duration
11 World War I War ... 10-100m 4
12 Korean War War ... 1-10m 3
13 Vietnam War War ... 1-10m 10
14 World War II War ... >100m 6
15 Gulf War War ... 10,000-1m 1
16 Civil war in Afghanistan War ... 10,000-1m 5
17 War on Terror War ... 10,000-1m 20
18 Iraq War War ... 10,000-1m 8
19 War in Somalia War ... 1-10m 15
<BLANKLINE>
[9 rows x 7 columns]
>>> df = read_event_facts('data/event_facts.csv', min_start_year=1900, max_end_year=1930)
>>> df[['Event_Name', 'Start_Year', 'End_Year']]
Event_Name Start_Year End_Year
0 Diphtheria epidemic 1921 1925
2 Spanish Flu (H1N1) 1918 1920
11 World War I 1914 1918
>>> df = read_event_facts('data/event_facts.csv', max_start_year=1990, min_end_year=1991)
>>> df[['Event_Name', 'Start_Year', 'End_Year']]
Event_Name Start_Year End_Year
6 Second measles outbreak 1981 1991
7 HIV AIDS pandemic 1981 2021
15 Gulf War 1990 1991
"""
# Raise an error if one of the optional year parameters given is invalid
try:
min_max_year_checking(min_year=min_start_year, max_year=max_start_year)
except ValueError as e:
raise ValueError('Invalid start year value(s): {}'.format(str(e)))
try:
min_max_year_checking(min_year=min_end_year, max_year=max_end_year)
except ValueError as e:
raise ValueError('Invalid end year value(s): {}'.format(str(e)))
df = pd.read_csv(filename, usecols=['Event_Name', 'Type', 'Range', 'Start_Year', 'End_Year', 'Fatalities'],
dtype={'Event_Name': 'string', 'Type': 'string', 'Range': 'string', 'Start_Year': 'int16',
'End_Year': 'int16', 'Fatalities': 'string'})
# If types were given, convert to a list and error if invalid ones are given
if types:
if isinstance(types, str):
types = [types]
# Find any given types that are not actually present in the dataset
valid_types = list(df['Type'].unique())
invalid_types = np.setdiff1d(types, valid_types)
if len(invalid_types) > 0:
raise ValueError('Invalid type(s) given: ' + ', '.join(invalid_types) + '. Valid type(s): ' +
', '.join(valid_types))
relevant_types = df['Type'].isin(types) # Filter down to the ones we want
df = df[relevant_types]
# If ranges are given, convert to a list and error if invalid ones are given
if ranges:
if isinstance(ranges, str):
ranges = [ranges]
# Find any given types that are not actually present in the dataset
valid_ranges = list(df['Range'].unique())
invalid_ranges = np.setdiff1d(ranges, valid_ranges)
if len(invalid_ranges) > 0:
raise ValueError('Invalid range(s) given: ' + ', '.join(invalid_ranges) + '. Valid ranges(s): ' +
', '.join(valid_ranges))
relevant_ranges = df['Range'].isin(ranges) # Filter down to the ones we want
df = df[relevant_ranges]
# Filter by starting year if applicable
if min_start_year or max_start_year:
# Find the minimum and maximum years if one of these wasn't given as a parameter
if not min_start_year:
min_start_year = int(df['Start_Year'].min())
elif not max_start_year:
max_start_year = int(df['Start_Year'].max())
df = df[df['Start_Year'] >= min_start_year]
df = df[df['Start_Year'] <= max_start_year]
# Filter by ending year if applicable
if min_end_year or max_end_year:
# Find the minimum and maximum years if one of these wasn't given as a parameter
if not min_end_year:
min_end_year = int(df['End_Year'].min())
elif not max_end_year:
max_end_year = int(df['End_Year'].max())
df = df[df['End_Year'] >= min_end_year]
df = df[df['End_Year'] <= max_end_year]
# Add duration column
df["Duration"] = df["End_Year"] - df["Start_Year"]
return df
def add_time_range(e_df: pd.DataFrame, t0: Literal['start_year', 'end_year', 'year_before_end_year',
'year_after_start_year'], length: int,
add_extra_yr_before: bool = False) -> pd.DataFrame:
"""
This function is used to add two columns ("y_start", "y_end") into the event_fact dataframe based on the selection
of year t0 and the number of years before and after year t0 for further plotting and study.
:param e_df: the pd.DataFrame storing the event facts
:param t0: the specific year used as the zero point in selecting the time range, the value of t0 could be one of
["start year", "end year", "the year before end year", "the year after start year"]
:param length: the number of years to study before and after t0
:param add_extra_yr_before: Indicates whether an extra year should be added before the beginning
(to account for pct_change)
:return: the event facts dataframe with two extra columns storing the start year and end year for further study
>>> df = pd.DataFrame({'Events': ['Event A', 'Event B', 'Event C'], 'Start_Year': [1950, 1999, 2001], \
'End_Year': [1950, 2000, 2010]})
>>> add_time_range(df, 'invalid_input', 5) # doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: y0 must be one of start_year, end_year, year_before_end_year, year_after_start_year
>>> result = add_time_range(df, 'start_year', 5, add_extra_yr_before=True)
>>> result.head()
Events Start_Year End_Year y_start y_end
0 Event A 1950 1950 1944 1955
1 Event B 1999 2000 1993 2004
2 Event C 2001 2010 1995 2006
>>> result = add_time_range(df, 'end_year', 5)
>>> result.head()
Events Start_Year End_Year y_start y_end
0 Event A 1950 1950 1945 1955
1 Event B 1999 2000 1995 2005
2 Event C 2001 2010 2005 2015
>>> result = add_time_range(df, 'year_after_start_year', 2)
>>> result.head()
Events Start_Year End_Year y_start y_end
0 Event A 1950 1950 1949 1953
1 Event B 1999 2000 1998 2002
2 Event C 2001 2010 2000 2004
>>> result = add_time_range(df, 'year_before_end_year', 0)
>>> result.head()
Events Start_Year End_Year y_start y_end
0 Event A 1950 1950 1949 1949
1 Event B 1999 2000 1999 1999
2 Event C 2001 2010 2009 2009
"""
zero_points = ["start_year", "end_year", "year_before_end_year", "year_after_start_year"]
# Select starting year (y0) based on the t0 parameter
if t0 == zero_points[0]:
y0 = e_df["Start_Year"]
elif t0 == zero_points[1]:
y0 = e_df["End_Year"]
elif t0 == zero_points[2]:
y0 = e_df["End_Year"] - 1
elif t0 == zero_points[3]:
y0 = e_df["Start_Year"] + 1
else:
raise ValueError('y0 must be one of ' + ', '.join(zero_points))
if add_extra_yr_before:
e_df["y_start"] = y0 - length - 1
else:
e_df["y_start"] = y0 - length
e_df["y_end"] = y0 + length
e_df.astype({'y_start': 'int16', 'y_end': 'int16'})
return e_df
def trim_to_years(df: pd.DataFrame, start_year: int, end_year: int, year_col_name: str = 'Year',
pad: Literal[None, 'zero', 'nan'] = None, pad_col_name: Union[None, str] = None) -> pd.DataFrame:
""" Given a dataframe with a year column, filters the dataframe down to the range created by the starting
and ending years specified as paramters. If a pad option is specified, then years for which data does not exist
are padded with the value corresponding to the option.
:param df: A dataframe with some type of year column
:param year_col_name: The name of the column holding the years (defaults to 'Year')
:param start_year: The beginning of the years to include
:param end_year: The last year to include
:param pad: Value to insert for years that have no data
:param pad_col_name: If pad is specified, the value column in which to add padding
:return: An updated dataframe with only the years between the two specified, with optional padding
>>> df = pd.DataFrame({'Years': list(range(1990, 2001)), 'Values': list(range(0, 21, 2))})
>>> trim_to_years(df, 2000, 1999, 'Years') # doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: Invalid years: Start year must be less than end year.
>>> trim_to_years(df, 1995, 2000, pad='none') # doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: If pad is specified, pad_col_name must be given.
>>> trim_to_years(df, 1995, 2000, pad='fill', pad_col_name='Value') # doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: Value for pad must be one of: zero, nan
>>> results = trim_to_years(df, 1995, 2020, 'Years')
>>> print(results)
Years Values
5 1995 10
6 1996 12
7 1997 14
8 1998 16
9 1999 18
10 2000 20
>>> results = trim_to_years(df, 1900, 1910, 'Years')
>>> len(results)
0
>>> results = trim_to_years(df, 1989, 1992, 'Years', pad='zero', pad_col_name='Values')
>>> print(results)
Years Values
0 1989 0
1 1990 0
2 1991 2
3 1992 4
>>> results = trim_to_years(df, 1988, 2003, 'Years', pad='nan', pad_col_name='Values')
>>> results.tail()
Years Values
11 1999 18.0
12 2000 20.0
13 2001 NaN
14 2002 NaN
15 2003 NaN
>>> results.head()
Years Values
0 1988 NaN
1 1989 NaN
2 1990 0.0
3 1991 2.0
4 1992 4.0
"""
# Don't use min_max_year_checking() because we want to allow invalid years (for padding)
if start_year > end_year:
raise ValueError('Invalid years: Start year must be less than end year.')
pad_options = {'zero': 0, 'nan': np.NaN}
if pad:
# Raise an error if pad is specified but no value column to pad
if not pad_col_name:
raise ValueError('If pad is specified, pad_col_name must be given.')
if pad not in pad_options.keys():
raise ValueError('Value for pad must be one of: ' + ', '.join(pad_options))
else:
pad_value = pad_options[pad]
pad = True
df_selected = df.loc[df[year_col_name] >= start_year]
df_selected = df_selected[df_selected[year_col_name] <= end_year]
if pad:
# Given from above that both of these values are filled
assert pad_value in pad_options.values()
assert pad_col_name is not None
# Find the first and last years for which there is data available
min_year_possible = df[year_col_name].min()
max_year_possible = df[year_col_name].max()
if start_year < min_year_possible:
start_pad_df = pd.DataFrame({year_col_name: list(range(start_year, min_year_possible)),
pad_col_name: [pad_value for _ in range(start_year, min_year_possible)]})
df_selected = pd.concat([start_pad_df, df_selected])
if end_year > max_year_possible:
# Slightly different from start/min, because we need to add values beginning with the next year after the
# maximum year available, and pad through the end_year specified as a parameter
end_pad_df = pd.DataFrame({year_col_name: list(range(max_year_possible + 1, end_year + 1)),
pad_col_name: [pad_value for _ in range(max_year_possible + 1, end_year + 1)]})
df_selected = pd.concat([df_selected, end_pad_df])
# Reset the index so all index values are unique
df_selected.reset_index(inplace=True, drop=True)
return df_selected
def add_mean_and_quartiles(df: pd.DataFrame) -> pd.DataFrame:
"""
Takes a dataframe with a series of values in columns and adds new columns for the median and quartiles.
:param df: The dataframe with values in columns
:return: An updated dataframe with 25pct, median, and 75pct columns
>>> df = pd.DataFrame({'Col1': [0, 1, 2, 3], 'Col2': [1, 1, 1, 1], 'Col3': [1, 2, 4, 8]})
>>> print(add_mean_and_quartiles(df))
Col1 Col2 Col3 mean median 25pct 75pct
0 0 1 1 0.666667 1.0 0.5 1.0
1 1 1 2 1.333333 1.0 1.0 1.5
2 2 1 4 2.333333 2.0 1.5 3.0
3 3 1 8 4.000000 3.0 2.0 5.5
"""
# Get statistics on the df, then grab the appropriate columns from them
stats_df = df.apply(pd.DataFrame.describe, axis=1)
df['mean'] = stats_df['mean']
df['median'] = stats_df['50%'] # median is equivalent to 50% percentile
df['25pct'] = stats_df['25%']
df['75pct'] = stats_df['75%']
return df
def adjust_index(df: pd.DataFrame) -> None:
""" Takes a dataframe with regular range index and updates it so that it goes from negative numbers from zero,
to zero, to numbers past zero. Only works with an odd number of rows.
:param df: The dataframe whose index should be adjusted
:return: None
>>> df = pd.DataFrame().from_dict([x for x in range(0, 6)])
>>> adjust_index(df) # doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: Dataframe provided must have an odd number of rows.
>>> df = pd.DataFrame().from_dict([x for x in range(1, 6)])
>>> adjust_index(df)
>>> print(df)
0
-2 1
-1 2
0 3
1 4
2 5
"""
current_index = df.index
# This only works for an odd number of rows - throw an error if even
if len(current_index) % 2 == 0:
raise ValueError('Dataframe provided must have an odd number of rows.')
range_from_zero = int(current_index.stop/2 - .5)
df.index = pd.RangeIndex(start=range_from_zero*-1, stop=range_from_zero+1)
def read_worlddb_gdp(filename: str, min_year: Union[int, None] = None, max_year: Union[int, None] = None,
countries: Union[list, None] = None) -> pd.DataFrame:
""" Takes a csv file from the World Data Bank containing GDP information from various countries. Original data
can be obtained from https://databank.worldbank.org/reports.aspx?source=2&type=metadata&series=NY.GDP.MKTP.CD#
:param countries: A list of countries to include, using their country codes
:param min_year: The starting year to include data from (otherwise data from the beginning to max_year is returned)
:param max_year: The ending year to include data from (otherwise data from min_year to current is returned)
:param filename: The World Data Bank csv file of GDP data
:return: A pandas dataframe containing the GDP data for the countries and years specified
>>> read_worlddb_gdp('test.txt') # doctest: +ELLIPSIS
Traceback (most recent call last):
...
FileNotFoundError: [Errno 2] No such file or directory: 'test.txt'
>>> df = read_worlddb_gdp('data/WorldDataBank-GDP.csv')
>>> df.head() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
Country Name Country Code ... 2019 2020
0 Afghanistan AFG ... 19291104007.6135 19807067268.1084
1 Albania ALB ... 15286612572.6895 14799615097.1008
2 Algeria DZA ... 171157803367.473 145163902228.168
3 American Samoa ASM ... 638000000 NaN
4 Andorra AND ... 3155065487.51819 NaN
<BLANKLINE>
[5 rows x 63 columns]
>>> df.tail() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
Country Name ... 2020
261 Sub-Saharan Africa ... 1687597704644.75
262 Sub-Saharan Africa (excluding high income) ... 1686475026320.97
263 Sub-Saharan Africa (IDA & IBRD countries) ... 1687597704644.75
264 Upper middle income ... 23104877770162.5
265 World ... 84577962952008.3
<BLANKLINE>
[5 rows x 63 columns]
>>> df = read_worlddb_gdp('data/WorldDataBank-GDP.csv', countries=['IRL', 'IMN', 'GBR'])
>>> df.head() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
Country Name Country Code ... 2019 2020
93 Ireland IRL ... 399122063504.148 425888950992.003
94 Isle of Man IMN ... NaN NaN
205 United Kingdom GBR ... 2830813507746.87 2707743777173.91
<BLANKLINE>
[3 rows x 63 columns]
>>> df = read_worlddb_gdp('data/WorldDataBank-GDP.csv', min_year=1984, max_year=1985)
>>> df.head()
Country Name Country Code 1984 1985
0 Afghanistan AFG NaN NaN
1 Albania ALB 1857338011.85488 1897050133.42015
2 Algeria DZA 53698278905.9678 57937868670.1937
3 American Samoa ASM NaN NaN
4 Andorra AND 330070689.298282 346737964.774951
>>> df = read_worlddb_gdp('data/WorldDataBank-GDP.csv', min_year=1975, max_year=1975, countries=['USA'])
>>> print(df)
Country Name Country Code 1975
206 United States USA 1684904000000
"""
# Raise an error if the years requested are outside of the year bounds
min_max_year_checking(min_year=min_year, min_year_possible=1960, max_year=max_year,
max_year_possible=(date.today().year - 1))
# Rather than specifying by year, be more usable for the future by dropping unneeded columns
df = pd.read_csv(filename, header=0, dtype={'Country Name': 'string', 'Country Code': 'string'})
df.drop(columns=['Series Name', 'Series Code'], inplace=True)
# Trim off the excess year header data (e.g. [YR1971]), and set correct type for the years
col_names = list(df.columns)
year_type_dict = {}
for i, name in enumerate(col_names):
if name.endswith(']'):
col_names[i] = int(col_names[i].split(' ')[0])
year_type_dict[col_names[i]]: 'int16'
df.columns = col_names
df = df.astype(year_type_dict)
df = df[df['Country Code'].notna()] # Drop rows without country codes
df = df.replace('..', np.nan) # Convert the '..'s to NaNs
if countries:
countries = [c.upper() for c in countries] # Make sure all country codes are uppercase
relevant_countries = df['Country Code'].isin(countries) # Filter down to the ones we want
df = df[relevant_countries]
if max_year:
max_year_pos = df.columns.get_loc(int(max_year))
df = df.iloc[:, 0:(max_year_pos + 1)]
# Minimum year is trickier because we need to keep country info in the first two cols
if min_year:
min_year_pos = df.columns.get_loc(int(min_year))
df_right = df.iloc[:, min_year_pos:] # Grab the year columns
df_left = df.iloc[:, 0:2] # These are the country name and country code
# Append the year columns on the right to the country name/codes on the left
df_left[df_right.columns] = df_right.values
df = df_left
return df
def get_gdp_info(us_gdp: pd.DataFrame, df: pd.DataFrame):
"""
Get GDP info from US GDP data for each event in df.
:param us_gdp: US GDP data
:param df: the gdp dataframe for selected events
:return:
>>> pandemics_gdp = read_event_facts('data/event_facts.csv', types='Pandemics')
>>> us_gdp_test = pd.read_csv('data/gdp_usafacts.csv')
>>> get_gdp_info(us_gdp_test, pandemics_gdp)
"""
# go through event file and get start_year, end_year
for index, row in df.iterrows():
event_name = row['Event_Name']
start_year = int(row['Start_Year'])
before_event = start_year - 10
end_year = int(row['End_Year'])
after_event = end_year + 10
# If start year less than min_year, start from 1929
if start_year >= 1929:
if end_year > 2020:
end_year = 2020
if after_event > 2020:
after_event = 2020
if before_event < 1929:
before_event = 1929
# Slice GDP df according to event
event_gdp = us_gdp.loc[[0, 124, 131, 152], str(before_event): str(after_event)]
event_gdp = event_gdp.apply(pd.to_numeric) # convert all columns of DataFrame
event_gdp = event_gdp.T
event_gdp.columns = ['GDP', 'Personal consumption expenditures',
'Gross private domestic investment',
'Government consumption expenditures and gross investment']
end_interval = end_year - before_event
start_interval = start_year - before_event
# call plot_gdp for each event
plot_gdp(event_gdp, event_name, end_interval, start_interval)
def plot_gdp(gdp_df: pd.DataFrame, event_name: str, end_interval: int, start_interval: int):
"""
Plot gdp trend for given events.
:param gdp_df: the gdp dataframe for selected events
:param event_name: event name for plotting
:param end_interval: end year of an event
:param start_interval: start year of an event
:return: plot of the given dataframe
>>> us_gdp_df = pd.read_csv('data/gdp_usafacts.csv', header=0)
>>> event_gdp = us_gdp_df.loc[0, '1947': '1968']
>>> event_gdp = event_gdp.to_frame()
>>> plot_gdp(event_gdp, "Test Flu", 4, 2)
>>> image = plt.imread('Plots/GDP/Test Flu.png')
>>> print(image[0][0]) # doctest: +ELLIPSIS
[1. 1. 1. 1.]
"""
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(gdp_df)
ax.set_xlabel("Year")
ax.set_ylabel("Gross domestic product ($)")
ax.ticklabel_format(style='plain', axis='y')
x_bounds = ax.get_xlim()
y_bounds = ax.get_ylim()
ax.vlines(end_interval, y_bounds[0], y_bounds[1], colors='red', linestyles='dashed')
ax.annotate(text='End Year', xy=(end_interval + 0.5, (y_bounds[0] + y_bounds[1]) * 2 / 3))
ax.vlines(start_interval, y_bounds[0], y_bounds[1], colors='green', linestyles='solid')
ax.annotate(text='Start Year', xy=(start_interval - 2, (y_bounds[0] + y_bounds[1]) * 2 / 3))
plt.title("GDP fluctuations for " + event_name)
plt.xticks(rotation=45) # Rotates X-Axis Ticks by 45-degrees
plt.legend(['GDP', 'Personal consumption expenditures', 'Gross private domestic investment',
'Government consumption expenditures and gross investment'], loc='upper left')
plt.savefig('Plots/GDP/' + event_name + '.png')
def read_us_cpi(filename: str, min_year: Union[int, None] = None, max_year: Union[int, None] = None) -> pd.DataFrame:
""" Loads a CVS file with CPI information from the US Bureau of Labor Statistics into a Pandas dataframe. Also
averages all the month values for each year in order to only return one value for each year.
Data sets available at https://beta.bls.gov/dataViewer/view/timeseries/CUUR0000SA0
:param filename:
:param min_year:
:param max_year:
:return:
>>> read_us_cpi('test.txt') # doctest: +ELLIPSIS
Traceback (most recent call last):
...
FileNotFoundError: [Errno 2] No such file or directory: 'test.txt'
>>> df = read_us_cpi('data/bls_us_cpi.csv', min_year=2015)
>>> df.head()
Year Value
102 2015 237.016998
103 2016 240.007172
104 2017 245.119583
105 2018 251.106827
106 2019 255.657425
>>> df = read_us_cpi('data/bls_us_cpi.csv', max_year=1950)
>>> df.tail()
Year Value
33 1946 19.516666
34 1947 22.325001
35 1948 24.041666
36 1949 23.808332
37 1950 24.066668
>>> df = read_us_cpi('data/bls_us_cpi.csv', min_year=1990, max_year=1993)
>>> print(df)
Year Value
77 1990 130.658325
78 1991 136.191666
79 1992 140.316666
80 1993 144.458328
"""
# Raise an error if the years requested are outside of the year bounds
if min_year or max_year:
min_max_year_checking(min_year=min_year, max_year=max_year)
# Use float32 instead of float16 due to bug with float16 and percentages -
# see https://github.com/pandas-dev/pandas/issues/9220
df = pd.read_csv(filename, header=0, usecols=['Year', 'Value'], dtype={'Year': 'int16', 'Value': 'float32'})
# Calculate the averages for each year, and keep year as a column, not an index
df = df.groupby('Year').mean()
df.reset_index(inplace=True)
# If either minimum or maximum year was specified, filter down on these
if min_year or max_year:
# Find the minimum and maximum years if one of these wasn't given as a parameter
if not min_year:
min_year = int(df['Year'].min())
elif not max_year:
max_year = int(df['Year'].max())
df = df[df['Year'] >= min_year]
df = df[df['Year'] <= max_year]
return df
def add_cpi_values(event_df: pd.DataFrame, cpi_df: pd.DataFrame) -> pd.DataFrame:
""" Takes a dataframe with an Event, End_Year, y_start, and y_end columns, and a dataframe with 'Year' and 'Value'
column with CPI data, and returns a dataframe with the percentage change for each of those years for each event.
Note: Each event must have an equal length between their start and end years
:param event_df: A dataframe with 'Event_Name' column, and columns with 'y_start' and 'y_end' year
:param cpi_df: A dataframe with CPI data with a 'Year' column and 'Value' column
:return: A dataframe with the CPI percentage change for each year given for each event
>>> args = {'Event_Name': ['Event A', 'Event B'], 'y_start': [1990, 1994], 'y_end': [2000, 2002], 'End_Year': [1992, 2001]}
>>> events_df = pd.DataFrame(args)
>>> cpi_df = pd.DataFrame({'Year': list(range(1990, 2002)), 'Value': [x * x for x in range(1, 13)]})
>>> add_cpi_values(events_df, cpi_df) # doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: All events must have the same number of years between their start and end years.
>>> args = {'Event_Name': ['Event A', 'Event B'], 'y_start': [1990, 1994], 'y_end': [2000, 2004], 'End_Year': [1992, 2002]}
>>> events_df = pd.DataFrame(args)
>>> df = add_cpi_values(events_df, cpi_df)
>>> print(df)
Event A (ended 1992) Event B (ended 2002)
1 300.000000 44.000000
2 125.000000 36.111111
3 77.777778 30.612245
4 56.250000 26.562500
5 44.000000 23.456790
6 36.111111 21.000000
7 30.612245 19.008264
8 26.562500 NaN
9 23.456790 NaN
10 21.000000 NaN
"""
results = {}
for _, row in event_df.iterrows():
event = row['Event_Name']
plot_start_year = row['y_start']
plot_end_year = row['y_end']
event_end_year = row['End_Year']
# Select all the CPI values between the beginning and ending years
cpi_df_selected = trim_to_years(cpi_df, plot_start_year, plot_end_year, 'Year', pad='nan', pad_col_name='Value')
# Calculate the percentage changes. Value must be changed to float32 away from float16 due to a bug -
# see https://github.com/pandas-dev/pandas/issues/9220
if cpi_df_selected.dtypes['Value'] == 'float16':
cpi_df_selected = cpi_df_selected.astype({'Value': 'float32'})
# pct_change() will put in zeros from a value to NaN and we don't want that
# Save the nan positions, calculate pct_change, and then put all the NaNs back in
nan_value_positions = cpi_df_selected['Value'].isnull()
percent_change_df = cpi_df_selected['Value'].pct_change() * 100
percent_change_df[nan_value_positions] = np.nan
results['{} (ended {})'.format(event, event_end_year)] = percent_change_df.tolist()
# Some results will have different values because e.g. there haven't been x number of years passed since the end
try:
results_df = pd.DataFrame.from_dict(results)
except ValueError:
raise ValueError('All events must have the same number of years between their start and end years.')
results_df = results_df.iloc[1:, :] # drop first row in the dataframe since the values are NA
return results_df
def get_index(df_selected: pd.DataFrame, df_index: pd.DataFrame, data_type: str) -> pd.DataFrame:
"""
Get the selected type ("nominal" or "real") of the market index monthly data for the selected events.
:param df_selected: the selected events dataframe
:param df_index: the given market index historical data
:param data_type: could be "nominal" or "real"("real" is inflation adjusted "nominal" index data)
:return: the dataframe of the market index data from y_start year to y_end year for each selected events
>>> s = {'Event_Name': ['Event A', 'Event B', 'Event C'], 'y_start': [1909, 1990, 1994], 'y_end': [1911, 2000, 2002]}
>>> sdf = pd.DataFrame(s)
>>> idf = pd.DataFrame({'year': list(range(1930, 2020)), 'nominal': [x * x * 100 for x in range(1, 91)], 'real': [y * y for y in range(1, 91)]})
>>> get_index(sdf, idf, 'real')# doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: All arrays must be of the same length
>>> s1 = {'Event_Name': ['Event A', 'Event B'], 'y_start': [1990, 1994], 'y_end': [2000, 2004]}
>>> sdf1 = pd.DataFrame(s1)
>>> get_index(sdf1, idf, 'real')
['Event A', 'Event B']
Event A Event B
1 0.033056 0.031006
2 0.032518 0.030533
3 0.031998 0.030074
4 0.031494 0.029628
5 0.031006 0.029196
6 0.030533 0.028776
7 0.030074 0.028367
8 0.029628 0.027971
9 0.029196 0.027585
10 0.028776 0.027210
>>> get_index(sdf1, idf, 'nominal')
['Event A', 'Event B']
Event A Event B
1 0.033056 0.031006
2 0.032518 0.030533
3 0.031998 0.030074
4 0.031494 0.029628
5 0.031006 0.029196
6 0.030533 0.028776
7 0.030074 0.028367
8 0.029628 0.027971
9 0.029196 0.027585
10 0.028776 0.027210
"""
index_dict = {}
df_selected = df_selected.loc[df_selected["y_start"] >= 1928] # the earliest data available is in 1927/12
df_selected = df_selected.loc[df_selected["y_end"] < 2021] # the latest data for available is in 2021/11
event_list = df_selected["Event_Name"].tolist()
print(event_list)
for event in event_list:
row = df_selected[df_selected["Event_Name"] == event]
# Find the beginning and end years
start = row["y_start"].values[0]
end = row["y_end"].values[0]
df_index["year"] = df_index["year"].astype(int)
# Select all SP500/DJ values between the two beginning and ending years
index_selected = trim_to_years(df_index, start, end, 'year')
# Calculate the percentage changes
if data_type == "nominal":
pct_change = index_selected["nominal"].pct_change().tolist()
else:
pct_change = index_selected["real"].pct_change().tolist()
# Add all the resulting values to the dict, by event
index_dict[event] = pct_change
# Add the dict to the dataframe
final_df = pd.DataFrame.from_dict(index_dict)
final_df = final_df.iloc[1:, :] # drop first row in the dataframe since the values are NA
return final_df
def plot_sp_dj(df1: pd.DataFrame, df2: pd.DataFrame, year_num: int, plot_name: str):
"""
Plot the given sp500 and dow jones for selected events dataframe.
:param df1: the dataframe with sp500 data for selected events
:param df2: the dataframe with dow jones data for selected events
:param year_num: the years before and after the selected zero point
:param plot_name: the string name of the plot
:return: plot of the given dataframe
"""
# change index of data to make the January of "Year Zero" as 0 in x-axis.
df1.index = df1.index - (12 * year_num)
df2.index = df2.index - (12 * year_num)
# get the 25 and 75 percentile bounds for plotting
df1 = add_mean_and_quartiles(df1)
df2 = add_mean_and_quartiles(df2)
fig, (ax3, ax4) = plt.subplots(2, sharex=True, figsize=(10, 5))
fig.suptitle('Change of Stock Market Indexes')
# ax1.plot(df1, linewidth=0.5)
# ax1.set_ylabel("Change of SP500", fontsize = 'x-small')
# ax2.plot(df2, linewidth=0.5)
# ax2.set_ylabel("Change of Dow Jones", fontsize = 'x-small')
ax3.plot(df1.index, df1["75pct"], color='black', label='75% percentile', linewidth=0.5)
ax3.plot(df1.index, df1["25pct"], color='black', label='25% percentile', linewidth=0.5)
ax3.plot(df1.index, df1["median"], '--', color='orange', label='median', linewidth=0.5)
ax3.hlines(y=0, xmin=- (12 * year_num), xmax=12 * (year_num + 1), linewidth=2, color='r')
ax3.vlines(x=0, ymin=-0.1, ymax=0.1, linestyles='dashed', linewidth=2, color='r')
ax3.vlines(x=11, ymin=-0.1, ymax=0.1, linestyles='dashed', linewidth=2, color='r')
ax3.fill_between(df1.index, df1["75pct"], df1["25pct"], facecolor='lightgreen')
ax3.set_ylabel("Range of SP500", fontsize='x-small')
ax4.plot(df2.index, df2["75pct"], color='black', label='75% percentile', linewidth=0.5)
ax4.plot(df2.index, df2["25pct"], color='black', label='25% percentile', linewidth=0.5)
ax4.plot(df2.index, df2["median"], '--', color='orange', label='median', linewidth=0.5)
ax4.hlines(y=0, xmin=- (12 * year_num), xmax=12 * (year_num + 1), linewidth=2, color='r')
ax4.vlines(x=0, ymin=-0.1, ymax=0.1, linestyles='dashed', linewidth=2, color='r')
ax4.vlines(x=11, ymin=-0.1, ymax=0.1, linestyles='dashed', linewidth=2, color='r')
ax4.fill_between(df2.index, df2["75pct"], df2["25pct"], facecolor='lightblue')
ax4.set_xlim(-12 * year_num + 1, 12 * (year_num + 1))
ax4.set_xlabel(str(year_num) + " Year Before and After Events (month)")
ax4.set_ylabel("Range of Dow Jones", fontsize='x-small')
plt.savefig('Plots/StockIndex/' + plot_name + '.png', dpi=200)
def output_sp_dj(df_e: pd.DataFrame, df_sp: pd.DataFrame, df_dj: pd.DataFrame, zero_point: str, year_l: int,
d_type: str):
"""
manage plots of stock market indexes by changing parameters for event selection criteria.
:param df_e: the detailed event facts in pd.DataFrame
:param df_sp: the given historical SP500 data in pd.DataFrame
:param df_dj: the given historical Dow Jones data in pd.DataFrame
:param zero_point: the specific year used as the zero point in selecting the time range, the value of t0 could be one of
["start year", "end year", "the year before end year", "the year after start year"]
:param year_l: the number of years to study before and after the year used as "zero point"
:param d_type: the type of SP500 or Dow Jones historical data to study, could be "real" or "nominal"
:return: plots for specified event selection criteria
"""
df_e = add_time_range(df_e, zero_point, year_l)
name_str = str(year_l) + "y_" + zero_point + "_" + d_type
print("The evolution of {} SP500 and Dow Jones {} years before and after all the Pandemics:".format(d_type, year_l))
selected_p = df_e.loc[df_e["Type"] == "Pandemics"]
p1_df = get_index(selected_p, df_sp, d_type)
p2_df = get_index(selected_p, df_dj, d_type)
plot_sp_dj(p1_df, p2_df, year_l, name_str + "_all_pandemics")
print("The evolution of {} SP500 and Dow Jones {} years before and after all the Wars:".format(d_type, year_l))
selected_w = df_e.loc[df_e["Type"] == "War"]
w1_df = get_index(selected_w, df_sp, d_type)
w2_df = get_index(selected_w, df_dj, d_type)
plot_sp_dj(w1_df, w2_df, year_l, name_str + "_all_wars")
print("The evolution of {} SP500 and Dow Jones {} years before and after Pandemics with over 1m fatalities:"
.format(d_type, year_l))
selected_1m = df_e.loc[df_e.Fatalities.isin(["1-10m", "10-100m", ">100m"])]
selected_1m_p = selected_1m.loc[selected_1m["Type"] == "Pandemics"]
p1_df_1 = get_index(selected_1m_p, df_sp, d_type)
p2_df_1 = get_index(selected_1m_p, df_dj, d_type)
plot_sp_dj(p1_df_1, p2_df_1, year_l, name_str + "_pandemics_over_1m_fatalities")
print("The evolution of {} SP500 and Dow Jones {} years before and after Wars with over 1m fatalities:"
.format(d_type, year_l))
selected_1m_w = selected_1m.loc[selected_1m["Type"] == "War"]
w1_df_1 = get_index(selected_1m_w, df_sp, d_type)
w2_df_1 = get_index(selected_1m_w, df_dj, d_type)
plot_sp_dj(w1_df_1, w2_df_1, year_l, name_str + "_wars_over_1m_fatalities")
def plot_cpi(df: pd.DataFrame, plot_name: str, title: str, x_label: str, y_label: str, plot_quartiles: bool = False,
plot_mean: bool = False) -> None:
"""
Takes a Pandas dataframe with event and CPI data and plots it, with optional quartiles and mean value.
:param df: A dataframe with CPI values for various events as the column titles and their values by relative year below
:param plot_name: The file name to be used for the plot
:param title: The title to be used for the plot
:param x_label: The x label to be used for the plot
:param y_label: The y label to be used for the plot
:param plot_quartiles: Whether quartiles (25%, 50% aka mean, and 75%) should be plotted - requires these as columns
:param plot_mean: Whether the the mean should be plotted - requires this as a column
:return: None
"""
# Create an empty graph and axes
figure, axes = plt.subplots(figsize=(15, 10))
# Either regular values can be plotted, or the quartiles and/or mean
if plot_quartiles or plot_mean:
if plot_quartiles:
axes.plot(df.index, df['75pct'], color='black', label='75% percentile', linewidth=1)
axes.plot(df.index, df['25pct'], color='black', label='25% percentile', linewidth=1)
axes.plot(df.index, df['median'], color='darkgreen', label='median', linewidth=1)
axes.fill_between(df.index, df['75pct'], df['25pct'], facecolor='lightgreen')
if plot_mean:
axes.plot(df.index, df['mean'], '-.', color='purple', label='mean', linewidth=1)
axes.set_xlabel(x_label)
axes.set_ylabel(y_label)
axes.set_title(title)
axes.set_xticks(df.index.to_list())
axes.legend()
else:
# Plotting normal values
df.plot(ax=axes, xlabel=x_label, ylabel=y_label, title=title, xticks=df.index.to_list())
# Save to disk
figure.savefig(plot_name, dpi=200)
def plot_all_cpi_graphs(pandemics_cpi_df, wars_cpi_df):
""" Takes dataframes with CPI information as well as pandemics and wars and is responsible for configuring
all of the plots for these.
:param pandemics_cpi_df: The dataframe with pandemic and CPI information
:param wars_cpi_df: THe dataframe with war and CPI information
:return:
"""
# Plot individual events as lines
plot_cpi(pandemics_cpi_df, 'Plots/CPI/all_pandemics.png', title='Individual Pandemics vs CPI Change',
x_label='Years +/- End of Pandemic', y_label='Year on Year CPI % Change')
plot_cpi(wars_cpi_df, 'Plots/CPI/all_wars.png', title='Individual Wars vs CPI Change',
x_label='Years +/- End of War', y_label='Year on Year CPI % Change')
# Calculate quartiles so these can be plotted
pandemics_cpi_df = add_mean_and_quartiles(pandemics_cpi_df)
wars_cpi_df = add_mean_and_quartiles(wars_cpi_df)
# Plot averages for pandemics (quartiles and mean)
plot_cpi(pandemics_cpi_df, 'Plots/CPI/pandemics_quartiles_mean.png', title='Averaged Pandemics vs CPI Change '
'Quartiles and Mean',
x_label='Years +/- End of Pandemic',
y_label='Year on Year CPI % Change', plot_quartiles=True, plot_mean=True)
# Plot averages for wars (quartiles and mean)
plot_cpi(wars_cpi_df, 'Plots/CPI/wars_quartiles_mean.png', title='Averaged Wars vs CPI Change Quartiles and Mean',
x_label='Years +/- End of War',
y_label='Year on Year CPI % Change', plot_quartiles=True, plot_mean=True)
def analyze_gdp(gdp_file: str, events_file: str) -> None:
"""
For each event, select gdp data 10 year before and after, and pass the df to plot_gdp().
:param gdp_file: gdp data file name
:param events_file: events file name
:return:
"""
# read in us gdp file
us_gdp_df = pd.read_csv(gdp_file, header=0)
# separate pandemics and wars gdp dataframe
pandemics_gdp = read_event_facts(events_file, types='Pandemics')
wars_gdp = read_event_facts(events_file, types='War')
# get gdp_info for each pandemic/war events
get_gdp_info(us_gdp_df, pandemics_gdp)