-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScrape Paystubs.py
485 lines (398 loc) · 22.6 KB
/
Scrape Paystubs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
""" Scrape Text from Pay Stubs """
import glob
import pdfplumber
import pandas as pd
from tabulate import tabulate
import matplotlib.pyplot as plt
import numpy as np
def scrape_pay_stubs(folder):
"""
Indexing notes:
string.find(i) - returns index position of the first appearance of the desired keyword
len(i) - returns length of keyword string; to get to the index number at the end of the entire keyword
string.find('<end>', string.find(i)) - returns index when <end> appears, starting at the keyword index instead of whole string
string[(string.find(i) + len(i)):(string.find('<end>', string.find(i)))] - returns desired value based on index
"""
out = [] # Empty list to store information from all PDFs
for file in folder:
with pdfplumber.open(file) as one_pdf:
string = ''
for i in range(0, len(one_pdf.pages)):
string += one_pdf.pages[i].extract_text().replace('\n', '<end>') # Write each page of single PDF to empty string
# Keeps <end> as indicator for end of each line of string
inner = []
" General Information and Leave "
keywords = [
'Pay Period Ending : ', 'Net Pay $ : ', 'Pay Period # : ', 'Pay Date : ', 'Pay Plan : ', 'Pay Grade : ',
'Pay Step : ', 'Annual Salary $ : ', 'Hourly Rate $ : ', 'YTD Wages: ', 'Gross Pay YTD: ',
'Total Deductions YTD: ', 'Maximum Carry Over: ', 'Use Or Lose Balance: ', 'Annual Leave Begin Balance Current: ',
'Annual Leave Begin Balance Leave Year: ', 'Annual Leave Earned Current: ', 'Annual Leave Earned YTD: ',
'Annual Leave Used Current: ', 'Annual Leave Used YTD: ', 'Annual Leave Advanced: ',
'Annual Leave Ending Balance: ', 'Sick Leave Begin Balance Current: ', 'Sick Leave Begin Balance Leave Year: ',
'Sick Leave Earned Current: ', 'Sick Leave Earned YTD: ', 'Sick Leave Used Current: ', 'Sick Leave Used YTD: ',
'Sick Leave Advanced: ', 'Sick Leave Ending Balance: ', 'Time Off Award Begin Balance Current: ',
'Time Off Award Begin Balance Leave Year: ', 'Time Off Award Earned Current: ', 'Time Off Award Earned YTD: ',
'Time Off Award Used Current: ', 'Time Off Award Used YTD: ', 'Time Off Award Advanced: ',
'Time Off Award Ending Balance: '
]
for i in keywords:
if i in string: # If statement because certain keywords do not appear in each PDF
inner.append(string[(string.find(i) + len(i)):(string.find('<end>', string.find(i)))])
else:
inner.append('')
" Other Award "
if 'Other Award' in string:
other_award_substring = string[string.find('Other Award'):]
inner.append(other_award_substring[
other_award_substring.find('Adj Hours | ') + len('Adj Hours | '):other_award_substring.find(
' Current PPD')])
else:
inner.append(0)
" Home Address "
inner.append(string[(string.find('Home Address') + len('Home Address') + len('<end>')):
(string.find('Pay Check', string.find('Home Address'))) - len('<end>')])
" Deductions "
keywords2 = [
'Federal Taxes', 'State Tax 1 ( DC )', 'State Tax 1 ( VA )', 'State Tax 2 ( DC )', 'Health Benefits - Pretax',
'Dental/Vision', 'TSP Tax Deferred', 'Retirement - FERS/FRAE', 'OASDI Tax', 'Medicare Tax', 'FEGLI - Regular'
]
for i in keywords2:
deductions_substring = string[string.find(i):]
if i in string:
# Federal Taxes Adjusted
inner.append(string[(string.find(i) + len(i)):(string.find('Adjusted', string.find(i)))])
else:
inner.append(0)
if i in string:
# Federal Taxes Current
inner.append(deductions_substring[(deductions_substring.find('Misc | ') + len('Misc | ')):(
deductions_substring.find('Current', deductions_substring.find('Misc | ')))])
else:
inner.append(0)
if i in string:
# Federal Taxes YTD
inner.append(deductions_substring[(deductions_substring.find('Current PPD | ') + len('Current PPD | ')):(
deductions_substring.find(' YTD', deductions_substring.find('Current PPD | ')))])
else:
inner.append(0)
" Basic Info "
keywords3 = [
'Service Comp Date', 'Agency', 'Cumulative Retirement Agency $', 'Duty Station', 'Pay Begin Date',
'Financial Institution', 'TSP Tax Deferred Amt/%'
]
for i in keywords3:
inner.append(string[(string.find(i) + len(i)):(string.find(':', string.find(i)))])
" Benefits "
benefits_substring = string[string.find('Benefits Paid by Government'):]
keywords4 = [
'FEGLI', 'Medicare', 'OASDI', 'TSP Basic', 'TSP Matching', 'FERS/FRAE'
]
for i in keywords4:
# Benefits Current
inner.append(benefits_substring[
(benefits_substring.find(i) + len(i)):(
benefits_substring.find(' Current', benefits_substring.find(i)))])
# Benefits YTD
inner.append(benefits_substring[(benefits_substring.find('Current PPD | ') + len('Current PPD | ')):(
benefits_substring.find(' YTD', benefits_substring.find('Current PPD | ')))])
" Append all scraped values from inner list to outer list "
out.append(inner)
return out
pay_stubs = glob.glob('Projects/Scrape-Paystubs/Pay Stubs - FERC/*.pdf')
scraped_pay_data = scrape_pay_stubs(pay_stubs)
" Create DataFrame "
headings = ['Pay Period Ending', 'Net Pay', 'Pay Period', 'Pay Date', 'Pay Plan', 'Pay Grade', 'Pay Step', 'Annual Salary',
'Hourly Rate', 'YTD Wages', 'Gross Pay YTD', 'Total Deductions YTD', 'Maximum Carry Over', 'Use Or Lose Balance',
'Annual Leave Begin Balance', 'Annual Leave Begin Balance Leave Year', 'Annual Leave', 'Annual Leave Earned YTD',
'Annual Leave Used', 'Annual Leave Used YTD', 'Annual Leave Advanced', 'Annual Leave Total',
'Sick Leave Begin Balance', 'Sick Leave Begin Balance Leave Year', 'Sick Leave', 'Sick Leave Earned YTD',
'Sick Leave Used', 'Sick Leave Used YTD', 'Sick Leave Advanced', 'Sick Leave Total',
'Time Off Award Begin Balance', 'Time Off Award Begin Balance Leave Year', 'Time Off Award',
'Time Off Award Earned YTD', 'Time Off Award Used', 'Time Off Award Used YTD', 'Time Off Award Advanced',
'Time Off Award Total', 'Other Award', 'Home Address', 'Federal Taxes Adjusted', 'Federal Taxes', 'Federal Taxes YTD',
'State Tax 1 ( DC ) Adjusted', 'State Tax 1 ( DC )', 'State Tax 1 ( DC ) YTD', 'State Tax 1 ( VA ) Adjusted',
'State Tax 1 ( VA )', 'State Tax 1 ( VA ) YTD', 'State Tax 2 ( DC ) Adjusted', 'State Tax 2 ( DC )',
'State Tax 2 ( DC ) YTD', 'Health Benefits Adjusted', 'Health Benefits', 'Health Benefits YTD',
'Dental/Vision Adjusted', 'Dental/Vision', 'Dental/Vision YTD', 'TSP Tax Deferred Adjusted', 'TSP Tax Deferred',
'TSP Tax Deferred YTD', 'Retirement - FERS/FRAE Adjusted', 'Retirement - FERS/FRAE', 'Retirement - FERS/FRAE YTD',
'OASDI Tax Adjusted', 'OASDI Tax', 'OASDI Tax YTD', 'Medicare Tax Adjusted', 'Medicare Tax', 'Medicare Tax YTD',
'FEGLI - Regular Adjusted', 'FEGLI - Regular', 'FEGLI - Regular YTD', 'Service Comp Date', 'Agency',
'Cumulative Retirement Agency', 'Duty Station', 'Pay Begin Date', 'Financial Institution', 'TSP Tax Deferred Amt',
'FEGLI', 'FEGLI YTD', 'Medicare', 'Medicare YTD', 'OASDI', 'OASDI YTD', 'TSP Basic', 'TSP Basic YTD', 'TSP Matching',
'TSP Matching YTD', 'FERS/FRAE', 'FERS/FRAE YTD'
]
df_scraped_pay = pd.DataFrame(scraped_pay_data, columns=headings)
def convert_datatypes(df):
vars_to_datetime = [
'Pay Period Ending', 'Pay Date', 'Service Comp Date', 'Pay Begin Date'
]
df[vars_to_datetime] = df[vars_to_datetime].apply(pd.to_datetime)
vars_to_int = [
'Pay Period', 'Pay Grade', 'Pay Step', 'Maximum Carry Over', 'Use Or Lose Balance', 'Annual Leave Begin Balance',
'Annual Leave Begin Balance Leave Year', 'Annual Leave', 'Annual Leave Earned YTD', 'Annual Leave Used',
'Annual Leave Used YTD', 'Annual Leave Advanced', 'Annual Leave Total', 'Sick Leave Begin Balance',
'Sick Leave Begin Balance Leave Year', 'Sick Leave', 'Sick Leave Earned YTD', 'Sick Leave Used', 'Sick Leave Used YTD',
'Sick Leave Advanced', 'Sick Leave Total', 'Time Off Award Begin Balance', 'Time Off Award Begin Balance Leave Year',
'Time Off Award', 'Time Off Award Earned YTD', 'Time Off Award Used', 'Time Off Award Used YTD',
'Time Off Award Advanced',
'Time Off Award Total'
]
df[vars_to_int] = df[vars_to_int].apply(pd.to_numeric).fillna(0).astype(int)
vars_to_float = [
'Net Pay', 'Annual Salary', 'Hourly Rate', 'YTD Wages', 'Gross Pay YTD', 'Total Deductions YTD', 'Federal Taxes Adjusted',
'Other Award', 'Federal Taxes', 'Federal Taxes YTD', 'State Tax 1 ( DC ) Adjusted', 'State Tax 1 ( DC )',
'State Tax 1 ( DC ) YTD', 'State Tax 1 ( VA ) Adjusted', 'State Tax 1 ( VA )', 'State Tax 1 ( VA ) YTD',
'State Tax 2 ( DC ) Adjusted', 'State Tax 2 ( DC )', 'State Tax 2 ( DC ) YTD', 'Health Benefits Adjusted',
'Health Benefits', 'Health Benefits YTD', 'Dental/Vision Adjusted', 'Dental/Vision', 'Dental/Vision YTD',
'TSP Tax Deferred Adjusted', 'TSP Tax Deferred', 'TSP Tax Deferred YTD', 'Retirement - FERS/FRAE Adjusted',
'Retirement - FERS/FRAE', 'Retirement - FERS/FRAE YTD', 'OASDI Tax Adjusted', 'OASDI Tax', 'OASDI Tax YTD',
'Medicare Tax Adjusted', 'Medicare Tax', 'Medicare Tax YTD', 'FEGLI - Regular Adjusted', 'FEGLI - Regular',
'FEGLI - Regular YTD', 'Cumulative Retirement Agency', 'FEGLI', 'FEGLI YTD', 'Medicare', 'Medicare YTD', 'OASDI',
'OASDI YTD', 'TSP Basic', 'TSP Basic YTD', 'TSP Matching', 'TSP Matching YTD', 'FERS/FRAE', 'FERS/FRAE YTD'
]
df[vars_to_float] = df[vars_to_float].replace(',', '', regex=True).apply(pd.to_numeric, errors='coerce').fillna(0)
# Clean up remaining fields
df['TSP Tax Deferred Amt'] = df['TSP Tax Deferred Amt'].replace(' ', '', regex=True)
df['Home Address'] = df['Home Address'].replace('<end>', '', regex=True)
return df
df_scraped_pay = convert_datatypes(df_scraped_pay)
df_scraped_pay['Year'] = df_scraped_pay['Pay Period Ending'].dt.year
paystubs_master = df_scraped_pay[[
'Agency',
'Duty Station',
'Service Comp Date',
'Pay Period',
'Pay Date',
'Pay Begin Date',
'Pay Period Ending',
'Year',
'Pay Plan',
'Pay Grade',
'Pay Step',
'Annual Salary',
'Gross Pay YTD',
'Net Pay',
'Hourly Rate',
'YTD Wages',
'Total Deductions YTD',
'Financial Institution',
'Home Address',
'Maximum Carry Over',
'Use Or Lose Balance',
'Annual Leave Begin Balance',
'Annual Leave Begin Balance Leave Year',
'Annual Leave',
'Annual Leave Earned YTD',
'Annual Leave Used',
'Annual Leave Used YTD',
'Annual Leave Advanced',
'Annual Leave Total',
'Sick Leave Begin Balance',
'Sick Leave Begin Balance Leave Year',
'Sick Leave',
'Sick Leave Earned YTD',
'Sick Leave Used',
'Sick Leave Used YTD',
'Sick Leave Advanced',
'Sick Leave Total',
'Time Off Award Begin Balance',
'Time Off Award Begin Balance Leave Year',
'Time Off Award',
'Time Off Award Earned YTD',
'Time Off Award Used',
'Time Off Award Used YTD',
'Time Off Award Advanced',
'Time Off Award Total',
'Other Award',
'Federal Taxes Adjusted',
'Federal Taxes',
'Federal Taxes YTD',
'State Tax 1 ( DC ) Adjusted',
'State Tax 1 ( DC )',
'State Tax 1 ( DC ) YTD',
'State Tax 1 ( VA ) Adjusted',
'State Tax 1 ( VA )',
'State Tax 1 ( VA ) YTD',
'State Tax 2 ( DC ) Adjusted',
'State Tax 2 ( DC )',
'State Tax 2 ( DC ) YTD',
'Health Benefits Adjusted',
'Health Benefits',
'Health Benefits YTD',
'Dental/Vision Adjusted',
'Dental/Vision',
'Dental/Vision YTD',
'TSP Tax Deferred Adjusted',
'TSP Tax Deferred',
'TSP Tax Deferred YTD',
'Retirement - FERS/FRAE Adjusted',
'Retirement - FERS/FRAE',
'Retirement - FERS/FRAE YTD',
'OASDI Tax Adjusted',
'OASDI Tax',
'OASDI Tax YTD',
'Medicare Tax Adjusted',
'Medicare Tax',
'Medicare Tax YTD',
'FEGLI - Regular Adjusted',
'FEGLI - Regular',
'FEGLI - Regular YTD',
'Cumulative Retirement Agency',
'TSP Tax Deferred Amt',
'FEGLI',
'FEGLI YTD',
'Medicare',
'Medicare YTD',
'OASDI',
'OASDI YTD',
'TSP Basic',
'TSP Basic YTD',
'TSP Matching',
'TSP Matching YTD',
'FERS/FRAE',
'FERS/FRAE YTD'
]]
paystubs_master = paystubs_master.sort_values(by=['Pay Date']).copy().reset_index(drop=True)
# paystubs_master.to_csv('Projects/Scrape-Paystubs/Output/Earnings and Leave Statements Master.csv', index=False)
" Summarize Pay and Deductions "
# Components of gross pay
gross_pay = paystubs_master[[
'Pay Period Ending', 'Year', 'Net Pay', 'Federal Taxes', 'State Tax 1 ( DC )', 'State Tax 1 ( VA )', 'State Tax 2 ( DC )',
'Health Benefits', 'Dental/Vision', 'TSP Tax Deferred', 'Retirement - FERS/FRAE', 'OASDI Tax', 'Medicare Tax',
'FEGLI - Regular'
]].copy()
gross_pay_long = gross_pay.melt(
id_vars=['Pay Period Ending', 'Year'],
var_name='Description',
value_name='Amount'
)
gross_pay_long.loc[gross_pay_long['Description'] == 'Net Pay', 'Category'] = 'Net Pay'
gross_pay_long.loc[gross_pay_long['Description'] == 'Federal Taxes', 'Category'] = 'Taxes'
gross_pay_long.loc[gross_pay_long['Description'] == 'State Tax 1 ( DC )', 'Category'] = 'Taxes'
gross_pay_long.loc[gross_pay_long['Description'] == 'State Tax 1 ( VA )', 'Category'] = 'Taxes'
gross_pay_long.loc[gross_pay_long['Description'] == 'State Tax 2 ( DC )', 'Category'] = 'Taxes'
gross_pay_long.loc[gross_pay_long['Description'] == 'Health Benefits', 'Category'] = 'Health Insurance'
gross_pay_long.loc[gross_pay_long['Description'] == 'Dental/Vision', 'Category'] = 'Health Insurance'
gross_pay_long.loc[gross_pay_long['Description'] == 'TSP Tax Deferred', 'Category'] = 'Retirement'
gross_pay_long.loc[gross_pay_long['Description'] == 'Retirement - FERS/FRAE', 'Category'] = 'Retirement'
gross_pay_long.loc[gross_pay_long['Description'] == 'OASDI Tax', 'Category'] = 'Taxes'
gross_pay_long.loc[gross_pay_long['Description'] == 'Medicare Tax', 'Category'] = 'Taxes'
gross_pay_long.loc[gross_pay_long['Description'] == 'FEGLI - Regular', 'Category'] = 'Retirement'
pay_annual = gross_pay_long.groupby(['Year', 'Category'])['Amount'].sum().reset_index()
" Import Credit Card Statements "
def concat_chase_statements(folder):
for statement in folder:
concat_csv = pd.concat(pd.read_csv(statement) for statement in folder)
return concat_csv
chase_statements = glob.glob('Projects/Scrape-Paystubs/Expenses - Chase/*')
chase_concat = concat_chase_statements(chase_statements).reset_index(drop=True)
# Show unique type of transactions
pd.DataFrame(chase_concat["Type"].unique())
# Positive values for Sales and Adjustments
chase_concat['Amount'] = chase_concat['Amount'].abs()
# Negative values for Returns
chase_concat.loc[chase_concat.Type == 'Return', 'Amount'] = chase_concat['Amount'] * -1
# Drop (credit card) Payment transactions
chase_concat = chase_concat.loc[
(chase_concat['Type'] == 'Adjustment') |
(chase_concat['Type'] == 'Sale') |
(chase_concat['Type'] == 'Return')].copy()
chase_concat.rename(columns={'Category': 'Category Description'}, inplace=True)
chase_concat.rename(columns={'Description': 'Merchant'}, inplace=True)
chase_concat['Category'] = 'Expenses'
chase_concat['Transaction Date'] = chase_concat['Transaction Date'].apply(pd.to_datetime)
chase_concat['Year'] = pd.to_datetime(chase_concat['Transaction Date']).dt.year
chase_master = chase_concat[['Transaction Date', 'Year', 'Category', 'Category Description', 'Merchant', 'Amount']].copy()
chase_master.dtypes
def concat_amex_statements(folder):
for statement in folder:
concat_xlsx = pd.concat(pd.read_excel(statement, skiprows=6) for statement in folder)
return concat_xlsx
amex_statements = glob.glob('Projects/Scrape-Paystubs/Expenses - American Express/*')
amex_concat = concat_amex_statements(amex_statements)
amex_concat = amex_concat.loc[(amex_concat['Amount'] > 0)].copy().reset_index(drop=True)
amex_concat.rename(columns={'Date': 'Transaction Date'}, inplace=True)
amex_concat.rename(columns={'Description': 'Merchant'}, inplace=True)
amex_concat.rename(columns={'Category': 'Category Description'}, inplace=True)
amex_concat['Category'] = 'Expenses'
amex_concat['Year'] = pd.to_datetime(amex_concat['Transaction Date']).dt.year
amex_concat['Transaction Date'] = amex_concat['Transaction Date'].apply(pd.to_datetime)
amex_master = amex_concat[['Transaction Date', 'Year', 'Category', 'Category Description', 'Merchant', 'Amount']].copy()
amex_master.dtypes
" Concatenate Credit Card Charges "
charges_concat = pd.concat([chase_master, amex_master], ignore_index=True)
# Condense expense categories
pd.DataFrame(charges_concat['Category Description'].unique())
charges_concat.loc[charges_concat['Category Description'] == 'Business Services-Office Supplies', 'Category Description'] = 'Streaming'
charges_concat.loc[charges_concat['Category Description'] == 'Communications-Cable & Internet Comm', 'Category Description'] = 'Streaming'
charges_concat.loc[charges_concat['Category Description'] == 'Home', 'Category Description'] = 'Shopping'
charges_concat.loc[charges_concat['Category Description'] == 'Merchandise & Supplies-Groceries', 'Category Description'] = 'Groceries'
charges_concat.loc[charges_concat['Category Description'] == 'Merchandise & Supplies-Internet Purchase', 'Category Description'] = 'Shopping'
charges_concat.loc[charges_concat['Category Description'] == 'Personal', 'Category Description'] = 'Shopping'
charges_concat.loc[charges_concat['Category Description'] == 'Restaurant-Bar & Café', 'Category Description'] = 'Food & Drink'
charges_concat.loc[charges_concat['Category Description'] == 'Restaurant-Restaurant', 'Category Description'] = 'Food & Drink'
charges_concat.loc[charges_concat['Category Description'] == 'Transportation-Fuel', 'Category Description'] = 'Gas'
charges_concat.loc[charges_concat['Category Description'] == 'Transportation-Parking Charges', 'Category Description'] = 'Travel'
charges_concat.loc[charges_concat['Category Description'] == 'Transportation-Taxis & Coach', 'Category Description'] = 'Travel'
# Recategorize merchant categories
charges_concat.sort_values(by='Category Description').reset_index(drop=True)
charges_concat.loc[charges_concat['Merchant'].str.contains('SPOTIFY|Spotify'), 'Category Description'] = 'Streaming'
charges_concat.loc[charges_concat['Merchant'].str.contains(
'WA VEHICLE LICENSING|IDENTOGO-IDEMIA TSA PRECH'), 'Category Description'] = 'Travel'
charges_concat.loc[charges_concat['Merchant'].str.contains('Amazon|PAYPAL'), 'Category Description'] = 'Shopping'
charges_concat.loc[charges_concat['Merchant'].str.contains('BRIG|Brig|NIELSON WINES'), 'Category Description'] = 'Food & Drink'
charges_concat.loc[charges_concat['Merchant'].str.contains(
'ACADIA NATIONAL PARK|THE NC ARBORETUM'), 'Category Description'] = 'Entertainment'
charges_concat = charges_concat[~charges_concat['Category Description'].str.contains('Fees & Adjustments')]
charges_concat = charges_concat[charges_concat['Year'] > 2019]
charges_master = charges_concat.sort_values(by=['Year', 'Transaction Date', 'Category Description', 'Merchant']).copy()
# charges_master.to_csv('Projects/Scrape-Paystubs/Output/Expenses Master.csv', index=False)
# charges_master.loc[
# (charges_master['Category Description'] == 'Streaming') | (charges_master['Category Description'] == 'Entertainment')].to_csv(
# 'Projects/Scrape-Paystubs/Output/Expenses Master - Sample.csv', index=False)
" Analyze Credit Card Charges "
# To concatenate with pay
charges_annual = charges_master.groupby(['Year', 'Category'])['Amount'].sum().reset_index()
# To plot
charges_description_annual = charges_master.groupby(['Year', 'Category Description'])['Amount'].sum().reset_index()
# Transform data long to wide by Year
charges_description_wide = charges_description_annual.pivot(
index='Category Description',
columns='Year',
values='Amount'
).reset_index()
" Plot Annual Charges "
plt.style.use('seaborn-notebook') # Set style
fig, ax = plt.subplots(figsize=(13, 8)) # Set figure
x_axis = np.arange(len(charges_description_wide['Category Description']))
# Plot
plt.bar(x_axis - 0.2, charges_description_wide.iloc[:, 1], width=0.4, label='2020')
plt.bar(x_axis + 0.2, charges_description_wide.iloc[:, 2], width=0.4, label='2021')
# Titles
plt.title('Annual Expenses by Category',
fontweight='bold',
fontsize=20)
plt.ylabel('Amount ($)',
fontweight='bold',
size=12)
plt.xticks(x_axis, charges_description_wide['Category Description'])
plt.tight_layout()
plt.legend()
plt.show()
# plt.savefig('Projects/Scrape-Paystubs/Output/Annual Expenses by Category.png')
# plt.savefig('Projects/Scrape-Paystubs/Output/Annual Expenses by Category.pdf')
" Concatenate Annual Pay and Charges "
pay_charge_concat = pd.concat([pay_annual, charges_annual], ignore_index=True)
pay_charge_annual = pay_charge_concat[['Year', 'Category', 'Amount']].sort_values(by=['Year', 'Category'])
" Calculate Savings by Year "
# Transform data long to wide by Year
pay_charge_wide = pay_charge_annual.pivot(
index='Year',
columns='Category',
values='Amount'
).reset_index().fillna(0)
gross_pay = paystubs_master.iloc[:, [7, 12]].loc[paystubs_master['Pay Period'] == 26]
pay_charge_wide = pay_charge_wide.merge(gross_pay, how='left', left_on=['Year'], right_on=['Year'])
pay_charge_wide['Savings'] = pay_charge_wide['Gross Pay YTD'] - pay_charge_wide['Taxes'] - pay_charge_wide['Retirement'] \
- pay_charge_wide['Health Insurance'] - pay_charge_wide['Expenses']
print(tabulate(pay_charge_wide, headers='keys', tablefmt='plain'))