Day10.py
import requests
from bs4 import BeautifulSoup
import re
import csv
from itertools import zip_longest


def scrape_website(url):
    # Send a GET request to the URL and fail early on HTTP errors
    response = requests.get(url)
    response.raise_for_status()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the h1 tags and extract the text
    h1_tags = soup.find_all('h1')
    h1_texts = [tag.get_text(strip=True) for tag in h1_tags]

    # Find all the links and extract the href attribute
    links = soup.find_all('a')
    link_hrefs = [link.get('href') for link in links]

    # Find all the images and extract the src attribute
    images = soup.find_all('img')
    image_srcs = [image.get('src') for image in images]

    # Extract all the email addresses using a regular expression
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = re.findall(email_pattern, response.text)

    # Save the extracted data to a CSV file; the four lists usually have
    # different lengths, so pad the shorter ones with empty strings instead
    # of indexing them all by the number of h1 tags
    with open('website_data.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['H1 Tags', 'Links', 'Images', 'Emails'])
        for row in zip_longest(h1_texts, link_hrefs, image_srcs, emails,
                               fillvalue=''):
            writer.writerow(row)


# Example usage
url = 'https://www.example.com'
scrape_website(url)