Web Scraping
Data Scraping using Beautiful Soup
Import Beautiful Soup
Make a GET request to fetch page data
Parse HTML
Filter relevant parts
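Putting these four steps together, a minimal sketch (using the same Wikipedia page that the rest of this section scrapes) looks like this:
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "https://en.wikipedia.org/wiki/Bengal"
html = urlopen(url).read()                  # GET request: fetch the raw page bytes
page = BeautifulSoup(html, 'html.parser')   # parse the HTML
print(page.find('h1').text)                 # filter the relevant part (here: the page title)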
Step 1: Fetch page data in HTML form
!pip install bs4
from urllib.request import urlopen
url = "https://en.wikipedia.org/wiki/Bengal"
data = urlopen(url)
print(type(data))
dhtml = data.read()
print(dhtml)
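The same page can also be fetched with the requests library (used later in this section); this equivalent sketch is only for comparison:
import requests
resp = requests.get(url)        # same URL as above
print(resp.status_code)         # 200 means the request succeeded
dhtml_alt = resp.content        # raw page bytes, equivalent to data.read()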
Step 2: Filter Page Data
from bs4 import BeautifulSoup as soup
dsoup = soup(dhtml, 'html.parser')
print(type(dsoup))
dsoup.findAll('h1',{})
#findAll returns data as a list
table = dsoup.findAll('table',{'class':'sortable wikitable'})
print(len(table))
# printing the 1st table in case there are multiple tables with class name 'sortable wikitable'
# but here we have only 1 table, so table[0] and dtable refer to the same table
dtable = table[0]
print(dtable)
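As a side note, find() returns only the first match, which is a common shorthand for findAll(...)[0]; a small sketch:
dtable_alt = dsoup.find('table', {'class': 'sortable wikitable'})
# same table as table[0] above, without indexing into a list
print(dtable_alt.findAll('th')[0].text)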
heading = dtable.findAll('th')
# check the HTML source of the page to find the headers
# 'th' stands for table header
print(len(heading))
#print first heading
print(heading[0].text)
# print all the column titles; [:-1] drops the trailing newline from each
ctitles = [ct.text[:-1] for ct in heading]
print(ctitles)
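Slicing with [:-1] only drops the trailing newline from each header; get_text(strip=True) removes all surrounding whitespace and is a slightly more robust alternative (a sketch, not what the rest of these notes use):
ctitles_clean = [ct.get_text(strip=True) for ct in heading]
print(ctitles_clean)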
# 'tr' stands for table row
# [1:] skips the header row, keeping only the data rows
rowdata = dtable.findAll('tr')[1:]
print(rowdata)
print(len(rowdata))
# 'td' stands for table data (the cells of a row)
# here we are taking the first row of data -- rowdata[0]
firstrow = rowdata[0].findAll('td', {})
for d in firstrow:
    print(d.text)
# finding all data rows
r1 = []
for rows_data in rowdata:
    r2 = []
    row_data = rows_data.findAll("td", {})
    for idx, row in enumerate(row_data):
        if idx == 2:
            # third cell: drop the first character and the trailing newline
            r2.append(row.text[1:-1])
        elif idx > 2:
            # later cells: drop only the trailing newline
            r2.append(row.text[:-1])
        else:
            # first two cells: keep the text as-is
            r2.append(row.text)
    r1.append(r2)
print(r1)
Understanding how idx, row in enumerate(row_data) works:
abc = ['I', 'love', 'python']
for i, n in enumerate(abc):
    print(i)
    print(n)
0
I
1
love
2
python
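For comparison only: pandas can parse every table on the page directly with read_html (a hedged sketch; it needs pandas plus a parser such as lxml or html5lib installed, and is not used elsewhere in these notes):
import pandas as pd
tables = pd.read_html(url)      # url from Step 1
print(len(tables))              # number of tables pandas found on the page
print(tables[0].head())         # first few rows of the first parsed table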
Copying the data to a CSV file:
# importing the csv module
import csv
# field names
fields = ['Rank', 'City', 'Country', 'Population', 'Images']
filename = "C:/Users/Shalki/Desktop/python/Book1.csv"
# writing to csv file (newline='' avoids blank lines between rows on Windows)
with open(filename, 'w', newline='') as csvfile:
    # creating a csv writer object
    csvwriter = csv.writer(csvfile)
    # writing the fields
    csvwriter.writerow(fields)
    # writing the data rows
    csvwriter.writerows(r1)
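To confirm the file was written correctly, it can be read back with csv.reader (a quick check using the same filename as above):
with open(filename, 'r', newline='') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        print(row)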
Downloading files from the web using Python
# import the requests library
import requests
# note: for a real image download the URL should point directly to the image file
# (e.g. an upload.wikimedia.org link); the wiki page URL below returns HTML, not the PNG
image_url = "https://en.wikipedia.org/wiki/Image#/media/File:Image_created_with_a_mobile_phone.png"
r = requests.get(image_url)
with open("shreyansh.png", 'wb') as f:
    f.write(r.content)
Note: the downloaded image is saved in the Jupyter notebook's working directory (the local host) and can be opened from there.
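In practice it helps to check that the request succeeded and that the response really is an image before saving it; a hedged sketch (the Content-Type check is an assumption about the server's response):
r = requests.get(image_url)
print(r.status_code)                       # 200 means success
print(r.headers.get('Content-Type'))       # should start with 'image/' for a direct image link
if r.status_code == 200 and r.headers.get('Content-Type', '').startswith('image/'):
    with open("shreyansh.png", 'wb') as f:
        f.write(r.content)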
Downloading large files:
import requests
file_url = "http://codex.cs.yale.edu/avi/db-book/db4/slide-dir/ch1-2.pdf"
r = requests.get(file_url, stream=True)
with open("python.pdf", "wb") as pdf:
    for chunk in r.iter_content(chunk_size=1024):
        # writing one chunk at a time to the pdf file
        if chunk:
            pdf.write(chunk)
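When the server sends a Content-Length header, it tells how many bytes to expect, which allows a rough progress report while streaming; a sketch using the same file_url:
r = requests.get(file_url, stream=True)
total = int(r.headers.get('Content-Length', 0))   # 0 if the server does not report a size
downloaded = 0
with open("python.pdf", "wb") as pdf:
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            pdf.write(chunk)
            downloaded += len(chunk)
if total:
    print("Downloaded %d of %d bytes" % (downloaded, total))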
Downloading Videos:
import requests
from bs4 import BeautifulSoup
archive_url = "http://www-personal.umich.edu/~csev/books/py4inf/media/"
def get_video_links():
    # create response object
    r = requests.get(archive_url)
    # create Beautiful Soup object (html5lib parser; install with: pip install html5lib)
    soup = BeautifulSoup(r.content, 'html5lib')
    # find all links on the web page
    links = soup.findAll('a')
    # filter the links ending with .mp4
    video_links = [archive_url + link['href'] for link in links if link['href'].endswith('mp4')]
    return video_links
def download_video_series(video_links):
    print(video_links)
    for link in video_links:
        file_name = link.split('/')[-1]
        print("Downloading file:%s" % file_name)
        # create response object
        r = requests.get(link, stream=True)
        # download started
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)
        print("%s downloaded!\n" % file_name)
    print("All videos downloaded!")
    return
if __name__ == "__main__":
    # getting all video links
    video_links = get_video_links()
    # download all videos
    download_video_series(video_links)
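One fragile spot in get_video_links() is that link['href'] raises a KeyError for anchor tags without an href attribute; a safer version of that filter (same result otherwise) is sketched here:
video_links = [archive_url + link.get('href', '')
               for link in links
               if link.get('href', '').endswith('mp4')]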