Web Scraping

Data Scraping using Beautiful Soup

  • Import Beautiful Soup

  • Make a GET request to fetch page data

  • Parse HTML

  • Filter the relevant parts (a condensed sketch of this pipeline follows the list)
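A condensed sketch of the whole pipeline, using the requests library that later sections of this page also use (the table class name is the one scraped in Step 2 below):

import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Bengal"
page = requests.get(url)                              # fetch page data
dsoup = BeautifulSoup(page.content, 'html.parser')    # parse HTML
tables = dsoup.findAll('table', {'class': 'sortable wikitable'})  # filter relevant parts
print(len(tables))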

Step 1: Fetch page data in HTML form

!pip install bs4
from urllib.request import urlopen

url = "https://en.wikipedia.org/wiki/Bengal"
data = urlopen(url)   # returns an http.client.HTTPResponse object
print(type(data))
dhtml = data.read()   # read the raw page bytes
print(dhtml)

Step 2: Filter Page Data

from bs4 import BeautifulSoup as soup
dsoup = soup(dhtml, 'html.parser')
print(type(dsoup))
dsoup.findAll('h1',{})
# findAll returns all matches as a list-like ResultSet
table = dsoup.findAll('table',{'class':'sortable wikitable'})
print(len(table))
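An aside on findAll vs find (a small illustration): find returns only the first matching tag, while findAll returns every match.

first_h1 = dsoup.find('h1')     # first match only, a single Tag (or None)
all_h1 = dsoup.findAll('h1')    # every match, as a list-like ResultSet
print(first_h1.text, len(all_h1))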

# printing the 1st table in case there are multiple tables with class name 'sortable wikitable';
# here there is only one table, so table[0] and dtable refer to the same table
dtable = table[0]
print(dtable)
heading = dtable.findAll('th')
#check html code and find headers
#th stands for table heading
print(len(heading))
#print first heading 
print(heading[0].text)
# print all the column titles; th.text ends with a newline, so [:-1] drops it
ctitles = [ct.text[:-1] for ct in heading]
print(ctitles)
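# why [:-1] works above (a small illustration): a header cell's .text
# typically ends with a newline, e.g. 'Rank\n', and [:-1] drops that last
# character; str.strip() is a more robust alternative
sample = heading[0].text
print(repr(sample))      # shows the trailing '\n'
print(sample.strip())    # same result as sample[:-1] here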
# tr stands for table rows
# here we are finding table rows
rowdata = dtable.findAll('tr')[1:]
print(rowdata)
print(len(rowdata))
# 'td' is to find all the table data
# here we are finding first row data-- rowdata[0]
firstrow = rowdata[0].findAll('td',{})
for d in firstrow:
    print(d.text)
# finding all data rows
r1 = []
for rows_data in rowdata:
    r2 = []
    row_data = rows_data.findAll("td",{})
    for idx,row in enumerate(row_data):
        if idx == 2:
            # strip the first and last characters (stray newlines) from column 2
            r2.append(row.text[1:-1])
        elif idx > 2:
            # strip only the trailing character from the remaining columns
            r2.append(row.text[:-1])
        else:
            # keep columns 0 and 1 as-is
            r2.append(row.text)
    
    r1.append(r2)
print(r1)

Understanding how idx, row in enumerate(row_data) works:

abc = ['I', 'love', 'python']
for i,n in enumerate(abc):
    print(i)
    print(n)

0
I
1
love
2
python
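As an aside (not part of the walkthrough above), pandas can parse the same wikitable in a single call, assuming pandas and its lxml dependency are installed; a minimal sketch:

import pandas as pd

# read_html returns one DataFrame per matching <table> on the page
tables = pd.read_html("https://en.wikipedia.org/wiki/Bengal",
                      attrs={"class": "sortable wikitable"})
print(tables[0].head())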

Copying the Data to CSV file:

# importing the csv module 
import csv 

# field names 
fields = ['Rank', 'City', 'Country', 'Population', 'Images'] 

filename = "C:/Users/Shalki/Desktop/python/Book1.csv"

# writing to csv file; newline='' prevents blank lines between rows on Windows
with open(filename, 'w', newline='') as csvfile:
	# creating a csv writer object 
	csvwriter = csv.writer(csvfile) 
	
	# writing the fields 
	csvwriter.writerow(fields) 
	
	# writing the data rows 
	csvwriter.writerows(r1)
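A quick sanity check (a minimal sketch): read the file back with csv.reader and print the header row plus the first data row.

import csv

with open(filename, 'r', newline='') as csvfile:
    rows = list(csv.reader(csvfile))
print(rows[0])   # the field names written above
print(rows[1])   # the first data row from r1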

Downloading files from web using Python

# import the requests library
import requests

# note: requests drops the '#/media/...' fragment, so this URL actually returns
# the HTML of https://en.wikipedia.org/wiki/Image rather than the PNG itself;
# use the direct file URL (on upload.wikimedia.org) to download the real image
image_url = "https://en.wikipedia.org/wiki/Image#/media/File:Image_created_with_a_mobile_phone.png"
r = requests.get(image_url)
with open("shreyansh.png",'wb') as f:
    f.write(r.content)

Note: the saved file appears in the working directory (for example, in the Jupyter file browser).
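A defensive variant (a sketch): check the Content-Type header before writing, which also exposes the fragment problem noted in the comment above.

import requests

r = requests.get(image_url)
ctype = r.headers.get("Content-Type", "")
print(ctype)   # 'text/html; ...' here, because the '#/media/...' fragment is ignored
if ctype.startswith("image/"):
    with open("shreyansh.png", "wb") as f:
        f.write(r.content)
else:
    print("not an image; use the direct file URL on upload.wikimedia.org instead")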

Download large files:

import requests

file_url = "http://codex.cs.yale.edu/avi/db-book/db4/slide-dir/ch1-2.pdf"
r = requests.get(file_url, stream = True)
with open("python.pdf","wb") as pdf:
    # writing one chunk at a time to the pdf file
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            pdf.write(chunk)
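A slightly hardened variant (a sketch): fail fast on HTTP errors with raise_for_status() and report progress using the Content-Length header when the server sends one.

import requests

r = requests.get(file_url, stream=True)
r.raise_for_status()                         # raise an exception on 4xx/5xx responses
total = int(r.headers.get("Content-Length", 0))
done = 0
with open("python.pdf", "wb") as pdf:
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            pdf.write(chunk)
            done += len(chunk)
if total:
    print("downloaded %d of %d bytes" % (done, total))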

Downloading Videos:

import requests 
from bs4 import BeautifulSoup 
  
archive_url = "http://www-personal.umich.edu/~csev/books/py4inf/media/"
  
def get_video_links(): 
      
    # create response object 
    r = requests.get(archive_url) 
      
    # create beautiful-soup object (the html5lib parser needs: pip install html5lib) 
    soup = BeautifulSoup(r.content,'html5lib') 
      
    # find all links on web-page 
    links = soup.findAll('a') 
  
    # filter for links whose href ends with .mp4 
    video_links = [archive_url + link['href'] for link in links if link.get('href', '').endswith('mp4')] 
  
    return video_links 
  
  
def download_video_series(video_links): 
    print(video_links)
    for link in video_links: 
        file_name = link.split('/')[-1]    
        print ("Downloading file:%s"%file_name) 
          
        # create response object 
        r = requests.get(link, stream = True) 
          
        # download started 
        with open(file_name, 'wb') as f: 
            for chunk in r.iter_content(chunk_size = 1024*1024): 
                if chunk: 
                    f.write(chunk) 

        # print once per file, after the whole download finishes
        print ("%s downloaded!\n"%file_name) 
  
    print ("All videos downloaded!")
    return
  
  
if __name__ == "__main__": 
  
    # getting all video links 
    video_links = get_video_links() 
  
    # download all videos 
    download_video_series(video_links)
