Web Scraping

Data Scraping using Beautiful Soup

  • Import Beautiful Soup

  • Make a GET request to fetch page data

  • Parse HTML

  • Filter the relevant parts (a condensed sketch of this pipeline follows the list)
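A condensed sketch of the whole pipeline, using the requests library that later sections of this page also use (the table class name is the one scraped in Step 2 below):

import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Bengal"
page = requests.get(url)                              # fetch page data
dsoup = BeautifulSoup(page.content, 'html.parser')    # parse HTML
tables = dsoup.findAll('table', {'class': 'sortable wikitable'})  # filter relevant parts
print(len(tables))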

Step 1: Fetch page data in HTML form

!pip install bs4
from urllib.request import urlopen

url = "https://en.wikipedia.org/wiki/Bengal"
data = urlopen(url)   # returns an http.client.HTTPResponse object
print(type(data))
dhtml = data.read()   # read the raw page bytes
print(dhtml)

Step 2: Filter Page Data

from bs4 import BeautifulSoup as soup
dsoup = soup(dhtml, 'html.parser')
print(type(dsoup))
dsoup.findAll('h1',{})
# findAll returns all matches as a list-like ResultSet
table = dsoup.findAll('table',{'class':'sortable wikitable'})
print(len(table))
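An aside on findAll vs find (a small illustration): find returns only the first matching tag, while findAll returns every match.

first_h1 = dsoup.find('h1')     # first match only, a single Tag (or None)
all_h1 = dsoup.findAll('h1')    # every match, as a list-like ResultSet
print(first_h1.text, len(all_h1))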

# printing the 1st table in case there are multiple tables with class name 'sortable wikitable';
# here there is only one table, so table[0] and dtable refer to the same table
dtable = table[0]
print(dtable)
heading = dtable.findAll('th')
#check html code and find headers
#th stands for table heading
print(len(heading))
#print first heading 
print(heading[0].text)
# print all the column titles; th.text ends with a newline, so [:-1] drops it
ctitles = [ct.text[:-1] for ct in heading]
print(ctitles)
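# why [:-1] works above (a small illustration): a header cell's .text
# typically ends with a newline, e.g. 'Rank\n', and [:-1] drops that last
# character; str.strip() is a more robust alternative
sample = heading[0].text
print(repr(sample))      # shows the trailing '\n'
print(sample.strip())    # same result as sample[:-1] here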
# tr stands for table rows
# here we are finding table rows
rowdata = dtable.findAll('tr')[1:]
print(rowdata)
print(len(rowdata))
# 'td' is to find all the table data
# here we are finding first row data-- rowdata[0]
firstrow = rowdata[0].findAll('td',{})
for d in firstrow:
    print(d.text)
# finding all data rows
r1 = []
for rows_data in rowdata:
    r2 = []
    row_data = rows_data.findAll("td",{})
    for idx,row in enumerate(row_data):
        if idx == 2:
            # strip the first and last characters (stray newlines) from column 2
            r2.append(row.text[1:-1])
        elif idx > 2:
            # strip only the trailing character from the remaining columns
            r2.append(row.text[:-1])
        else:
            # keep columns 0 and 1 as-is
            r2.append(row.text)
    
    r1.append(r2)
print(r1)

Understanding how idx, row in enumerate(row_data) works:

abc = ['I', 'love', 'python']
for i,n in enumerate(abc):
    print(i)
    print(n)

0
I
1
love
2
python
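As an aside (not part of the walkthrough above), pandas can parse the same wikitable in a single call, assuming pandas and its lxml dependency are installed; a minimal sketch:

import pandas as pd

# read_html returns one DataFrame per matching <table> on the page
tables = pd.read_html("https://en.wikipedia.org/wiki/Bengal",
                      attrs={"class": "sortable wikitable"})
print(tables[0].head())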

Copying the Data to CSV file:

# importing the csv module 
import csv 

# field names 
fields = ['Rank', 'City', 'Country', 'Population', 'Images'] 

filename = "C:/Users/Shalki/Desktop/python/Book1.csv"

# writing to csv file; newline='' prevents blank lines between rows on Windows
with open(filename, 'w', newline='') as csvfile:
	# creating a csv writer object 
	csvwriter = csv.writer(csvfile) 
	
	# writing the fields 
	csvwriter.writerow(fields) 
	
	# writing the data rows 
	csvwriter.writerows(r1)
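A quick sanity check (a minimal sketch): read the file back with csv.reader and print the header row plus the first data row.

import csv

with open(filename, 'r', newline='') as csvfile:
    rows = list(csv.reader(csvfile))
print(rows[0])   # the field names written above
print(rows[1])   # the first data row from r1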

Downloading files from web using Python

# import the requests library
import requests

# note: requests drops the '#/media/...' fragment, so this URL actually returns
# the HTML of https://en.wikipedia.org/wiki/Image rather than the PNG itself;
# use the direct file URL (on upload.wikimedia.org) to download the real image
image_url = "https://en.wikipedia.org/wiki/Image#/media/File:Image_created_with_a_mobile_phone.png"
r = requests.get(image_url)
with open("shreyansh.png",'wb') as f:
    f.write(r.content)

Note: the saved file appears in the working directory (for example, in the Jupyter file browser).
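A defensive variant (a sketch): check the Content-Type header before writing, which also exposes the fragment problem noted in the comment above.

import requests

r = requests.get(image_url)
ctype = r.headers.get("Content-Type", "")
print(ctype)   # 'text/html; ...' here, because the '#/media/...' fragment is ignored
if ctype.startswith("image/"):
    with open("shreyansh.png", "wb") as f:
        f.write(r.content)
else:
    print("not an image; use the direct file URL on upload.wikimedia.org instead")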

Download large files:

import requests

file_url = "http://codex.cs.yale.edu/avi/db-book/db4/slide-dir/ch1-2.pdf"
r = requests.get(file_url, stream = True)
with open("python.pdf","wb") as pdf:
    # writing one chunk at a time to the pdf file
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            pdf.write(chunk)
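A slightly hardened variant (a sketch): fail fast on HTTP errors with raise_for_status() and report progress using the Content-Length header when the server sends one.

import requests

r = requests.get(file_url, stream=True)
r.raise_for_status()                         # raise an exception on 4xx/5xx responses
total = int(r.headers.get("Content-Length", 0))
done = 0
with open("python.pdf", "wb") as pdf:
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            pdf.write(chunk)
            done += len(chunk)
if total:
    print("downloaded %d of %d bytes" % (done, total))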

Downloading Videos:

import requests 
from bs4 import BeautifulSoup 
  
archive_url = "http://www-personal.umich.edu/~csev/books/py4inf/media/"
  
def get_video_links(): 
      
    # create response object 
    r = requests.get(archive_url) 
      
    # create beautiful-soup object (the html5lib parser needs: pip install html5lib) 
    soup = BeautifulSoup(r.content,'html5lib') 
      
    # find all links on web-page 
    links = soup.findAll('a') 
  
    # filter for links whose href ends with .mp4 
    video_links = [archive_url + link['href'] for link in links if link.get('href', '').endswith('mp4')] 
  
    return video_links 
  
  
def download_video_series(video_links): 
    print(video_links)
    for link in video_links: 
        file_name = link.split('/')[-1]    
        print ("Downloading file:%s"%file_name) 
          
        # create response object 
        r = requests.get(link, stream = True) 
          
        # download started 
        with open(file_name, 'wb') as f: 
            for chunk in r.iter_content(chunk_size = 1024*1024): 
                if chunk: 
                    f.write(chunk) 

        # print once per file, after the whole download finishes
        print ("%s downloaded!\n"%file_name) 
  
    print ("All videos downloaded!")
    return
  
  
if __name__ == "__main__": 
  
    # getting all video links 
    video_links = get_video_links() 
  
    # download all videos 
    download_video_series(video_links)
