IMDB Data Scraping
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Download the IMDb list page and parse it. `soup` is reused by the
# link-collection step further down the script.
# The `with` block closes the HTTP response as soon as its body has been read;
# the original left the connection open.
with urlopen("http://www.imdb.com/list/ls055592025/") as response:
    soup = BeautifulSoup(response.read(), "html.parser")

# Print the text of every <a> found inside an <h3> of each
# 'lister-item-content' container, one per line.
for div in soup.find_all('div', {'class': 'lister-item-content'}):
    for heading in div.find_all('h3'):
        for anchor in heading.find_all('a'):
            print(anchor.text)
# Build two parallel lists from the parsed list page: `titles[i]` is the
# anchor text and `links[i]` the absolute URL built from the same anchor.
links = []
titles = []
for div in soup.find_all('div', {'class': 'lister-item-content'}):
    for heading in div.find_all('h3'):
        for anchor in heading.find_all('a'):
            titles.append(anchor.text)
            # The href is appended verbatim to the site root.
            links.append('http://www.imdb.com' + anchor['href'])

# Quick sanity output: how many entries were collected and what the first
# one looks like. Guarded so an empty result does not raise IndexError.
print(len(links))
if links:
    print(links[0])
# Fetch the synopsis page for every collected link.
# `synopses` is kept index-aligned with `links`: exactly one entry is appended
# per link, using '' when the page has no element with id
# 'plot-synopsis-content'. (The original appended nothing in that case, which
# shifted every later synopsis onto the wrong title.)
synopses = []
for link in links:
    # NOTE(review): the suffix is appended verbatim to the link; if the href
    # collected earlier carries a query string, the suffix ends up inside it.
    # Confirm against the live markup.
    with urlopen(str(link) + "synopsis?ref_=tt_stry_pl") as response:
        # Read inside the `with` so the connection is closed right after.
        soup = BeautifulSoup(response.read(), "html.parser")
    synopsis = soup.find('ul', {'id': 'plot-synopsis-content'})
    synopses.append(synopsis.text if synopsis is not None else '')
import json
# Persist each scraped list as a JSON array in its own text file
# (written to the current working directory, overwriting any existing file).
for filename, payload in (
    ('titles.txt', titles),
    ('links.txt', links),
    ('synopses.txt', synopses),
):
    with open(filename, 'w') as filehandle:
        json.dump(payload, filehandle)
Last updated