This is what I have written so far to analyze reviews from IMDb.
First it fetches the top 250 movies page from the IMDb website.
Then it collects the movie links and review links, extracts the text of the reviews, and stores it in a dictionary in the form movie_name: movie reviews.
In the last step I am able to print Movie_Name: Movie review on the console, but when I write to a CSV file it either raises errors or writes incorrect data to the file.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import csv
import requests
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}

url = input('Enter - ')
while True:
    try:
        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.content, "html.parser")
        container = soup.find_all('td', class_='titleColumn')
        break
    except:
        print("Please enter a valid url:")
        url = input('Enter - ')

def movies_list():
    movie_names = []
    movies = container[:100]  # here we take the top 100 movies we want
    for movie in movies:
        name = movie.find('a').text
        movie_names.append(name)
    return movie_names
    #print(movie_names)

def movie_links_list():
    movie_links = []
    movies = container[:100]
    for movie in movies:
        tag = movie.find('a')
        link = tag.get('href', None)
        movie_links.append(link)
    for i in range(len(movie_links)):
        movie_links[i] = 'https://www.imdb.com/' + movie_links[i]
    return movie_links

def review_link_list(movie_links):
    review_links = []
    for movie_link in movie_links:
        title_pos = movie_link.find('title')
        nxt_slash = movie_link.find('/', title_pos)
        nxt2_slash = movie_link.find('/', nxt_slash + 1)
        review_link = movie_link[:title_pos-1] + movie_link[title_pos:nxt2_slash+1] + "reviews?ref_=tt_urv"
        review_links.append(review_link)
    return review_links

def get_reviews(review_links):
    movie_names = movies_list()
    review_dict = {}
    for i in range(len(review_links)):
        movie_name = movie_names[i]
        movie_reviews = []
        review_page = requests.get(review_links[i], headers=headers)
        soup = BeautifulSoup(review_page.content, "html.parser")
        tag = soup.find_all('div', class_='content')  # find_all to return a list
        top_50 = tag[:50]
        for j in top_50:
            try:
                review = j.select('div.show-more__control')[0].text
            except:
                continue
            movie_reviews.append(review)
        review_dict[movie_name] = movie_reviews
    return review_dict

file = "abc.csv"
with open(file, 'w') as csvfile:
    for i in range(len(movies)):
        csvwriter = csv.writer(csvfile)
        Name = movies[i]
        Review = reviews_dict[Name]
        try:
            csvwriter.writerow(Review)
        except:
            csvwriter.writerow("Review does not exist")
You need to open the file and write a list with the data:
import csv

my_dict = {"mykey": 10}

with open("mydata.csv", 'a') as file:
    writer = csv.writer(file)
    for key, value in my_dict.items():
        data = [key, value]
        writer.writerow(data)
In the CSV file "mydata.csv" you will now get:
mykey,10
Opening the file with 'a' as the mode argument appends data to the file, so the old data is not overwritten.
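Applied to the review scraper above, a minimal sketch could look like this; it assumes the helper functions from the question are already defined and simply calls them first, and reviews.csv is only an example file name:

import csv

movie_links = movie_links_list()
review_links = review_link_list(movie_links)
review_dict = get_reviews(review_links)

with open("reviews.csv", 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["movie_name", "review"])                  # header row
    for movie_name, reviews in review_dict.items():
        if not reviews:
            csvwriter.writerow([movie_name, "Review does not exist"])
            continue
        for review in reviews:
            csvwriter.writerow([movie_name, review])              # one row per review

Because each writerow() call gets a list, every value lands in its own column instead of being split character by character.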
I want to extract Name & Position, Education, Contact number and Email into separate columns of a CSV, but when I extract it I get either one column per character or one column per paragraph (if I wrap the text in a list). Here is the code:
import requests
from bs4 import BeautifulSoup
from csv import writer

url = 'https://governors.pwcs.edu/about_us/staff_bios_and_contact_information'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'lxml')

page = soup.find_all('p')
for i in page:
    i = i.text
    with open('page.csv', 'a', encoding='utf8', newline='') as f:
        thewriter = writer(f)
        thewriter.writerow(i)
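As an aside, the one-cell-per-character symptom comes from csv.writer.writerow(), which iterates over whatever it is given: a plain string is iterated character by character, so each letter gets its own cell. A quick illustration (demo.csv is just a throwaway file name):

from csv import writer

with open('demo.csv', 'w', newline='') as f:
    w = writer(f)
    w.writerow('abc')             # written as a,b,c   -> one cell per character
    w.writerow(['abc', 'def'])    # written as abc,def -> one cell per list item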
You can use regex to pull out what you need:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

url = 'https://governors.pwcs.edu/about_us/staff_bios_and_contact_information'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')

content = soup.find('div', {'id': 'divContent'})
p_list = content.find_all('p')

rows = []
for p in p_list:
    string = p.text
    text = re.search(r'(^.*) (Education: )(.*)( Contact).*(\d{3}-\d{3}-\d{4})\s*([a-zA-Z1-9].*@[\w].*\.[\w].*)', string).groups()
    name = text[0]
    edu = text[2]
    phone = text[4]
    email = text[5]

    row = {
        'name': name,
        'education': edu,
        'phone': phone,
        'email': email}

    rows.append(row)

df = pd.DataFrame(rows)
df.to_csv('page.csv', index=False)
I have two different Python files. Each of them scrapes some data from websites and writes the data to a docx file. However, I want to combine them and write all the data to one single docx file. For example:
Headline from file1
Data from file1
Data from file1
Headline from file2
Data from file2
Data from file2
They should be in the same document, but I really couldn't figure it out. I'm a total beginner, so can you show me how to do that? I would really appreciate it. Thanks in advance for all of your help.
Here is the code for both files:
Code 1:
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt
import requests

user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.37"

url = "https://www.brookings.edu/events/"
data = requests.get(url, headers={"User-Agent": user_agent})
soup = BeautifulSoup(data.text, "lxml")

document = Document()
heading = document.add_heading().add_run("Brookings Institute")
heading.font.name = "Cambria"
heading.font.size = Pt(14)

events = soup.find_all("article", class_="archive-view archive-event event-standard-view past")
for event in events:
    event_name = event.find("h4", class_="title")
    link = event.find("a", class_="event-content")
    try:
        print(event_name.text)
        document.add_paragraph(event_name.text, style='List Bullet')
        print(link['href'])
        document.add_paragraph(link['href'])
    except:
        continue

document.save('demo.docx')
Code 2:
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt
import requests

url = 'https://www.newamerica.org/api/event/?time_period=past&page_size=12&page=1&story_image_rendition=small'
r = requests.get(url)
data = r.json()

document = Document()
heading = document.add_heading().add_run("New America")
heading.font.name = "Cambria"
heading.font.size = Pt(14)

for i in data['results']:
    title = i['title']
    link = i['url']
    try:
        print(f'Title: {title}\nURL: {link}\n\n')
        document.add_paragraph(title, style='List Bullet')
        document.add_paragraph(link, style='List Bullet')
    except:
        continue

document.save('demo.docx')
Frankly, it could be simpler to put all the code in one file and modify it.
If you have to keep it in two files, then you should reorganize it so the code lives in functions.
script1.py
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt

def get_items():
    items = []

    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.37"
    url = "https://www.brookings.edu/events/"
    data = requests.get(url, headers={"User-Agent": user_agent})
    soup = BeautifulSoup(data.text, "lxml")

    events = soup.find_all("article", class_="archive-view archive-event event-standard-view past")
    for event in events:
        try:
            name = event.find("h4", class_="title").text
            link = event.find("a", class_="event-content")['href']
            items.append([name, link])
            print('name:', name)
            print('link:', link)
            print('---')
        except Exception as ex:
            print('Exception:', ex)

    return items

def add_items(document, items):
    heading = document.add_heading().add_run("Brookings Institute")
    heading.font.name = "Cambria"
    heading.font.size = Pt(14)

    for text, link in items:
        document.add_paragraph(text, style='List Bullet')
        document.add_paragraph(link)

def main():
    document = Document()
    items = get_items()
    add_items(document, items)
    document.save('demo.docx')

if __name__ == '__main__':
    main()
script2.py
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt

def get_items():
    items = []

    url = 'https://www.newamerica.org/api/event/?time_period=past&page_size=12&page=1&story_image_rendition=small'
    r = requests.get(url)
    data = r.json()

    for item in data['results']:
        try:
            title = item['title']
            link = item['url']
            items.append([title, link])
            print('title:', title)
            print('link:', link)
            print('---')
        except Exception as ex:
            print('Exception:', ex)

    return items

def add_items(document, items):
    heading = document.add_heading().add_run("New America")
    heading.font.name = "Cambria"
    heading.font.size = Pt(14)

    for text, link in items:
        document.add_paragraph(text, style='List Bullet')
        document.add_paragraph(link, style='List Bullet')

def main():
    document = Document()
    items = get_items()
    add_items(document, items)
    document.save('demo.docx')

if __name__ == '__main__':
    main()
And you can still run each script as a separate program.
But you can also import them into another file so that you run only selected functions and write everything in one place:
from docx import Document

import script1
import script2

def main():
    document = Document()

    items = script1.get_items()
    script1.add_items(document, items)

    items = script2.get_items()
    script2.add_items(document, items)

    document.save('demo.docx')

if __name__ == '__main__':
    main()
Because both scripts use the same function names, I can later use a for-loop:
from docx import Document

import script1
import script2

def main():
    document = Document()

    for s in [script1, script2]:
        items = s.get_items()
        s.add_items(document, items)

    document.save('demo.docx')

if __name__ == '__main__':
    main()
Refactor:
Pseudo code:
Imports
Create document
Get info from site 1
Put info from site 1 in document
Get info from site 2
Put info from site 2 in document
Write document
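A minimal single-file skeleton following that pseudo code could look like the sketch below; the get_*/add_* names are stand-ins whose bodies would be copied from get_items()/add_items() in script1.py and script2.py:

import requests                    # used by the real function bodies
from bs4 import BeautifulSoup      # used by the real function bodies
from docx import Document
from docx.shared import Pt         # used by the real function bodies

def get_brookings_items():
    # body of get_items() from script1.py goes here
    return []

def add_brookings_items(document, items):
    # body of add_items() from script1.py goes here
    pass

def get_newamerica_items():
    # body of get_items() from script2.py goes here
    return []

def add_newamerica_items(document, items):
    # body of add_items() from script2.py goes here
    pass

# Create document
document = Document()

# Get info from site 1 and put it in the document
add_brookings_items(document, get_brookings_items())

# Get info from site 2 and put it in the document
add_newamerica_items(document, get_newamerica_items())

# Write document
document.save('demo.docx')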
Below is my code. I want to scrape the website and store the values in Excel:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openpyxl import workbook

Name = []
Mob = []
Add = []
E_mail = []
website = []

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
}

url = requests.get("https://www.example.com", headers=headers)
soup = BeautifulSoup(url.content, 'html.parser')

travel_name = soup.findAll(attrs={'class': 'list-group-item'})
for name in travel_name:
    for a in name.findAll('a', attrs={"class": "text-warning"}):
        user = a.text
        Name.append(user)

    pList = name.findAll('p', attrs={"class": "mb-2 text-truncate"})
    for p in pList:
        # print(p.text)
        if p.text.find("Contact:") != -1:
            contact = str.replace(p.text, "Contact:", "")
            Mob.append(contact)
            # print(contact)
        if p.text.find("Location:") != -1:
            location = str.replace(p.text, "Location:", "")
            Add.append(location)
            # print(location)
        if p.text.find("Email:") != -1:
            email = str.replace(p.text, "Email:", "")
            E_mail.append(email)
            # print(email)
        if p.text.find("Website:") != -1:
            web = str.replace(p.text, "Website:", "")
            website.append(web)
I want to store the values in Excel column-wise, with Name, Mob, Add, E_mail and website as the columns. I tried df = pd.DataFrame() but I am failing.
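For reference, building the frame straight from those parallel lists would be a sketch like the one below; it only works when all five lists end up the same length, which is exactly what tends to break when a listing is missing a field:

import pandas as pd

# works only if Name, Mob, Add, E_mail and website all have the same number of entries;
# pd.DataFrame raises ValueError when the lists differ in length
df = pd.DataFrame({
    'Name': Name,
    'Mob': Mob,
    'Add': Add,
    'E_mail': E_mail,
    'website': website,
})
print(df.head())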
Follow the pattern below; I hope it is clear, and if not, feel free to ask for more explanation:
data = []
for name in travel_name:
    dict_ = {}  # create a dict for each item, i.e. it will represent one row in the spreadsheet
    dict_['name'] = [a.text for a in name.findAll('a', attrs={"class": "text-warning"})]
    dict_['contact'] = ...  # code to extract the value, as done for name
    dict_['email'] = ...    # code to extract the value, as done for name
    dict_['website'] = ...  # code to extract the value, as done for name
    data.append(dict_)  # append each dictionary (later to be a row in the sheet) to the list

df = pd.DataFrame(data)
df.to_csv('data.csv', index=False)
Here name, contact, email and website will be the names of the columns, and every iteration creates one row for these columns from your data.
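If you specifically want an .xlsx file rather than a CSV, the same frame can be written through openpyxl (which the question already imports); the file name below is only an example:

df.to_excel('data.xlsx', index=False)  # requires openpyxl to be installed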
I'm trying to get leads from Yelp using Python and BeautifulSoup, but I'm not able to capture the fields for phone, name, address and website (optional).
I'm getting the following error. I searched and found different solutions, but they didn't work for me.
Here is my code:
from bs4 import BeautifulSoup
import requests
import sys
import csv
import requests, re, json

## Get the min and max page numbers
pagenum = 0
maxpage = 0

## loop to go through the pages
while pagenum <= maxpage:
    newsu = pagenum
    newsu = str(newsu)
    csvname = 'cardealers' + newsu + '.csv'
    csvfile = open(csvname, 'w', encoding="utf-8")
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Business name', 'phone', 'address'])

    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get('https://www.yelp.com/search?find_desc=Used%20Car%20Dealers&find_loc=New%20York%2C%20NY&ns=1&sortby=review_count&start={}'.format(pagenum), headers=headers)

    p = re.compile(r'PRELOADED_STATE__ = (.*?);')
    data = json.loads(p)
    print(data)

    pagenum = pagenum + 1

    for item in data['searchResult']['results']:
        name = item['businessName']
        phone = item['phone']
        address = ([item['address'], item['city'], item['state'], item['postalcode']])
        csv_writer.writerow([name, phone, address])
        print(name)

    csvfile.close()
Here is the error message:

Traceback (most recent call last):
  File "\Python\Python36\scraper\scrape.py", line 22, in <module>
    data = json.loads(p)
  File "\Python\Python36\lib\json\__init__.py", line 348, in loads
    'not {!r}'.format(s.__class__.__name__))
TypeError: the JSON object must be str, bytes or bytearray, not 'SRE_Pattern'
You are passing json.loads() something that is not a JSON string at all: p is the compiled regex pattern itself.
Essentially, this is what you are doing:
data = json.loads('THIS IS JUST A STRING. NOT IN A JSON FORMAT')
So you want to do something along the lines of data = json.loads(p.search(r.text).group(1)), i.e. pass the matched string, not the compiled pattern.
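In other words, a minimal sketch (assuming the PRELOADED_STATE__ blob were actually present in r.text, which, as noted below, it is not on this page):

import re
import json

p = re.compile(r'PRELOADED_STATE__ = (.*?);')
m = p.search(r.text)                  # returns a Match object, or None if nothing matched
if m:
    data = json.loads(m.group(1))     # json.loads() needs the matched string, not the pattern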
You actually need to pull that string out of the HTML. The other MAJOR issue, though, is that it is not even in the HTML you are pulling, so the search will always come back empty.
Also, you are not really iterating through anything: you start at pagenum=0 with maxpage=0 and run while pagenum <= maxpage, so the loop never gets past page 0.
The JSON structure with the data is in the HTML, but it looks like it sits inside the comments, so you'll need to parse that instead.
Also, why do
    newsu = pagenum
    newsu = str(newsu)
when you can simply do newsu = str(pagenum)? And do you really want a separate file for each iteration? I just put everything into one file:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
import math

## Get the min and max page numbers
pagenum = 0

results = pd.DataFrame()
with requests.Session() as s:
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'}
    url = 'https://www.yelp.com/search?find_desc=Used%20Car%20Dealers&find_loc=New%20York%2C%20NY&ns=1&sortby=review_count&start={}'.format(pagenum)
    r = s.get(url, headers=headers)

    soup = BeautifulSoup(r.text, 'html.parser')
    scripts = soup.find_all('script')
    for script in scripts:
        if '<!--{' in script.text:
            jsonStr = script.text.split('<!--')[-1].split('-->')[0]
            jsonData = json.loads(jsonStr)

            totalPages = jsonData['searchPageProps']['searchResultsProps']['paginationInfo']['totalResults']
            resultsPerPage = jsonData['searchPageProps']['searchResultsProps']['paginationInfo']['resultsPerPage']
            totalPages = math.ceil(totalPages / resultsPerPage)

    ## loop to go through the pages
    for pagenum in range(0, totalPages + 1):
        url = 'https://www.yelp.com/search?find_desc=Used%20Car%20Dealers&find_loc=New%20York%2C%20NY&ns=1&sortby=review_count&start={}'.format(pagenum)
        r = s.get(url, headers=headers)

        soup = BeautifulSoup(r.text, 'html.parser')
        scripts = soup.find_all('script')
        for script in scripts:
            if '<!--{' in script.text:
                jsonStr = script.text.split('<!--')[-1].split('-->')[0]
                jsonData = json.loads(jsonStr)

                for each in jsonData['searchPageProps']['searchResultsProps']['searchResults']:
                    if 'searchResultBusiness' in each.keys():
                        busiName = each['searchResultBusiness']['name']
                        phone = each['searchResultBusiness']['phone']
                        address = each['searchResultBusiness']['formattedAddress']

                        temp_df = pd.DataFrame([[busiName, phone, address]], columns=['Business name', 'phone', 'address'])
                        results = results.append(temp_df, sort=False).reset_index(drop=True)

        print('Acquired page: %s' % pagenum)

results.to_csv('cardealers.csv', index=False)
I'm new to Python and currently writing an application that scrapes data off the web. It's mostly done; there is only a small problem left with encoding. The site is encoded in ISO-8859-1, but when I try html.decode('iso-8859-1'), it doesn't do anything.
If you run the program, use 50000 and 50126 for PLZs and you'll see what I mean in the output. It would be awesome if someone could help me out.
import urllib.request
import time
import csv
import operator
from bs4 import BeautifulSoup

#Performs a HTTP-'POST' request, passes it to BeautifulSoup and returns the result
def doRequest(request):
    requestResult = urllib.request.urlopen(request)
    soup = BeautifulSoup(requestResult)
    return soup

#Returns all the result links from the given search parameters
def getLinksFromSearch(plz_von, plz_bis):
    database = []
    links = []

    #The search parameters
    params = {
        'name_ff': '',
        'strasse_ff': '',
        'plz_ff': plz_von,
        'plz_ff2': plz_bis,
        'ort_ff': '',
        'bundesland_ff': '',
        'land_ff': 'DE',
        'traeger_ff': '',
        'Dachverband_ff': '',
        'submit2': 'Suchen'
    }

    DATA = urllib.parse.urlencode(params)
    DATA = DATA.encode('utf-8')

    request = urllib.request.Request(
        "http://www.altenheim-adressen.de/schnellsuche/suche1.cfm",
        DATA)

    # adding charset parameter to the Content-Type header.
    request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
    request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")

    #The search request
    html = doRequest(request)

    h = html.decode('iso-8859-1')
    soup = BeautifulSoup(h)

    for link in soup.find_all('a'):
        database.append(link.get('href'))

    #Remove the first Element ('None') to avoid Attribute Errors
    database.pop(0)

    for item in database:
        if item.startswith("suche"):
            links.append(item)

    return links

#Performs a search on the link results
def searchOnLinks(links):
    adresses = []
    i = 1
    j = len(links)
    print("Found", j, "results, collecting data.")
    for item in links:
        adresses.append(getContactInfoFromPage(item, i, j))
        i = i + 1
        time.sleep(0.1)
    print("All done.")
    return adresses

#A method to scrape the contact info from the search result
def getContactInfoFromPage(page, i, j):
    name = ''
    straße = ''
    plz = ''
    stadt = ''
    telefon = ''
    mail = ''
    url = ''

    data = [
        #'Name',
        #'Straße',
        #'PLZ',
        #'Stadt',
        #'Telefon',
        #'E-Mail',
        #'Homepage'
    ]

    request = urllib.request.Request("http://www.altenheim-adressen.de/schnellsuche/" + page)
    #request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
    request.add_header("Content-Type", "text/html;charset=UTF-8")
    request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")

    print("(", i, "/", j, ") Making request...")
    soup = doRequest(request)
    print("Done.")

    findeName = soup.findAll('b')
    name = findeName[2]
    name = name.string.split('>')
    data.append(name[0])

    straße = getFieldValue(soup, "Straße")
    data.append(straße)

    ort = getFieldValue(soup, "Ort")
    (plz, stadt) = ort.split(' ', 1)
    data.append(plz)
    data.append(stadt)

    telefon = getFieldValue(soup, "Telefon")
    data.append(telefon)

    mail = getFieldValue(soup, "EMail")
    data.append(mail)

    url = getFieldValue(soup, "Internetadresse")
    data.append(url)

    return data

#Strips the text from the given field's sibling
def getFieldValue(soup, field):
    field_label = soup.find('td', text=field + ':')
    return field_label.find_next_sibling('td').get_text(strip=True)

#The main input/output function
def inputOutput():
    #PLZ is German for zip-code and consists of a five-digit number
    #The program passes the numbers to the servers, and the server
    #returns all search results between the two numbers
    plz_von = input("Please enter first PLZ: ")
    plz_bis = input("Please enter second PLZ: ")

    links = getLinksFromSearch(plz_von, plz_bis)

    #Checks if the search yielded any results
    if len(links) > 0:
        data = searchOnLinks(links)
        file_name = input("Save as: ")
        print("Writing to file...")
        with open(file_name + '.csv', 'w', newline='') as fp:
            a = csv.writer(fp, delimiter=',')
            a.writerows(data)
    else:
        print("The search yielded no results.")

inputOutput()
Your doRequest() function returns a BeautifulSoup object; you cannot decode that object. Just use it directly:
soup = doRequest(request)
You don't need to decode the response at all; BeautifulSoup uses both hints in the HTML (<meta> headers) as well as statistical analysis to determine the correct input encoding.
In this case the HTML document claims it is Latin-1:
<meta name="content-type" content="text/html; charset=iso-8859-1">
The response doesn't include a character set in the Content-Type header either, so this is a case of a misconfigured server. You can force BeautifulSoup to ignore the <meta> header with:
soup = BeautifulSoup(requestResult, from_encoding='utf8')
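As a sanity check you can also ask BeautifulSoup which encoding it actually settled on via its original_encoding attribute; a minimal sketch (fetching the search form page with a plain GET just for illustration, and naming html.parser explicitly):

import urllib.request
from bs4 import BeautifulSoup

requestResult = urllib.request.urlopen("http://www.altenheim-adressen.de/schnellsuche/suche1.cfm")
soup = BeautifulSoup(requestResult, 'html.parser')
print(soup.original_encoding)   # the encoding BeautifulSoup detected, e.g. 'iso-8859-1' from the <meta> tag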