I am using BeautifulSoup to practice getting website content. But there are duplicates in the output: starting from the second page, the entries repeat themselves. I tried to modify the code in the for loop, but it still repeats.
#coding:utf-8
import lxml
import json
import re
import requests
from bs4 import BeautifulSoup

def the_url(url):
    user_agent = "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"
    headers = {"User-Agent": user_agent}
    r = requests.get(url, headers=headers)
    return r.text

def get_text(page_html):
    the_web = BeautifulSoup(page_html, 'html.parser')
    base_url = "https://cn.reuters.com"
    list_div = the_web.find('div', {"id": 'chinaNews'})
    list_li = list_div.find_all('li')
    for t in list_li:
        the_dict = {}
        a = t.find('a')
        excerpt = t.find('div', {"class": 'smalltext'})
        if a:
            the_dict['link'] = base_url + a.get('href')
            the_dict['title'] = a.get_text()
        if excerpt:
            the_dict['excerpt'] = excerpt.get_text()
        result_list.append(the_dict)

def save_to_json(result):
    s = json.dumps(result, indent=4, ensure_ascii=False)
    # json file
    with open('text.json', 'w', encoding='utf-8') as f:
        f.write(s)

def main():
    for i in range(2):
        i = i + 1
        url = 'http://cn.mobile.reuters.com/category/chinaNews?p={}'.format(i)
        page_html = the_url(url)
        get_text(page_html)
    save_to_json(result_list)

if __name__ == '__main__':
    result_list = []
    main()
I want to remove the duplicates in the output.
You can check whether a dict with the same link is already in result_list:

if the_dict and not any(r['link'] == the_dict['link'] for r in result_list):
    # No dict with this link exists in result_list
    result_list.append(the_dict)
Here is that check applied inside your get_text method:
def get_text(page_html):
    the_web = BeautifulSoup(page_html, 'html.parser')
    base_url = "https://cn.reuters.com"
    list_div = the_web.find('div', {"id": 'chinaNews'})
    list_li = list_div.find_all('li')
    for t in list_li:
        the_dict = {}
        a = t.find('a')
        excerpt = t.find('div', {"class": 'smalltext'})
        if a:
            the_dict['link'] = base_url + a.get('href')
            the_dict['title'] = a.get_text()
        if excerpt:
            the_dict['excerpt'] = excerpt.get_text()
        if the_dict and not any(r['link'] == the_dict['link'] for r in result_list):
            result_list.append(the_dict)
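If result_list grows large, the any(...) scan over the whole list gets slow. A set of already-seen links keeps the membership test cheap. Below is a minimal sketch under the same imports and the global result_list from the question; the seen_links set is an addition of mine, not part of the original code:

seen_links = set()

def get_text(page_html):
    the_web = BeautifulSoup(page_html, 'html.parser')
    base_url = "https://cn.reuters.com"
    list_div = the_web.find('div', {"id": 'chinaNews'})
    for t in list_div.find_all('li'):
        a = t.find('a')
        if not a:
            continue
        link = base_url + a.get('href')
        if link in seen_links:  # skip articles already collected on an earlier page
            continue
        seen_links.add(link)
        the_dict = {'link': link, 'title': a.get_text()}
        excerpt = t.find('div', {"class": 'smalltext'})
        if excerpt:
            the_dict['excerpt'] = excerpt.get_text()
        result_list.append(the_dict)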
This is what I have written so far to analyze reviews from IMDb.
First it fetches the reviews from the IMDb website (top 250 movies).
Then it fetches the movie links and review links, extracts the text of the reviews, and stores it in a dictionary in the format movie_name: movie_review.
In the last step I am able to print Movie_Name: Movie review on the console. But when I write to a CSV file it either raises errors or writes incorrect data to the CSV file.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import csv
import requests
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}

url = input('Enter - ')
while (True):
    try:
        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.content, "html.parser")
        container = soup.find_all('td', class_='titleColumn')
        break
    except:
        print("Please enter a valid url:")
        url = input('Enter - ')

def movies_list():
    movie_names = []
    movies = container[:100]  # here we get the top 100 movies we want
    for movie in movies:
        name = movie.find('a').text
        movie_names.append(name)
    return movie_names
    #print(movie_names)

def movie_links_list():
    movie_links = []
    movies = container[:100]
    for movie in movies:
        tag = movie.find('a')
        link = tag.get('href', None)
        movie_links.append(link)
    for i in range(len(movie_links)):
        movie_links[i] = 'https://www.imdb.com/' + movie_links[i]
    return movie_links

def review_link_list(movie_links):
    review_links = []
    for movie_link in movie_links:
        title_pos = movie_link.find('title')
        nxt_slash = movie_link.find('/', title_pos)
        nxt2_slash = movie_link.find('/', nxt_slash+1)
        review_link = movie_link[:title_pos-1] + movie_link[title_pos:nxt2_slash+1] + "reviews?ref_=tt_urv"
        review_links.append(review_link)
    return review_links

def get_reviews(review_links):
    movie_names = movies_list()
    review_dict = {}
    for i in range(len(review_links)):
        movie_name = movie_names[i]
        movie_reviews = []
        review_page = requests.get(review_links[i], headers=headers)
        soup = BeautifulSoup(review_page.content, "html.parser")
        tag = soup.find_all('div', class_='content')  # find_all to return a list
        top_50 = tag[:50]
        for j in top_50:
            try:
                review = j.select('div.show-more__control')[0].text
            except:
                continue
            movie_reviews.append(review)
        review_dict[movie_name] = movie_reviews
    return review_dict

file = "abc.csv"
with open(file, 'w') as csvfile:
    for i in range(len(movies)):
        csvwriter = csv.writer(csvfile)
        Name = movies[i]
        Review = reviews_dict[Name]
        try:
            csvwriter.writerow(Review)
        except:
            csvwriter.writerow("Review does not exist")
You need to open the file and write a list with the data:

import csv

dict = {"mykey": 10}
with open("mydata.csv", 'a') as file:
    writer = csv.writer(file)
    for key, value in dict.items():
        data = [key, value]
        writer.writerow(data)

In the CSV file "mydata.csv" you will now get:

mykey,10

When you use 'a' as the mode argument to open, data is appended to the file, so the old data is not overwritten.
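Applied to the review data from the question, writing one row per review might look roughly like this. It is only a sketch: it assumes reviews_dict is the movie_name -> list-of-reviews dictionary returned by get_reviews, and it uses newline='' so blank rows are not inserted on Windows:

import csv

with open("abc.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["movie_name", "review"])  # header row
    for movie_name, reviews in reviews_dict.items():
        if not reviews:
            writer.writerow([movie_name, "Review does not exist"])
            continue
        for review in reviews:
            writer.writerow([movie_name, review])  # one row per review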
αԋɱҽԃ αмєяιcαη helped me construct this code for scraping reviews from this page, where the reviews are dynamically loaded. I then tried to adjust it so that it scrapes not just the comment body but also the commenters' names, dates, and ratings, and saves the extracted data to an Excel file. But I failed to do so. Could someone help me adjust the code correctly?
This is the code from αԋɱҽԃ αмєяιcαη:
import requests
from bs4 import BeautifulSoup
import math

def PageNum():
    r = requests.get(
        "https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
    soup = BeautifulSoup(r.text, 'html.parser')
    num = int(
        soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
    if num % 3 == 0:
        return (num / 3) + 1
    else:
        return math.ceil(num / 3) + 2

def Main():
    num = PageNum()
    headers = {
        'X-Requested-With': 'XMLHttpRequest'
    }
    with requests.Session() as req:
        for item in range(1, num):
            print(f"Extracting Page# {item}")
            r = req.get(
                f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            for com in soup.findAll("div", class_=r'\"comment-body\"'):
                print(com.text[5:com.text.find(r"\n", 3)])

Main()
This is the code I adjusted, but I then got errors that I couldn't resolve:
import requests
from bs4 import BeautifulSoup
import math
import pandas as pd

df = pd.DataFrame()

def PageNum():
    r = requests.get(
        "https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
    soup = BeautifulSoup(r.text, 'html.parser')
    num = int(
        soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
    if num % 3 == 0:
        return (num / 3) + 1
    else:
        return math.ceil(num / 3) + 2

def Main():
    num = PageNum()
    headers = {
        'X-Requested-With': 'XMLHttpRequest'
    }
    with requests.Session() as req:
        for item in range(1, num):
            names = []
            headers = []
            bodies = []
            ratings = []
            published = []
            updated = []
            reported = []
            dateElements = []
            print(f"Extracting Page# {item}")
            r = req.get(
                f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            for com in soup.findAll("div", class_=r'\"user-review\"'):
                names.append(article.find('div', attrs={'class': 'name'}).text.strip())
                try:
                    bodies.append(article.find('div', attrs={'class': 'comment-body'}).text.strip())
                except:
                    bodies.append('NA')
                try:
                    ratings.append(article.find('meta', attrs={'itemprop': 'ratingValue'})['content'])
                except:
                    ratings.append('NA')
                dateElements.append(article.find('div', attrs={'class': 'comment-date'}).text.strip())
                print(com.text[5:com.text.find(r"\n", 3)])
            temp_df = pd.DataFrame(
                {'User Name': names, 'Body': bodies, 'Rating': ratings, 'Published Date': dateElements})
            df = df.append(temp_df, sort=False).reset_index(drop=True)

Main()
df.to_csv('Allure10.csv', index=False, encoding='utf-8')
print('excel done')
import requests
from bs4 import BeautifulSoup
import math
import csv

def PageNum():
    r = requests.get(
        "https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
    soup = BeautifulSoup(r.text, 'html.parser')
    num = int(
        soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
    if num % 3 == 0:
        return (num / 3) + 1
    else:
        return math.ceil(num / 3) + 2

def Main():
    num = PageNum()
    headers = {
        'X-Requested-With': 'XMLHttpRequest'
    }
    with requests.Session() as req:
        names = []
        dates = []
        comments = []
        rating = []
        for item in range(1, num):
            print(f"Extracting Page# {item}")
            r = req.get(
                f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            for com in soup.findAll("div", class_=r'\"comment-body\"'):
                comments.append(com.text[5:com.text.find(r"\n", 3)])
            for name in soup.findAll("div", class_=r'\"name\"'):
                names.append(name.text[:name.text.find(r"<\/div>", 1)])
            for date in soup.findAll("div", class_=r'\"comment-date\"'):
                dates.append(date.text[:date.text.find(r"<\/div>", 1)])
            for rate in soup.findAll("meta", itemprop=r'\"ratingValue\"'):
                rating.append(rate.get("content")[2:-3])
        return zip(names, dates, rating, comments)

def Save():
    data = Main()
    with open("oka.csv", 'w', newline="", encoding="UTF-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "Dates", "Rating", "Comments"])
        writer.writerows(data)

Save()
I am trying to crawl multiple pages of a website. But the program can only crawl the first page.
import requests
from bs4 import BeautifulSoup
import re
import json
import time

def make_soup(url):
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'lxml')
    pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
    script = soup.find("script", text=pattern)
    jsonData = pattern.search(script.text).group(1)
    pattern_number = re.compile(r'\"[0-9]{9,12}\":(\{\"data\":\{\"cachedFilters\":(.*?)\}\}),\"[0-9]{9,11}\"')
    jsonData2 = pattern_number.search(jsonData).group(1)
    dictData = json.loads(jsonData2)
    return dictData

def get_reviews(dictData):
    """Return a list of five dicts with reviews."""
    all_dictionaries = []
    for data in dictData['data']['locations']:
        for reviews in data['reviewListPage']['reviews']:
            review_dict = {}
            review_dict["reviewid"] = reviews['id']
            review_dict["reviewurl"] = reviews['absoluteUrl']
            review_dict["reviewlang"] = reviews['language']
            review_dict["reviewdate"] = reviews['createdDate']
            userProfile = reviews['userProfile']
            review_dict["author"] = userProfile['displayName']
            all_dictionaries.append(review_dict)
    return all_dictionaries

def main():
    url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS'
    dictData = make_soup(url)
    review_list = get_reviews(dictData)  # list with five dicts
    #print(review_list)
    page_number = 5
    while page_number <= 260:  # number in the URL
        next_url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
        dictData = make_soup(url)
        review_list2 = get_reviews(dictData)
        print(review_list2)
        page_number += 5
        time.sleep(0.5)

if __name__ == "__main__":
    main()
And I'm not sure whether I can crawl multiple pages with this URL. The website has 54 pages, but in the URL I always have to increase the number by 5, like this:
Page 1
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS
Page 2
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or5-Coronado_Hotel-Zurich.html#REVIEWS
Page 3
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or10-Coronado_Hotel-Zurich.html#REVIEWS
I don't know if this is a good idea. Do you have any suggestions? Thank you in advance!
You assign the new URL to next_url, but you use url to read the page:
next_url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)
You have to rename the variable:
url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)
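Put together, the paging part of main() could look like this after the fix. This is only a sketch that keeps make_soup and get_reviews from the question and only changes the URL handling:

def main():
    base = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-{}Coronado_Hotel-Zurich.html#REVIEWS'
    all_reviews = get_reviews(make_soup(base.format('')))  # page 1 has no -orN- part

    page_number = 5
    while page_number <= 260:  # 5, 10, 15, ... matches the offsets in the site's URLs
        url = base.format('or{}-'.format(page_number))
        all_reviews.extend(get_reviews(make_soup(url)))  # now the new URL is actually requested
        page_number += 5
        time.sleep(0.5)

    print(len(all_reviews), 'reviews collected')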
There are quite similar scenarios to this one, and I've been comparing mine with others, such as getting data from clustered nodes. But somehow I can't figure out why my for loop isn't iterating over the other elements and grabbing their text; it only grabs the first element of the node.
from requests import get
from bs4 import BeautifulSoup

url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'}

response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')

def findDiv():
    try:
        for container in html_soup.find_all('div', {'class': 'section-trending-search-list'}):
            topic = container.select_one(
                'div._1waRmo')
            if topic:
                print(1)
                d = {
                    'Titles': topic.text.replace("\n", "")}
                print(2)
                l.append(d)
                return d
    except:
        d = None

findDiv()
print(l)
from requests import get
from bs4 import BeautifulSoup

url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'}

response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')

def findDiv():
    try:
        for container in html_soup.find_all('div', {'class': '_25qBG5'}):
            topic = container.select_one('div._1waRmo')
            if topic:
                d = {'Titles': topic.text.replace("\n", "")}
                l.append(d)
        return d
    except:
        d = None

findDiv()
print(l)
Output:
[{'Titles': 'school backpack'}, {'Titles': 'oppo case'}, {'Titles': 'baby chair'}, {'Titles': 'car holder'}, {'Titles': 'sling beg'}]
Again I suggest you use selenium. If you run this again, you will see that you get a different set of 5 dictionaries in the list. Every time you make a request, the site serves 5 random trending items. But it does have a 'change' button. If you use selenium, you might be able to just click that and keep scraping all trending items.
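If you want to try the selenium route, a rough sketch could look like the following. Note that the locator for the 'change' button is a guess of mine and would need to be checked against the live page, and that selenium plus a matching browser driver must be installed:

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get('https://shopee.com.my/')
time.sleep(5)  # give the dynamically loaded content time to appear

titles = set()
for _ in range(10):  # press "change" a few times to rotate through trending items
    for el in driver.find_elements(By.CSS_SELECTOR, 'div._1waRmo'):  # selector taken from the question
        titles.add(el.text.strip())
    # NOTE: placeholder locator - inspect the page for the real "change" button
    driver.find_element(By.XPATH, "//button[contains(., 'Change')]").click()
    time.sleep(2)

driver.quit()
print(titles)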
Try this:
toplevel finds the root of the options, then we find all divs under that.
I hope this is what you want.
from requests import get
from bs4 import BeautifulSoup

url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}

response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')

def findDiv():
    try:
        toplevel = html_soup.find('._25qBG5')
        for container in toplevel.find_all('div'):
            topic = container.select_one('._1waRmo')
            if topic:
                print(1)
                d = {'Titles': topic.text.replace("\n", "")}
                print(2)
                l.append(d)
        return d
    except:
        d = None

findDiv()
print(l)
This enumerates fine with a local file. When I tried with the URL given, the website wasn't returning the HTML you show.
from requests import get
from bs4 import BeautifulSoup

url = 'path_in_here\\test.html'
l = []
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}

example = open(url, "r")
text = example.read()

#response = get(url, headers=headers)
#html_soup = BeautifulSoup(response.text, 'html.parser')
html_soup = BeautifulSoup(text, 'html.parser')
print(text)

def findDiv():
    #try:
    print("finding toplevel")
    toplevel = html_soup.find("div", {"class": "_25qBG5"})
    print("found toplevel")
    divs = toplevel.findChildren("div", recursive=True)
    print("found divs")
    for container in divs:
        print("loop")
        topic = container.select_one('._1waRmo')
        if topic:
            print(1)
            d = {'Titles': topic.text.replace("\n", "")}
            print(2)
            l.append(d)
    return d
    #except:
    #    d = None
    #    print("error")

findDiv()
print(l)
I have some problems with web scraping, here is my code:
from bs4 import BeautifulSoup
import requests
import re
import csv
import argparse

def save_csv_file(filename, array):
    with open(filename, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(["item_name", "item_price", "item_category"])
        writer.writerows(array)

def process_data(name, price, category):
    item_name = name.text if name else 'NA'
    item_price = price.text if price else 'NA'
    item_category = category.text if category else 'NA'
    item_name = item_name.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    item_price = item_price.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    item_category = item_category.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    return (item_name, item_price, item_category)

def do_scrap(filename, url, payload, headers):
    # Request the URL with parameters and headers
    r = requests.post(url, payload, headers=headers, allow_redirects=True)
    if(r.status_code == 200):
        # Save response content in html variable
        html = r.content
        # Parse the html variable into an HTML document with bs4
        parsed_html = BeautifulSoup(html, "html.parser")
        # Print document title
        print parsed_html.head.find('title').text
        # Find all of the HTML elements which describe hotels
        tables = parsed_html.find_all("a", {"class": "result-link"})
        # Print the number of hotels found
        print "Found %s records." % len(tables)
        # Empty helpers
        items = []
        count = 0
        # Loop over the HTML elements and collect properties for each hotel
        for table in tables:
            name = table.find("h3", {"class": "result-title"})
            price = table.find("p", {"class": "price text-truncate"})
            category = table.find("p", {"class": "merchant-name text-truncate"})
            items.append(process_data(name, price, category))
            count += 1
        if count > 0:
            # Save array with data to csv file
            save_csv_file(filename=filename, array=items)
            # Print end of job info
            print "\n%s records downloaded and saved to %s." % (count, filename)
    else:
        print "Code error: %s" % r.status_code

if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", "--product", required=True, help="Product name")
    ap.add_argument("-c", "--category", default="", help="Product category")
    args = vars(ap.parse_args())

    product = args['product']
    category = args['category']

    payload = {
        'siteSearchQuery': product,
        'from': 'colibri'
    }
    headers = {
        'Host': 'www.kelkoo.co.uk',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36'
    }
    url = "http://www.kelkoo.co.uk/ctl/do/search"
    filename = "%s_co_uk_kelkoo_data.csv" % product

    do_scrap(
        filename=filename,
        url=url,
        payload=payload,
        headers=headers)
After this request I get a different result than when I put this:
www.kelkoo.co.uk/ctl/do/search?siteSearchQuery=nokia+130&from=colibri
into my web browser. What is causing this problem? Is it something related to page redirection?
I can see multiple things that will cause you to get different results:
You initiate a POST, not a GET. Look up the params argument for requests.get.
They use JavaScript to modify the page.
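For the first point, a GET with the params argument reproduces the URL you type into the browser. A rough sketch, reusing the product variable and headers dict already defined in the question:

r = requests.get(
    "http://www.kelkoo.co.uk/ctl/do/search",
    params={"siteSearchQuery": product, "from": "colibri"},  # becomes ?siteSearchQuery=...&from=colibri
    headers=headers,
    allow_redirects=True,
)
print(r.url)          # the final URL after encoding and any redirects
print(r.status_code)

For the second point, anything the page builds with JavaScript will not appear in r.content, so a browser-driving tool such as selenium would be needed to see that part of the page.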