Convert multiple strings from Selenium and BeautifulSoup to a CSV file - Python

I have this scraper I am trying to export as a CSV file in Google Colab. I received the scraped information as string values, but I cannot convert them to a CSV. I want each scraped attribute ("title", "size", etc.) to populate a column in a CSV file. I have run the strings through BeautifulSoup to remove the HTML formatting. Please see my code below.
import pandas as pd
import time
import io
from io import StringIO
import csv
#from google.colab import drive
#drive.mount('drive')
#Use new library (kora.selenium) to run chromedriver
from kora.selenium import wd
#Import BeautifulSoup to parse HTML formatting
from bs4 import BeautifulSoup

wd.get("https://www.grailed.com/sold/EP8S3v8V_w") #Get webpage

ScrollNumber = round(200/40) + 1
for i in range(0, ScrollNumber):
    wd.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(2)

#--------------#
#Each new attribute has to be found using XPath because Grailed's website is rendered with JavaScript (React), not plain HTML
#Only 39 results will show because the page uses infinite scroll and Selenium must be told to keep scrolling.
follow_loop = range(2, 200)
for x in follow_loop:
    #Title
    title = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    title += str(x)
    title += "]/a/div[3]/div[2]/p"
    title = wd.find_elements_by_xpath(title)
    title = str(title)
    #Price
    price = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    price += str(x)
    price += "]/div/div/p/span"
    price = wd.find_elements_by_xpath(price)
    price = str(price)
    #Size
    size = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    size += str(x)
    size += "]/a/div[3]/div[1]/p[2]"
    size = wd.find_elements_by_xpath(size)
    size = str(size)
    #Sold
    sold = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    sold += str(x)
    sold += "]/a/p/span"
    sold = wd.find_elements_by_xpath(sold)
    sold = str(sold)
    #Clean HTML formatting using BeautifulSoup
    cleantitle = BeautifulSoup(title, "lxml").text
    cleanprice = BeautifulSoup(price, "lxml").text
    cleansize = BeautifulSoup(size, "lxml").text
    cleansold = BeautifulSoup(sold, "lxml").text

This was a lot of work lol
from selenium import webdriver
import time
import csv

driver = webdriver.Chrome()
driver.get("https://www.grailed.com/sold/EP8S3v8V_w")

scroll_count = round(200 / 40) + 1
for i in range(scroll_count):
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(2)
time.sleep(3)

titles = driver.find_elements_by_css_selector("p.listing-designer")
prices = driver.find_elements_by_css_selector("p.sub-title.sold-price")
sizes = driver.find_elements_by_css_selector("p.listing-size.sub-title")
sold = driver.find_elements_by_css_selector("div.-overlay")

data = [titles, prices, sizes, sold]
data = [list(map(lambda element: element.text, arr)) for arr in data]

with open('sold_shoes.csv', 'w') as file:
    writer = csv.writer(file)
    j = 0
    while j < len(titles):
        row = []
        for i in range(len(data)):
            row.append(data[i][j])
        writer.writerow(row)
        j += 1
I'm not sure why it puts a blank line between every row in the file, but I assume it's not a problem. Also, it's a naïve solution in that it assumes every list is the same length; consider using one list of parent elements and building each row from the child elements of the parent. Also, I just used Selenium without BeautifulSoup because it's easier for me, but you should learn BS too because it's faster for scraping than Selenium. Happy coding.
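For what it's worth, the blank line after every row comes from opening the file without newline='', which the csv module documentation calls out on Windows. Below is a minimal sketch of that fix plus the parent-element idea; the div.feed-item card selector is an assumption and has not been verified against Grailed's current markup:

import csv
from selenium import webdriver

def text_or_blank(parent, selector):
    # return the text of the first matching child, or '' when it is missing
    found = parent.find_elements_by_css_selector(selector)
    return found[0].text if found else ''

driver = webdriver.Chrome()
driver.get("https://www.grailed.com/sold/EP8S3v8V_w")

# newline='' stops csv.writer from writing a blank line after every row on Windows
with open('sold_shoes.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['title', 'price', 'size', 'sold'])
    # one parent element per listing card, so the columns always line up
    for card in driver.find_elements_by_css_selector("div.feed-item"):
        writer.writerow([
            text_or_blank(card, "p.listing-designer"),
            text_or_blank(card, "p.sub-title.sold-price"),
            text_or_blank(card, "p.listing-size.sub-title"),
            text_or_blank(card, "div.-overlay"),
        ])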

While opening a .xlsx file written through Python, an error pops up: file format or file extension is not valid, verify that the file is not corrupted

from selenium import webdriver
import time
from bs4 import BeautifulSoup as Soup
from urllib.request import urlopen
import datetime as dt
import csv
import pandas as pd

driver = webdriver.Firefox(executable_path='C://Downloads//webdrivers//geckodriver.exe')

c1 = 'amazon_data_' + dt.datetime.now().strftime("%d_%b_%y_%I_%M_%p")
d = open(str(c1) + '.csv', 'x', encoding='utf-8')
#d = open(str(c1) + '.xlsx', 'x', encoding='utf-8')

for c in range(1):
    a = f'https://www.flipkart.com/search?q=sony+headphones&as=on&as-show=on&otracker=AS_Query_HistoryAutoSuggest_1_4_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_4_na_na_na&as-pos=1&as-type=HISTORY&suggestionId=sony+headphones&requestId=ad797917-16ae-401e-98df-1c79a43d40c3&as-backfill=on&page={c}'
    '''
    request_response = requests.head(a)
    status_code = request_response.status_code
    if status_code == 200:
        print(True)
    else:
        print(False)
    '''
    driver.get(a)
    # time.sleep(1)
    page_soup = Soup(urlopen(a), 'html5lib')
    container = page_soup.find_all('div', {'class': '_4ddWXP'})
    for containers in container:
        find_url = containers.find('a')['href']
        new_url = 'https://www.flipkart.com' + find_url
        fetch = driver.get(new_url)
        # time.sleep(1)
        page_source = driver.page_source
        page_soup = Soup(page_source, 'html.parser')
        for data in page_soup:
            try:
                product_name = data.find('span', {'class': 'B_NuCI'}).text.strip()
                price = data.find('div', {'class': "_30jeq3 _16Jk6d"}).text.strip()
                current_url = new_url
            except:
                print('Not Available')
            # print(product_name, '\n', price, '\n', current_url, '\n')
            d.write(product_name + price + current_url + '\n')
Error I got
While trying to save the output data in .xlsx format, the file saves properly, but while opening it an error pops up: the file format or file extension is not valid; verify that the file is not corrupted and that the file extension matches the format of the file.
Things I tried
When I write the output data to .csv it saves properly, but when I open the file the data has some special characters and is not split into separate cells.
Output of a single cell while writing data through the .csv method:
JBL a noise cancellation enabled Bluetooth~
Uploading an image for better understanding
Below I'm providing the URL of an image of the Excel output I got while fetching data from the above script and saving it to a .csv file.
Things I want
I want to save this data in .xlsx format with the following 3 headers: product_name, price, URL.
I want all the special characters to be removed so that I get clean output when writing the data in .xlsx format.
I see a few problems:
using open() and write() you can't create an .xlsx file, because .xlsx is really XML files compressed with zip
some data contains , which is normally the column separator, so those values have to be wrapped in " " to create the columns correctly. Better to use the csv module or pandas, which add the " " automatically (see the short example after this list). And this may be your main problem.
you mix Selenium with BeautifulSoup and sometimes this makes a mess
you use for data in page_soup, so you get all children on the page and run the same code for each of these elements, but you should get the values directly from page_soup
I would put all the data in a list - every item as a sublist - and later convert it to a pandas.DataFrame and save it with to_csv() or to_excel()
I would even use Selenium to find the elements (i.e. find_elements_by_xpath) instead of BeautifulSoup, but I skipped this idea in the code.
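A tiny illustration of the quoting point, using made-up values (the full example below then uses pandas, which handles this the same way):

import csv

with open('demo.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # the second and third fields contain commas; csv.writer wraps them in quotes
    # so each one still lands in a single column
    writer.writerow(['Sony WH-1000XM4', 'Black, Wireless', '24,990'])

# resulting line in demo.csv:
# Sony WH-1000XM4,"Black, Wireless","24,990"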
from selenium import webdriver
import time
from bs4 import BeautifulSoup as BS
import datetime as dt
import pandas as pd

# - before loop -

all_rows = []

#driver = webdriver.Firefox(executable_path='C:\\Downloads\\webdrivers\\geckodriver.exe')
driver = webdriver.Firefox()  # I have `geckodriver` in folder `/home/furas/bin` and I don't have to set `executable_path`

# - loop -

for page in range(1):  # range(10)
    print('--- page:', page, '---')

    url = f'https://www.flipkart.com/search?q=sony+headphones&as=on&as-show=on&otracker=AS_Query_HistoryAutoSuggest_1_4_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_4_na_na_na&as-pos=1&as-type=HISTORY&suggestionId=sony+headphones&requestId=ad797917-16ae-401e-98df-1c79a43d40c3&as-backfill=on&page={page}'
    driver.get(url)
    time.sleep(3)

    soup = BS(driver.page_source, 'html5lib')

    all_containers = soup.find_all('div', {'class': '_4ddWXP'})
    for container in all_containers:
        find_url = container.find('a')['href']
        print('find_url:', find_url)

        item_url = 'https://www.flipkart.com' + find_url
        driver.get(item_url)
        time.sleep(3)

        item_soup = BS(driver.page_source, 'html.parser')

        try:
            product_name = item_soup.find('span', {'class': 'B_NuCI'}).text.strip()
            price = item_soup.find('div', {'class': "_30jeq3 _16Jk6d"}).text.strip()

            print('product_name:', product_name)
            print('price:', price)
            print('item_url:', item_url)
            print('---')

            row = [product_name, price, item_url]
            all_rows.append(row)
        except Exception as ex:
            print('Not Available:', ex)
            print('---')

# - after loop -

df = pd.DataFrame(all_rows)

filename = dt.datetime.now().strftime("amazon_data_%d_%b_%y_%I_%M_%p.csv")
df.to_csv(filename)

#filename = dt.datetime.now().strftime("amazon_data_%d_%b_%y_%I_%M_%p.xlsx")
#df.to_excel(filename)
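To get the three headers asked for in the question, the DataFrame could also be given column names and written straight to .xlsx (to_excel() needs openpyxl or xlsxwriter installed); a small variation on the last lines above:

df = pd.DataFrame(all_rows, columns=['product_name', 'price', 'URL'])
filename = dt.datetime.now().strftime("amazon_data_%d_%b_%y_%I_%M_%p.xlsx")
df.to_excel(filename, index=False)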

Trouble with web scraping: how to insert NA when there are no results?

I have several URLs which link to hotel pages, and I would like to scrape some data from them.
I'm using the following script, but I would like to update it:
from random import randint
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver

data = []
for i in range(0, 10):
    url = final_list[i]
    driver2 = webdriver.Chrome()
    driver2.get(url)
    sleep(randint(10, 20))
    soup = BeautifulSoup(driver2.page_source, 'html.parser')
    my_table2 = soup.find_all(class_=['title-2', 'rating-score body-3'])
    review = soup.find_all(class_='reviews')[-1]
    try:
        price = soup.find_all('span', attrs={'class': 'price'})[-1]
    except:
        price = soup.find_all('span', attrs={'class': 'price'})
    for tag in my_table2:
        data.append(tag.text.strip())
    for p in price:
        data.append(p)
    for r in review:
        data.append(r)
But here's the problem: tag.text.strip() scrapes the rating numbers (like the ones on the hotel page linked below).
It strips each rating number into its own value, but hotels don't all have the same amount of ratings. The default number is 8; some hotels have seven ratings, others six, and so on. So in the end my dataframe is quite screwed up: if a hotel doesn't have 8 ratings, the values get shifted.
My question is: how do I tell the script "if there is a value for this tag.text.strip(i), put the value, but if there isn't, put None", and of course do that for all eight values?
I tried several things, like:
for tag in my_table2:
    for i in tag.text.strip()[i]:
        if i:
            data.append(i)
        else:
            data.append(None)
But unfortunately that goes nowhere, so if you could help me figure out the answer, it would be awesome :)
In case it helps, here is a link to a hotel that I'm scraping:
https://www.hostelworld.com/pwa/hosteldetails.php/Itaca-Hostel/Barcelona/1279?from=2020-11-21&to=2020-11-22&guests=1
The rating numbers are at the end.
Thank you.
A few suggestions:
Put your data in a dictionary. You don't have to assume that all tags are present and the order of the tags doesn't matter. You can get the labels and the corresponding ratings with
rating_labels = soup.find_all(class_=['rating-label body-3'])
rating_scores = soup.find_all(class_=['rating-score body-3'])
and then iterate over both lists with zip
Move your driver outside of the loop; opening it once is enough.
Don't use sleep; use Selenium's wait functions instead. You can wait for a particular element to be present or populated with WebDriverWait(driver, 10).until(EC.presence_of_element_located(your_element)) (see the short sketch after this list).
https://selenium-python.readthedocs.io/waits.html
Cache your scraped HTML code to a file. It's faster for you and more polite to the website you are scraping.
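A minimal sketch of the explicit-wait idea; the .rating-score selector is borrowed from the class names used further down, so treat the exact locator as an assumption:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
driver.get('https://www.hostelworld.com/pwa/hosteldetails.php/Itaca-Hostel/Barcelona/1279?from=2020-11-21&to=2020-11-22&guests=1')
# wait up to 10 seconds for the rating scores to be present instead of sleeping a fixed time
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '.rating-score')))
source = driver.page_source

The full example below keeps the random sleep for simplicity, but the same wait call can be dropped in where the comment suggests it.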
import selenium
import selenium.webdriver
import time
import random
import os
from bs4 import BeautifulSoup

data = []
final_list = [
    'https://www.hostelworld.com/pwa/hosteldetails.php/Itaca-Hostel/Barcelona/1279?from=2020-11-21&to=2020-11-22&guests=1',
    'https://www.hostelworld.com/pwa/hosteldetails.php/Be-Ramblas-Hostel/Barcelona/435?from=2020-11-27&to=2020-11-28&guests=1'
]

# load your driver only once to save time
driver = selenium.webdriver.Chrome()

for url in final_list:
    data.append({})

    # cache the HTML code to the filesystem
    # generate a filename from the URL where all non-alphanumeric characters (e.g. :/) are replaced with underscores _
    filename = ''.join([s if s.isalnum() else '_' for s in url])
    if not os.path.isfile(filename):
        driver.get(url)
        # better to use selenium's wait functions here
        time.sleep(random.randint(10, 20))
        source = driver.page_source
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(source)
    else:
        with open(filename, 'r', encoding='utf-8') as f:
            source = f.read()

    soup = BeautifulSoup(source, 'html.parser')

    review = soup.find_all(class_='reviews')[-1]
    try:
        price = soup.find_all('span', attrs={'class': 'price'})[-1]
    except:
        price = soup.find_all('span', attrs={'class': 'price'})

    data[-1]['name'] = soup.find_all(class_=['title-2'])[0].text.strip()

    rating_labels = soup.find_all(class_=['rating-label body-3'])
    rating_scores = soup.find_all(class_=['rating-score body-3'])
    assert len(rating_labels) == len(rating_scores)
    for label, score in zip(rating_labels, rating_scores):
        data[-1][label.text.strip()] = score.text.strip()

    data[-1]['price'] = price.text.strip()
    data[-1]['review'] = review.text.strip()
The data can then be easily put in a nicely formatted table using Pandas
import pandas as pd
df = pd.DataFrame(data)
df
If some data is missing/incomplete, Pandas will replace it with 'NaN'
data.append(data[0].copy())
del(data[-1]['Staff'])
data[-1]['name'] = 'Incomplete Hostel'
pd.DataFrame(data)

Extracting Data from a Table in HTML using Selenium and Python

I have this assignment of extracting some items from each row of a table in HTML. I have figured out how to grab the whole table from the web using Selenium with Python. Following is the code for that:
from selenium import webdriver
import time
import pandas as pd

mydriver = webdriver.Chrome('C:/Program Files/chromedriver.exe')
mydriver.get("https://www.bseindia.com/corporates/ann.aspx?expandable=0")
time.sleep(5)  # wait 5 seconds until the DOM loads completely
table = mydriver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_lblann"]/table/tbody')
for row in table.find_elements_by_xpath('./tr'):
    print(row.text)
I am unable to understand the way I can grab specific items from the table itself. Following are the items that I require:
Company Name
PDF Link(if it does not exist, write "No PDF Link")
Received Time
Disseminated Time
Time Taken
Description
Any help with the logic would be appreciated.
Thanks in Advance.
for tr in mydriver.find_elements_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_lblann"]/table//tr'):
    tds = tr.find_elements_by_tag_name('td')
    print([td.text for td in tds])
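Building on that loop, the rows could also be collected and written out directly; a sketch that assumes mydriver from the question and skips rows without <td> cells:

import csv

rows = []
for tr in mydriver.find_elements_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_lblann"]/table//tr'):
    tds = tr.find_elements_by_tag_name('td')
    if tds:  # header rows use <th> and would produce an empty list here
        rows.append([td.text for td in tds])

with open('announcements.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows(rows)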
I went through a rough time getting this to work. I think it works just fine now. It's pretty inefficient, though. Following is the code:
from selenium import webdriver
import time
import pandas as pd
from selenium.common.exceptions import NoSuchElementException

mydriver = webdriver.Chrome('C:/Program Files/chromedriver.exe')
mydriver.get("https://www.bseindia.com/corporates/ann.aspx?expandable=0")
time.sleep(5)  # wait 5 seconds until the DOM loads completely

trs = mydriver.find_elements_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_lblann"]/table/tbody/tr')
del trs[0]

names = []
r_time = []
d_time = []
t_taken = []
desc = []
pdfs = []
codes = []

i = 0
while i < len(trs):
    names.append(trs[i].text)
    l = trs[i].text.split()
    for item in l:
        try:
            code = int(item)
            if code > 100000:
                codes.append(code)
        except:
            pass
    link = trs[i].find_elements_by_tag_name('td')
    pdf_count = 2
    while pdf_count < len(link):
        try:
            pdf = link[pdf_count].find_element_by_tag_name('a')
            pdfs.append(pdf.get_attribute('href'))
        except NoSuchElementException:
            pdfs.append("No PDF")
        pdf_count = pdf_count + 4
    time = trs[i + 1].text.split()
    if len(time) == 5:
        r_time.append("No Time Given")
        d_time.append(time[3] + " " + time[4])
        t_taken.append("No Time Given")
    else:
        r_time.append(time[3] + " " + time[4])
        d_time.append(time[8] + " " + time[9])
        t_taken.append(time[12])
    desc.append(trs[i+2].text)
    i = i + 4

df = pd.DataFrame.from_dict({'Name': names, 'Description': desc, 'PDF Link': pdfs, 'Company Code': codes, 'Received Time': r_time, 'Disseminated Time': d_time, 'Time Taken': t_taken})
df.to_excel('corporate.xlsx', header=True, index=False)  # write the data to an Excel sheet
Also, I have added another aspect that was asked for: I get the company code in another column as well. That's the result I get.

Beautifulsoup 4 Filtering Python 3 Issue

Well, I have been looking at this for 6 hours and can't figure it out. I want to use BeautifulSoup to filter data from a webpage, but I can't get .contents or get_text() to work, and I have no clue where I am going wrong or how to do another filter on the first pass. I can get to the "fieldset" tag but can't narrow down to the inner tags to get the data. Sorry if this is a simple issue I am doing wrong; I only started Python yesterday and started (trying at least) web scraping this morning.
Entire Code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from openpyxl import Workbook
import bs4 as bs
import math

book = Workbook()
sheet = book.active

i = 0
#Change this value to your starting tracking number
StartingTrackingNumber = 231029883
#Change this value to increase or decrease the number of tracking numbers you want to search overall
TrackingNumberCount = 4
#Number of tracking numbers searched at one time
QtySearch = 4

#TrackingNumbers=["Test","Test 2"]
for i in range(0, TrackingNumberCount):
    g = i + StartingTrackingNumber
    sheet.cell(row=i+1, column=1).value = 'RN' + str(g) + 'CA,'

TrackingNumbers = []
for col in sheet['A']:
    TrackingNumbers.append(col.value)

MaxRow = sheet.max_row
MaxIterations = math.ceil(MaxRow / QtySearch)
#print(MaxIterations)
RowCount = 0
LastTrackingThisPass = QtySearch

for RowCount in range(0, MaxIterations):  #range(1,MaxRow):
    FirstTrackingThisPass = (RowCount) * QtySearch
    x = TrackingNumbers[FirstTrackingThisPass:LastTrackingThisPass]
    LastTrackingThisPass += QtySearch
    driver = webdriver.Safari()
    driver.set_page_load_timeout(20)
    driver.get("https://www.canadapost.ca/cpotools/apps/track/personal/findByTrackNumber?execution=e1s1")
    driver.find_element_by_xpath('//*[contains(@id, "trackNumbers")]').send_keys(x)
    driver.find_element_by_xpath('//*[contains(@id, "submit_button")]').send_keys(chr(13))
    driver.set_page_load_timeout(3000)
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.ID, "noResults_modal")))
    SourceCodeTest = driver.page_source
    #print(SourceCodeTest)
    Soup = bs.BeautifulSoup(SourceCodeTest, "lxml")  # or "html.parser"
    z = 3
    #for z in range(1, 5):
    #    t = str(z)
    #    NameCheck = "trackingNumber" + t
    #    #FindTrackingNumbers = Soup.find_all("div", {"id": "trackingNumber3"})
    #    FindTrackingNumbers = Soup.find_all("div", {"id": NameCheck})
    #    print(FindTrackingNumbers)
    Info = Soup.find_all("fieldset", {"class": "trackhistoryitem"}, "strong")
    print(Info.get_text())
Desired Output:
RN231029885CA N/A
RN231029884CA N/A
RN231029883CA 2017/04/04
Sample of the HTML trying to be parsed:
<fieldset class="trackhistoryitem">
<p><strong>Tracking No. </strong><br><input type="hidden" name="ID_RN231029885CA" value="false">RN231029885CA
</p>
<p><strong>Date / Time </strong><br>
<!--h:outputText value="N/A" rendered="true"/>
<h:outputText value="N/A - N/A" rendered="false"/>
<h:outputText value="N/A" rendered="false"/-->N/A
</p>
<p><strong>Description </strong><br><span id="tapListResultForm:tapResultsItems:1:trk_rl_div_1">
Using .get_text() I got back this long ugly string:
'\nTracking No. RN231029885CA\n \nDate / Time \nN/A\n \nDescription '
So with some of Python's string functions:
objects = []
for each in soup.find_all("fieldset"):
    each = each.get_text().split("\n")  # split the ugly string up
    each = [each[1][-13:], each[4]]     # grab the parts you want, remove extra words
    objects.append(each)
Note: this assumes all tracking numbers are 13 characters long; if not, you'll need to use a regex or some other creative method to extract them.
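For reference, a regex can pull the tracking number out regardless of its position in the string; a sketch assuming the format in the sample HTML above (two letters, nine digits, two letters):

import re

text = '\nTracking No. RN231029885CA\n \nDate / Time \nN/A\n \nDescription '
match = re.search(r'[A-Z]{2}\d{9}[A-Z]{2}', text)
tracking_number = match.group(0) if match else None
print(tracking_number)  # RN231029885CA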

Python BeautifulSoup scrape Yahoo Finance value

I am attempting to scrape the 'Full Time Employees' value of 110,000 from the Yahoo finance website.
The URL is: http://finance.yahoo.com/quote/AAPL/profile?p=AAPL
I have tried using BeautifulSoup, but I can't find the value on the page. When I look in the DOM explorer in IE, I can see it. It sits in a tag nested inside several parent tags. The actual value is in an element with a custom data-react-id attribute.
Code I have tried:
from bs4 import BeautifulSoup as bs
import requests

html = 'http://finance.yahoo.com/quote/AAPL/profile?p=AAPL'
r = requests.get(html).content
soup = bs(r)
Not sure where to go.
The problem is in the "requests"-related part: the page you download with requests is not the same as what you see in the browser. The browser executes all of the JavaScript and makes the multiple asynchronous requests needed to load this page. And this particular page is quite dynamic itself; there is a lot happening on the "client side".
What you can do is load this page in a real browser automated by Selenium. Working example:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
driver = webdriver.Chrome()
driver.maximize_window()
driver.get("http://finance.yahoo.com/quote/AAPL/profile?p=AAPL")
# wait for the Full Time Employees to be visible
wait = WebDriverWait(driver, 10)
employees = wait.until(EC.visibility_of_element_located((By.XPATH, "//span[. = 'Full Time Employees']/following-sibling::strong")))
print(employees.text)
driver.close()
Prints 110,000.
There are so many ways to download financial data, or any kind of data, from the web. The script below downloads stock prices and saves everything to a CSV file.
import urllib2

listOfStocks = ["AAPL", "MSFT", "GOOG", "FB", "AMZN"]

urls = []
for company in listOfStocks:
    urls.append('http://real-chart.finance.yahoo.com/table.csv?s=' + company + '&d=6&e=28&f=2015&g=m&a=11&b=12&c=1980&ignore=.csv')

Output_File = open('C:/Users/your_path/Historical_Prices.csv', 'w')

New_Format_Data = ''
for counter in range(0, len(urls)):
    Original_Data = urllib2.urlopen(urls[counter]).read()
    if counter == 0:
        New_Format_Data = "Company," + urllib2.urlopen(urls[counter]).readline()
    rows = Original_Data.splitlines(1)
    for row in range(1, len(rows)):
        New_Format_Data = New_Format_Data + listOfStocks[counter] + ',' + rows[row]

Output_File.write(New_Format_Data)
Output_File.close()
The script below will download multiple stock tickers into one folder.
import urllib
import re
import json

symbolslist = open("C:/Users/rshuell001/Desktop/symbols/tickers.txt").read()
symbolslist = symbolslist.split("\n")

for symbol in symbolslist:
    myfile = open("C:/Users/your_path/Desktop/symbols/" + symbol + ".txt", "w+")
    myfile.close()
    htmltext = urllib.urlopen("http://www.bloomberg.com/markets/chart/data/1D/" + symbol + ":US")
    data = json.load(htmltext)
    datapoints = data["data_values"]
    myfile = open("C:/Users/rshuell001/Desktop/symbols/" + symbol + ".txt", "a")
    for point in datapoints:
        myfile.write(str(symbol + "," + str(point[0]) + "," + str(point[1]) + "\n"))
    myfile.close()
Finally...this will download prices for multiple stock tickers...
import urllib
import re

symbolfile = open("C:/Users/your_path/Desktop/symbols/amex.txt")
symbollist = symbolfile.read()
newsymbolslist = symbollist.split("\n")

i = 0
while i < len(newsymbolslist):
    url = "http://finance.yahoo.com/q?s=" + newsymbolslist[i] + "&ql=1"
    htmlfile = urllib.urlopen(url)
    htmltext = htmlfile.read()
    regex = '<span id="yfs_l84_' + newsymbolslist[i] + '">(.+?)</span>'
    pattern = re.compile(regex)
    price = re.findall(pattern, htmltext)
    print "the price of ", newsymbolslist[i], "is", price[0]
    i += 1

# Make sure you place the 'amex.txt' file in 'C:\Python27\'
I wrote a book about these kinds of things, and lots of other stuff. You can find it using the URL below.
https://www.amazon.com/Automating-Business-Processes-Reducing-Increasing-ebook/dp/B01DJJKVZC/ref=sr_1_1?
