Well, I have been looking at this for 6 hours and can't figure it out. I want to use BeautifulSoup to filter data from a webpage, but I can't get .contents or get_text() to work, and I have no clue where I am going wrong or how to do another filter on the first pass. I can get to the "fieldset" tag but can't narrow down to the tags inside it to get the data. Sorry if this is a simple thing I am getting wrong; I only started Python yesterday and started (trying, at least) web scraping this morning.
Entire Code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from openpyxl import Workbook
import bs4 as bs
import math
book = Workbook()
sheet = book.active
i=0
#Change this value to your starting tracking number
StartingTrackingNumber=231029883
#Change this value to increase or decrease the number of tracking numbers you want to search overal
TrackingNumberCount = 4
#Number of Tacking Numbers Searched at One Time
QtySearch = 4
#TrackingNumbers=["Test","Test 2"]
for i in range(0, TrackingNumberCount):
    g = i + StartingTrackingNumber
    sheet.cell(row=i+1, column=1).value = 'RN' + str(g) + 'CA,'
TrackingNumbers = []
for col in sheet['A']:
    TrackingNumbers.append(col.value)
MaxRow = sheet.max_row
MaxIterations = math.ceil(MaxRow / QtySearch)
#print(MaxIterations)
RowCount = 0
LastTrackingThisPass = QtySearch
for RowCount in range(0, MaxIterations):  # range(1,MaxRow):
    FirstTrackingThisPass = RowCount * QtySearch
    x = TrackingNumbers[FirstTrackingThisPass:LastTrackingThisPass]
    LastTrackingThisPass += QtySearch
    driver = webdriver.Safari()
    driver.set_page_load_timeout(20)
    driver.get("https://www.canadapost.ca/cpotools/apps/track/personal/findByTrackNumber?execution=e1s1")
    driver.find_element_by_xpath('//*[contains(@id, "trackNumbers")]').send_keys(x)
    driver.find_element_by_xpath('//*[contains(@id, "submit_button")]').send_keys(chr(13))
    driver.set_page_load_timeout(3000)
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.ID, "noResults_modal")))
    SourceCodeTest = driver.page_source
    #print(SourceCodeTest)
    Soup = bs.BeautifulSoup(SourceCodeTest, "lxml")  # or "html.parser"
    z = 3
    #for z in range(1, 5):
    #    t = str(z)
    #    NameCheck = "trackingNumber" + t
    #    #FindTrackingNumbers = Soup.find_all("div", {"id": "trackingNumber3"})
    #    FindTrackingNumbers = Soup.find_all("div", {"id": NameCheck})
    #    print(FindTrackingNumbers)
    Info = Soup.find_all("fieldset", {"class": "trackhistoryitem"}, "strong")
    print(Info.get_text())
Desired Output:
RN231029885CA N/A
RN231029884CA N/A
RN231029883CA 2017/04/04
Sample of the HTML trying to be parsed:
<fieldset class="trackhistoryitem">
<p><strong>Tracking No. </strong><br><input type="hidden" name="ID_RN231029885CA" value="false">RN231029885CA
</p>
<p><strong>Date / Time </strong><br>
<!--h:outputText value="N/A" rendered="true"/>
<h:outputText value="N/A - N/A" rendered="false"/>
<h:outputText value="N/A" rendered="false"/-->N/A
</p>
<p><strong>Description </strong><br><span id="tapListResultForm:tapResultsItems:1:trk_rl_div_1">
Using .get_text() I got back this long ugly string:
'\nTracking No. RN231029885CA\n \nDate / Time \nN/A\n \nDescription '
So with some of Python's string functions:
objects = []
for each in soup.find_all("fieldset"):
    each = each.get_text().split("\n")  # split the ugly string up
    each = [each[1][-13:], each[4]]  # grab the parts you want, rmv extra words
    objects.append(each)
Note: This assumes all tracking numbers are 13 characters long; if not, you'll need to use regex or some other creative method to extract them.
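For example, a minimal regex sketch (assuming the numbers follow the RN + 9 digits + CA pattern shown in the sample output above; adjust the pattern if yours differ):
import re

text = '\nTracking No. RN231029885CA\n \nDate / Time \nN/A\n \nDescription '
# the pattern below is an assumption based on the sample data, not a general Canada Post format
match = re.search(r'RN\d{9}CA', text)
print(match.group(0) if match else None)  # RN231029885CA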
Related
I have this scraper I am trying to export as a CSV file in Google Colab. I received the scraped information as string values, but I cannot convert them to a CSV. I want each scraped attribute ("title", "size", etc.) to populate a column in a CSV file. I have run the strings through BeautifulSoup to remove the HTML formatting. Please see my code below.
import pandas as pd
import time
import io
from io import StringIO
import csv
#from google.colab import drive
#drive.mount('drive')
#Use new Library (kora.selenium) to run chromedriver
from kora.selenium import wd
#Import BeautifulSoup to parse HTML formatting
from bs4 import BeautifulSoup
wd.get("https://www.grailed.com/sold/EP8S3v8V_w") #Get webpage
ScrollNumber=round(200/40)+1
for i in range(0,ScrollNumber):
    wd.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(2)
#--------------#
#Each new attribute will have to be found using XPATH because Grailed's website is written in Javascript (js.react), not HTML
#Only 39 results will show because the JS page is infinite scroll and selenium must be told to keep scrolling.
follow_loop = range(2, 200)
for x in follow_loop:
    #Title
    title = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    title += str(x)
    title += "]/a/div[3]/div[2]/p"
    title = wd.find_elements_by_xpath(title)
    title = str(title)
    #Price
    price = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    price += str(x)
    price += "]/div/div/p/span"
    price = wd.find_elements_by_xpath(price)
    price = str(price)
    #Size
    size = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    size += str(x)
    size += "]/a/div[3]/div[1]/p[2]"
    size = wd.find_elements_by_xpath(size)
    size = str(size)
    #Sold
    sold = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    sold += str(x)
    sold += "]/a/p/span"
    sold = wd.find_elements_by_xpath(sold)
    sold = str(sold)
    #Clean HTML formatting using Beautiful soup
    cleantitle = BeautifulSoup(title, "lxml").text
    cleanprice = BeautifulSoup(price, "lxml").text
    cleansize = BeautifulSoup(size, "lxml").text
    cleansold = BeautifulSoup(sold, "lxml").text
This was a lot of work lol
from selenium import webdriver
import time
import csv
driver = webdriver.Chrome()
driver.get("https://www.grailed.com/sold/EP8S3v8V_w")
scroll_count = round(200 / 40) + 1
for i in range(scroll_count):
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(2)
time.sleep(3)
titles = driver.find_elements_by_css_selector("p.listing-designer")
prices = driver.find_elements_by_css_selector("p.sub-title.sold-price")
sizes = driver.find_elements_by_css_selector("p.listing-size.sub-title")
sold = driver.find_elements_by_css_selector("div.-overlay")
data = [titles, prices, sizes, sold]
data = [list(map(lambda element: element.text, arr)) for arr in data]
with open('sold_shoes.csv', 'w') as file:
    writer = csv.writer(file)
    j = 0
    while j < len(titles):
        row = []
        for i in range(len(data)):
            row.append(data[i][j])
        writer.writerow(row)
        j += 1
I'm not sure why it puts a blank line between every row in the file, but I assume it's not a problem. Also, it's a naïve solution in that it assumes every list is the same length; consider using one list and making new lists from the child elements of the parent. Also, I just used Selenium without BeautifulSoup because it's easier for me, but you should learn BS too because it's faster for scraping than Selenium. Happy coding.
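About the blank line between rows: this is most likely the well-known csv-on-Windows quirk; the csv module docs recommend opening the file with newline='', which should make it go away. A minimal sketch:
import csv

# newline='' stops the csv writer from producing an extra blank line between rows on Windows
with open('sold_shoes.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['title', 'price', 'size', 'sold'])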
I have several URLs which link to hotel pages and I would like to scrape some data from them.
I'm using the following script, but I would like to update it:
data=[]
for i in range(0,10):
    url = final_list[i]
    driver2 = webdriver.Chrome()
    driver2.get(url)
    sleep(randint(10,20))
    soup = BeautifulSoup(driver2.page_source, 'html.parser')
    my_table2 = soup.find_all(class_=['title-2', 'rating-score body-3'])
    review = soup.find_all(class_='reviews')[-1]
    try:
        price = soup.find_all('span', attrs={'class':'price'})[-1]
    except:
        price = soup.find_all('span', attrs={'class':'price'})
    for tag in my_table2:
        data.append(tag.text.strip())
    for p in price:
        data.append(p)
    for r in review:
        data.append(r)
But here's the problem: tag.text.strip() scrapes each rating number as a separate value.
It splits each rating into its own value, but not every hotel has the same amount of ratings. The default number is 8, yet some hotels have seven ratings, others six, and so on. So in the end my dataframe is quite screwed: if a hotel doesn't have 8 ratings, the values get shifted.
My question is: how do I tell the script "if there is a value for this tag.text.strip(i), put the value, but if there isn't, put None", and of course do that for all eight values?
I tried several things like :
for tag in my_table2:
    for i in tag.text.strip()[i]:
        if i:
            data.append(i)
        else:
            data.append(None)
But unfortunately that goes nowhere, so if you could help me figure out the answer, it would be awesome :)
In case it helps, here is a link to a hotel I'm scraping:
https://www.hostelworld.com/pwa/hosteldetails.php/Itaca-Hostel/Barcelona/1279?from=2020-11-21&to=2020-11-22&guests=1
The rating numbers are at the end of the page.
Thank you.
A few suggestions:
Put your data in a dictionary. You don't have to assume that all tags are present and the order of the tags doesn't matter. You can get the labels and the corresponding ratings with
rating_labels = soup.find_all(class_=['rating-label body-3'])
rating_scores = soup.find_all(class_=['rating-score body-3'])
and then iterate over both lists with zip
Move your driver outside of the loop; opening it once is enough.
Don't use sleep; use Selenium's wait functions instead (a minimal sketch follows below). You can wait for a particular element to be present or populated with WebDriverWait(driver, 10).until(EC.presence_of_element_located(your_element))
https://selenium-python.readthedocs.io/waits.html
Cache your scraped HTML code to a file. It's faster for you and politer to the website you are scraping
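For the wait suggestion above, a minimal sketch (the class name is simply the one used in the full script below; wait on whichever element you actually need):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# `driver` is the webdriver instance created in the full script below;
# wait up to 10 seconds until at least one rating score element is present
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.rating-score.body-3'))
)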
import selenium
import selenium.webdriver
import time
import random
import os
from bs4 import BeautifulSoup
data = []
final_list = [
'https://www.hostelworld.com/pwa/hosteldetails.php/Itaca-Hostel/Barcelona/1279?from=2020-11-21&to=2020-11-22&guests=1',
'https://www.hostelworld.com/pwa/hosteldetails.php/Be-Ramblas-Hostel/Barcelona/435?from=2020-11-27&to=2020-11-28&guests=1'
]
# load your driver only once to save time
driver = selenium.webdriver.Chrome()
for url in final_list:
    data.append({})
    # cache the HTML code to the filesystem
    # generate a filename from the URL where all non-alphanumeric characters (e.g. :/) are replaced with underscores _
    filename = ''.join([s if s.isalnum() else '_' for s in url])
    if not os.path.isfile(filename):
        driver.get(url)
        # better use selenium's wait functions here
        time.sleep(random.randint(10, 20))
        source = driver.page_source
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(source)
    else:
        with open(filename, 'r', encoding='utf-8') as f:
            source = f.read()
    soup = BeautifulSoup(source, 'html.parser')
    review = soup.find_all(class_='reviews')[-1]
    try:
        price = soup.find_all('span', attrs={'class':'price'})[-1]
    except:
        price = soup.find_all('span', attrs={'class':'price'})
    data[-1]['name'] = soup.find_all(class_=['title-2'])[0].text.strip()
    rating_labels = soup.find_all(class_=['rating-label body-3'])
    rating_scores = soup.find_all(class_=['rating-score body-3'])
    assert len(rating_labels) == len(rating_scores)
    for label, score in zip(rating_labels, rating_scores):
        data[-1][label.text.strip()] = score.text.strip()
    data[-1]['price'] = price.text.strip()
    data[-1]['review'] = review.text.strip()
The data can then be easily put in a nicely formatted table using Pandas
import pandas as pd
df = pd.DataFrame(data)
df
If some data is missing or incomplete, Pandas will fill it in with NaN:
data.append(data[0].copy())
del(data[-1]['Staff'])
data[-1]['name'] = 'Incomplete Hostel'
pd.DataFrame(data)
I can't figure out how to get the TEXT and NUMBERS from a tag like <td>THERE IS TEXT I WANT TO GET</td>, and there is also a quantity in <td>QUANTITY</td>.
link:https://bscscan.com/tokenholdings?a=0x00a2c3d755c21bc837a3ca9a32279275eae9e3d6
There is an image of what I want to get.
Thanks in advance.
The table on the website is loaded dynamically, so you can't scrape it using requests; you have to use Selenium. Here is the full code:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
url = 'https://bscscan.com/tokenholdings?a=0x00a2c3d755c21bc837a3ca9a32279275eae9e3d6'
driver = webdriver.Chrome()
driver.get(url)
time.sleep(5)
html = driver.page_source
driver.close()
soup = BeautifulSoup(html,'html5lib')
tbody = soup.find('tbody', id = "tb1")
tr_tags = tbody.find_all('tr')
symbols = []
quantities = []
for tr in tr_tags:
    td_tags = tr.find_all('td')
    symbols.append(td_tags[2].text)
    quantities.append(td_tags[3].text)
df = pd.DataFrame((symbols,quantities))
df = df.T
df.columns = ['Symbol','Quantity']
print(df)
Output:
Symbol Quantity
0 BNB 17.98420742
1 Cake 19.76899295
2 ANY 1
3 FREE 1,502
4 LFI 326.87340092
5 LFI 326.87340092
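As a side note: once Selenium has rendered the page, the holdings are a plain HTML table, so pandas can also parse them straight from the page source. A hedged sketch (assuming the holdings table is the first <table> pandas finds; otherwise pick the right one from the returned list):
import pandas as pd

# read_html returns one DataFrame per <table> found in the HTML string
tables = pd.read_html(html)   # `html` is driver.page_source from the script above
print(tables[0])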
I recommend a really useful tool, the re module: you can search for the specific string between two substrings, e.g.
import re
s = "<td>THERE IS TEXT I WANT TO GET</td>"
result = re.search('<td>(.*)</td>', s)
print(result.group(1))
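And if the page contains several <td> cells, re.findall with a non-greedy group captures each of them separately, e.g.
import re

s = "<td>THERE IS TEXT I WANT TO GET</td>\n<td>1,502</td>"
# (.*?) is non-greedy, so each match stops at the first closing </td>
values = re.findall('<td>(.*?)</td>', s)
print(values)  # ['THERE IS TEXT I WANT TO GET', '1,502']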
>>> from bs4 import BeautifulSoup
>>> html = "<td>THERE IS TEXT I WANT TO GET</td>\n<td>THERE IS TEXT I WANT TO GET</td>\n<td>THERE IS TEXT I WANT TO GET</td>\n<td>THERE IS TEXT I WANT TO GET</td>"
>>> soup = BeautifulSoup(html, "html.parser")
>>> for td in soup.find_all('td'): print(td.text)
In the code sample below, 3 of the 5 elements I am attempting to scrape return values as expected. Two of them (goals_scored and assists) return no values. I have verified that the data does exist on the web page and that I am using the correct attribute, but I'm not sure why results are not returning. Is there something obvious I am overlooking?
import sys
from bs4 import BeautifulSoup as bs
import urllib2
import datetime as dt
import time
import pandas as pd
proxy_support = urllib2.ProxyHandler({})
opener = urllib2.build_opener(proxy_support)
player_name=[]
club =[]
position = []
goals_scored = []
assists = []
for p in range(25):
    player_url = 'http://www.mlssoccer.com/stats/season?page={p}&franchise=select&year=2017&season_type=REG&group=goals'.format(
        p=p)
    page = opener.open(player_url).read()
    player_soup = bs(page,"lxml")
    print >>sys.stderr, '[{time}] Running page {n}...'.format(
        time=dt.datetime.now(), n=p)
    length = len(player_soup.find('tbody').findAll('tr'))
    for row in range(0, length):
        try:
            name = player_soup.find('tbody').findAll('td', attrs={'data-title': 'Player'})[row].find('a').contents[0]
            player_name.append(name)
            team = player_soup.find('tbody').findAll('td', attrs={'data-title': 'Club'})[row].contents[0]
            club.append(team)
            pos = player_soup.find('tbody').findAll('td', attrs={'data-title': 'POS'})[row].contents[0]
            position.append(pos)
            goals = player_soup.find('tbody').findAll('td', attrs={'data-title': 'G' ,'class': 'responsive'})[row].contents[0]
            goals_scored.apppend(goals)
            a = player_soup.find('tbody').findAll('td', attrs={'data-title': 'A'})[row].contents[0]
            assists.append(a)
        except:
            pass
player_data = {'player_name':player_name,
'club':club,
'position' : position,
'goals_scored' : goals_scored,
'assists' : assists,
}
df = pd.DataFrame.from_dict(player_data,orient='index')
df
The only thing I can figure out is that there is a slight difference in the HTML for the variables not returning data. Do I need to include class='responsive' in my code? If so, any examples of how that might look?
Position HTML: <td data-title="POS">F</td>
Goals HTML: <td data-title="G" class="responsive">11</td>
Any insight is appreciated
You can try it like this to get your desired data. I've only parsed the portion you needed; the rest you can do for the dataframe. FYI, there are two classes attached to the different row tags, odd and even. Don't forget to consider that as well.
from bs4 import BeautifulSoup
import requests
page_url = "https://www.mlssoccer.com/stats/season?page={0}&franchise=select&year=2017&season_type=REG&group=goals"
for url in [page_url.format(p) for p in range(5)]:
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    table = soup.select("table")[0]
    for items in table.select(".odd,.even"):
        player = items.select("td[data-title='Player']")[0].text
        club = items.select("td[data-title='Club']")[0].text
        position = items.select("td[data-title='POS']")[0].text
        goals = items.select("td[data-title='G']")[0].text
        assist = items.select("td[data-title='A']")[0].text
        print(player,club,position,goals,assist)
Partial result looks like:
Nemanja Nikolic CHI F 24 4
Diego Valeri POR M 21 11
Ola Kamara CLB F 18 3
As I've included both classes in my script, you will get all the data from that site.
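If you still want the DataFrame your original script was building, one possible sketch (reusing the same selectors and the page_url, requests, and BeautifulSoup names from the script above) is to collect each row as a dict and build the frame at the end:
import pandas as pd

records = []
for url in [page_url.format(p) for p in range(5)]:
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    # each .odd/.even row becomes one record with the five columns you wanted
    for items in soup.select("table")[0].select(".odd,.even"):
        records.append({
            'player_name': items.select("td[data-title='Player']")[0].text,
            'club': items.select("td[data-title='Club']")[0].text,
            'position': items.select("td[data-title='POS']")[0].text,
            'goals_scored': items.select("td[data-title='G']")[0].text,
            'assists': items.select("td[data-title='A']")[0].text,
        })

df = pd.DataFrame(records)
print(df.head())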
I am attempting to scrape the 'Full Time Employees' value of 110,000 from the Yahoo finance website.
The URL is: http://finance.yahoo.com/quote/AAPL/profile?p=AAPL
I have tried using BeautifulSoup, but I can't find the value on the page. When I look in the DOM explorer in IE, I can see it. It has a tag with a parent tag, which has a parent, which has a parent. The actual value is in an element with a custom data-react-id attribute.
code I have tried:
import requests
from bs4 import BeautifulSoup as bs

html = 'http://finance.yahoo.com/quote/AAPL/profile?p=AAPL'
r = requests.get(html).content
soup = bs(r)
Not sure where to go.
The problem is in the "requests"-related part: the page you download with requests is not the same as what you see in the browser. The browser executes all of the JavaScript and makes the multiple asynchronous requests needed to load the page, and this particular page is quite dynamic itself; there is a lot happening on the client side.
What you can do is to load this page in a real browser automated by selenium. Working example:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
driver = webdriver.Chrome()
driver.maximize_window()
driver.get("http://finance.yahoo.com/quote/AAPL/profile?p=AAPL")
# wait for the Full Time Employees to be visible
wait = WebDriverWait(driver, 10)
employees = wait.until(EC.visibility_of_element_located((By.XPATH, "//span[. = 'Full Time Employees']/following-sibling::strong")))
print(employees.text)
driver.close()
Prints 110,000.
There are so many ways to download financial data, or any kind of data, from the web. The script below downloads stock prices and saves everything to a CSV file.
import urllib2
listOfStocks = ["AAPL", "MSFT", "GOOG", "FB", "AMZN"]
urls = []
for company in listOfStocks:
    urls.append('http://real-chart.finance.yahoo.com/table.csv?s=' + company + '&d=6&e=28&f=2015&g=m&a=11&b=12&c=1980&ignore=.csv')
Output_File = open('C:/Users/your_path/Historical_Prices.csv','w')
New_Format_Data = ''
for counter in range(0, len(urls)):
    Original_Data = urllib2.urlopen(urls[counter]).read()
    if counter == 0:
        New_Format_Data = "Company," + urllib2.urlopen(urls[counter]).readline()
    rows = Original_Data.splitlines(1)
    for row in range(1, len(rows)):
        New_Format_Data = New_Format_Data + listOfStocks[counter] + ',' + rows[row]
Output_File.write(New_Format_Data)
Output_File.close()
The script below will download multiple stock tickers into one folder.
import urllib
import re
import json
symbolslist = open("C:/Users/rshuell001/Desktop/symbols/tickers.txt").read()
symbolslist = symbolslist.split("\n")
for symbol in symbolslist:
    myfile = open("C:/Users/your_path/Desktop/symbols/" + symbol + ".txt", "w+")
    myfile.close()
    htmltext = urllib.urlopen("http://www.bloomberg.com/markets/chart/data/1D/" + symbol + ":US")
    data = json.load(htmltext)
    datapoints = data["data_values"]
    myfile = open("C:/Users/rshuell001/Desktop/symbols/" + symbol + ".txt", "a")
    for point in datapoints:
        myfile.write(str(symbol + "," + str(point[0]) + "," + str(point[1]) + "\n"))
    myfile.close()
Finally...this will download prices for multiple stock tickers...
import urllib
import re
symbolfile = open("C:/Users/your_path/Desktop/symbols/amex.txt")
symbollist = symbolfile.read()
newsymbolslist = symbollist.split("\n")
i=0
while i < len(newsymbolslist):
    url = "http://finance.yahoo.com/q?s=" + newsymbolslist[i] + "&ql=1"
    htmlfile = urllib.urlopen(url)
    htmltext = htmlfile.read()
    regex = '<span id="yfs_l84_' + newsymbolslist[i] + '">(.+?)</span>'
    pattern = re.compile(regex)
    price = re.findall(pattern, htmltext)
    print "the price of ", newsymbolslist[i], "is", price[0]
    i += 1
# Make sure you place the 'amex.txt' file in 'C:\Python27\'
I wrote a book about these kinds of things, and lots of other stuff. You can find it using the URL below.
https://www.amazon.com/Automating-Business-Processes-Reducing-Increasing-ebook/dp/B01DJJKVZC/ref=sr_1_1?