I am attempting to scrape the 'Full Time Employees' value of 110,000 from the Yahoo finance website.
The URL is: http://finance.yahoo.com/quote/AAPL/profile?p=AAPL
I have tried using BeautifulSoup, but I can't find the value on the page. When I look in the DOM explorer in IE, I can see it: the value sits inside a tag that is nested several parents deep, on an element with a custom data-react-id attribute.
Code I have tried:
import requests
from bs4 import BeautifulSoup as bs

html = "http://finance.yahoo.com/quote/AAPL/profile?p=AAPL"
r = requests.get(html).content
soup = bs(r, "html.parser")
Not sure where to go.
The problem is in the requests-related part: the page you download with requests is not the same as the one you see in the browser. The browser executes all of the JavaScript and makes the additional asynchronous requests needed to build the page, and this particular page is quite dynamic; a lot happens on the client side.
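A quick way to confirm this for yourself is to fetch the page with plain requests and check whether the label you can see in the browser's DOM explorer is actually present in the raw response. A minimal check, nothing more:

import requests

url = "http://finance.yahoo.com/quote/AAPL/profile?p=AAPL"
raw_html = requests.get(url).text

# If this prints False, the label is rendered client-side by JavaScript,
# so requests + BeautifulSoup alone will never find it in the HTML.
print("Full Time Employees" in raw_html)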
What you can do is load this page in a real browser automated by Selenium. Working example:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
driver = webdriver.Chrome()
driver.maximize_window()
driver.get("http://finance.yahoo.com/quote/AAPL/profile?p=AAPL")
# wait for the Full Time Employees to be visible
wait = WebDriverWait(driver, 10)
employees = wait.until(EC.visibility_of_element_located((By.XPATH, "//span[. = 'Full Time Employees']/following-sibling::strong")))
print(employees.text)
driver.close()
Prints 110,000.
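If you would rather avoid a browser entirely, the number is often also embedded in a JSON blob inside one of the page's script tags. A hedged sketch, assuming the raw HTML contains a fullTimeEmployees field (the field name is taken from Yahoo's quote-summary data and may change without notice):

import re
import requests

url = "http://finance.yahoo.com/quote/AAPL/profile?p=AAPL"
raw_html = requests.get(url).text

# Look for the employee count inside the embedded JSON; this regex is a
# best guess and falls back gracefully if the field is not present.
match = re.search(r'"fullTimeEmployees"\s*:\s*(\d+)', raw_html)
if match:
    print(int(match.group(1)))  # e.g. 110000
else:
    print("Not found in the raw HTML - use the Selenium approach above.")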
There are so many ways to download financial data, or any kind of data, from the web. The script below downloads stock prices and saves everything to a CSV file.
import urllib2

listOfStocks = ["AAPL", "MSFT", "GOOG", "FB", "AMZN"]

urls = []
for company in listOfStocks:
    urls.append('http://real-chart.finance.yahoo.com/table.csv?s=' + company + '&d=6&e=28&f=2015&g=m&a=11&b=12&c=1980&ignore=.csv')

Output_File = open('C:/Users/your_path/Historical_Prices.csv', 'w')

New_Format_Data = ''
for counter in range(0, len(urls)):
    Original_Data = urllib2.urlopen(urls[counter]).read()
    if counter == 0:
        New_Format_Data = "Company," + urllib2.urlopen(urls[counter]).readline()
    rows = Original_Data.splitlines(1)
    for row in range(1, len(rows)):
        New_Format_Data = New_Format_Data + listOfStocks[counter] + ',' + rows[row]

Output_File.write(New_Format_Data)
Output_File.close()
The script below will download multiple stock tickers into one folder.
import urllib
import re
import json

symbolslist = open("C:/Users/your_path/Desktop/symbols/tickers.txt").read()
symbolslist = symbolslist.split("\n")

for symbol in symbolslist:
    myfile = open("C:/Users/your_path/Desktop/symbols/" + symbol + ".txt", "w+")
    myfile.close()

    htmltext = urllib.urlopen("http://www.bloomberg.com/markets/chart/data/1D/" + symbol + ":US")
    data = json.load(htmltext)
    datapoints = data["data_values"]

    myfile = open("C:/Users/your_path/Desktop/symbols/" + symbol + ".txt", "a")
    for point in datapoints:
        myfile.write(str(symbol + "," + str(point[0]) + "," + str(point[1]) + "\n"))
    myfile.close()
Finally...this will download prices for multiple stock tickers...
import urllib
import re

# Make sure the 'amex.txt' file exists at the path used below.
symbolfile = open("C:/Users/your_path/Desktop/symbols/amex.txt")
symbollist = symbolfile.read()
newsymbolslist = symbollist.split("\n")

i = 0
while i < len(newsymbolslist):
    url = "http://finance.yahoo.com/q?s=" + newsymbolslist[i] + "&ql=1"
    htmlfile = urllib.urlopen(url)
    htmltext = htmlfile.read()
    regex = '<span id="yfs_l84_' + newsymbolslist[i] + '">(.+?)</span>'
    pattern = re.compile(regex)
    price = re.findall(pattern, htmltext)
    print "the price of ", newsymbolslist[i], "is", price[0]
    i += 1
I wrote a book about these kinds of things, and lots of other stuff. You can find it using the URL below.
https://www.amazon.com/Automating-Business-Processes-Reducing-Increasing-ebook/dp/B01DJJKVZC/ref=sr_1_1?
I'm trying to do a scrape where the landing page has various links (the 5 sub categories at the top):
https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html
Within each of these categories are a list of products https://mcavoyguns.co.uk/contents/en-uk/d411_Browning_B725_Shotguns.html
Each product listed has a link to get further details (a direct link to the product as an individual page) https://mcavoyguns.co.uk/contents/en-uk/p74600_Browning-B725-Sporter-over-_-under.html
The scrape I've put together so far gets as far as creating a list of all the individual page links required. But when I try to loop through each individual product link for data, I can't seem to get BeautifulSoup to map the data from those links. It's as though it stays on the previous page (if you will).
What am I missing to allow for that second "bounce" to the "product_link" address (e.g. https://mcavoyguns.co.uk/contents/en-uk/p74600_Browning-B725-Sporter-over-_-under.html) so I can scrape the data from there? I thought I might need to add a time.sleep(5) timer to allow everything to load, but I'm still getting nothing.
Code:
from bs4 import BeautifulSoup
import math
import requests
import shutil
import csv
import pandas
import numpy as np
from pandas import DataFrame
import re
import os
import urllib.request as urllib2
import locale
import json
from selenium import webdriver
import lxml.html
import time
from selenium.webdriver.support.ui import Select

os.environ["PYTHONIOENCODING"] = "utf-8"

#selenium requests
browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")
time.sleep(2)

all_Outlinks = []
all_links = []

soup = BeautifulSoup(browser.page_source, features="lxml")
submenuFind = soup.find("div", "idx2Submenu")
submenuItems = submenuFind.find_all("li", "GC34 idx2Sub")

for submenuItem in submenuItems:
    for link in submenuItem.select('a[href]'):
        all_Outlinks.append("https://mcavoyguns.co.uk/contents/en-uk/" + link['href'])
#print(all_Outlinks)

for a_link in all_Outlinks:
    res = requests.get(a_link)
    soup = BeautifulSoup(res.text, 'html.parser')
    pageLinkDivs = soup.find_all("div", "column full")
    for pageLinkDiv in pageLinkDivs:
        for pageLink in pageLinkDiv.select('a[href]'):
            all_links.append("https://mcavoyguns.co.uk/contents/en-uk/" + pageLink['href'])
#print(all_links)

for product_link in all_links:
    time.sleep(5)
    resSecond = requests.get(product_link)
    soup = BeautifulSoup(resSecond.text, 'html.parser')
    model = soup.find("div", "GC75 ProductChoiceName")
    print(model)
PS Apologies for the additional imports. They are copied and pasted from a previous script and will be removed once confirmed they aren't required.
That info is rendered dynamically from a script tag when the page runs in a browser. With requests it will not be in the location you are looking, so pull it from the script tag instead.
In this case, I pull all the info related to a given model from that script and generate a dataframe. I convert the string inside the script tag to a Python object with ast.literal_eval, then add the product URL and product title to the dataframe.
Each df is added to a list which is converted to a final dataframe. As I don't know what final header names would be required I have left some with their default names.
I have added in handling for the case(s) where there are no model options listed for the given product.
from bs4 import BeautifulSoup
import math
import requests
import shutil
import csv
import pandas as pd
import numpy as np
import re
import os
import urllib.request as urllib2
import locale
import json
from selenium import webdriver
import lxml.html
import time
from selenium.webdriver.support.ui import Select
import ast

os.environ["PYTHONIOENCODING"] = "utf-8"

#selenium requests
browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")
time.sleep(2)

all_Outlinks = []
all_links = []

soup = BeautifulSoup(browser.page_source, features="lxml")
submenuFind = soup.find("div", "idx2Submenu")
submenuItems = submenuFind.find_all("li", "GC34 idx2Sub")

for submenuItem in submenuItems:
    for link in submenuItem.select('a[href]'):
        all_Outlinks.append("https://mcavoyguns.co.uk/contents/en-uk/" + link['href'])
#print(all_Outlinks)

with requests.Session() as s:
    for a_link in all_Outlinks:
        res = requests.get(a_link)
        soup = BeautifulSoup(res.text, 'html.parser')
        pageLinkDivs = soup.find_all("div", "column full")
        for pageLinkDiv in pageLinkDivs:
            for pageLink in pageLinkDiv.select('a[href]'):
                all_links.append("https://mcavoyguns.co.uk/contents/en-uk/" + pageLink['href'])

    results = []
    for product_link in all_links:
        # print(product_link)
        resSecond = s.get(product_link)
        soup = BeautifulSoup(resSecond.text, 'html.parser')
        title = soup.select_one('.ProductTitle').text
        try:
            df = pd.DataFrame(ast.literal_eval(re.search(r'(\[\[.*\]\])', soup.select_one('.ProductOptions script').string).groups(0)[0]))
            df.iloc[:, -1] = product_link
        except:
            placeholder = ['No options listed'] * 8
            placeholder.append(product_link)
            df = pd.DataFrame([placeholder])
        df.insert(0, 'title', title)
        #print(df) # add headers you care about to df or do that at end on full list
        results.append(df)

final = pd.concat(results)  # or add header here
print(final)
You could then look at speeding/tidying things up:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import os
import locale
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import ast
from multiprocessing import Pool, cpu_count


def get_models_df(product_link):
    res = requests.get(product_link)
    soup = BeautifulSoup(res.text, 'lxml')
    title = soup.select_one('.ProductTitle').text
    try:
        df = pd.DataFrame(ast.literal_eval(re.search(r'(\[\[.*\]\])', soup.select_one('.ProductOptions script').string).groups(0)[0]))
        df.iloc[:, -1] = product_link
    except:
        placeholder = ['No options listed'] * 8
        placeholder.append(product_link)
        df = pd.DataFrame([placeholder])
    df.insert(0, 'title', title)
    return df


def get_all_pages(a_link):
    res = requests.get(a_link)
    soup = BeautifulSoup(res.text, 'lxml')
    all_links = ["https://mcavoyguns.co.uk/contents/en-uk/" + i['href'] for i in soup.select('.center-content > a')]
    return all_links


if __name__ == '__main__':
    os.environ["PYTHONIOENCODING"] = "utf-8"

    #selenium requests
    browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
    browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")
    all_outlinks = [i.get_attribute('href') for i in WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".idx2Submenu a")))]
    browser.quit()

    with Pool(cpu_count() - 1) as p:
        nested_links = p.map(get_all_pages, all_outlinks)
        flat_list = [link for links in nested_links for link in links]
        results = p.map(get_models_df, flat_list)

    final = pd.concat(results)
    #print(final)
    final.to_csv('guninfo.csv', encoding='utf-8-sig', index=False)
So I said I would have a look at the other requested items and they are indeed available just with requests. Some things that needed handling:
Different headers present for different products; some missing headers
Some unicode characters (there are still some encoding things to look at)
Handling cases where description missing
Handling the more section
Updating certain output values so Excel doesn't convert them to dates
Handling of header nan
TODO:
One of the functions has now become a rabid monster and needs re-factoring into smaller friendly function calls.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import os
import locale
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import ast
from multiprocessing import Pool, cpu_count
import numpy as np
import unicodedata


def get_models_df(product_link):
    resSecond = requests.get(product_link)
    soup = BeautifulSoup(resSecond.text, 'lxml')
    title = soup.select_one('.ProductTitle').text

    try:
        df = pd.DataFrame(ast.literal_eval(re.search(r'(\[\[.*\]\])', soup.select_one('.ProductOptions script').string).groups(0)[0]))
    except:
        placeholder = ['No options listed'] * 8
        df = pd.DataFrame([placeholder])

    df.insert(0, 'title', title)
    df['price'] = ' '.join([soup.select_one("[property='product:price:amount']")['content'],
                            soup.select_one("[property='product:price:currency']")['content']])
    df['weight'] = ' '.join([soup.select_one("[property='product:weight:value']")['content'],
                             soup.select_one("[property='product:weight:units']")['content']])

    output_headers = ['Action frame', 'Barrel', 'Barrel finish', 'Barrel length',
                      'Barrel length (mm-inch)', 'Buttstock', 'Calibre', 'Chokes', 'Code',
                      'Drop at comb', 'Drop at heel', 'Forearm', 'Length', 'N/A', 'Notes',
                      'Options', 'Packaging', 'Sights', 'Stock style', 'Top rib', 'Weight', 'Wood', 'Wood grade'
                      ]

    df = pd.concat([df, pd.DataFrame(columns=output_headers)])

    try:
        description_table = pd.read_html(str(soup.select_one('.ProductDetailedDescription table, table')))[0].transpose()
        description_table.dropna(axis=0, how='all', inplace=True)
        headers = list(description_table.iloc[0, :])
        headers[:] = ['N/A' if pd.isnull(np.array([header], dtype=object)) else header for header in headers]

        for number, header in enumerate(headers):
            temp = header.lower()
            value = description_table.iloc[1, number]
            if temp == 'calibre':
                df[header] = "'" + value
            elif temp == 'top rib' and 'mm' not in value:
                df[header] = value + 'mm'
            else:
                df[header] = value
    except:
        pass  # no table

    description = soup.select_one('#ProductDetailsTab [title=More]')
    if description is None:
        desc = 'N/A'
    else:
        desc = '. '.join([i.text for i in soup.select('.ProductDescription li, .ProductDescription span') if i.text != ''])
        if desc == '':
            desc = soup.select_one('.ProductIntroduction').get_text()

    df['desc'] = unicodedata.normalize('NFKD', desc)
    df['product_link'] = product_link
    return df


def get_all_pages(a_link):
    res = requests.get(a_link)
    soup = BeautifulSoup(res.text, 'lxml')
    all_links = ["https://mcavoyguns.co.uk/contents/en-uk/" + i['href'] for i in soup.select('.center-content > a')]
    return all_links


if __name__ == '__main__':
    #os.environ["PYTHONIOENCODING"] = "utf-8"
    #selenium requests
    browser = webdriver.Chrome()  # executable_path='C:/Users/admin/chromedriver.exe'
    browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")
    all_outlinks = [i.get_attribute('href') for i in WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".idx2Submenu a")))]
    browser.quit()

    with Pool(cpu_count() - 1) as p:
        nested_links = p.map(get_all_pages, all_outlinks)
        flat_list = [link for links in nested_links for link in links]
        results = p.map(get_models_df, flat_list)

    final = pd.concat(results)
    #print(final)
    final.to_csv('guninfo.csv', encoding='utf-8-sig', index=False)
As QHarr pointed out, Selenium was the answer. That gave me the direction to look at it with different eyes and find the solution.
I'm posting this as my answer, but crediting @QHarr for the work provided previously and the ongoing assistance that led to the solution.
from bs4 import BeautifulSoup
import math
import requests
import shutil
import csv
import pandas
import numpy as np
from pandas import DataFrame
import re
import os
import urllib.request as urllib2
import locale
import json
from selenium import webdriver
import lxml.html
import time
from selenium.webdriver.support.ui import Select

os.environ["PYTHONIOENCODING"] = "utf-8"

#selenium requests
browser = webdriver.Chrome(executable_path='C:/Users/andrew.glass/chromedriver.exe')
browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")
time.sleep(2)

all_Outlinks = []
all_links = []

soup = BeautifulSoup(browser.page_source, features="lxml")
submenuFind = soup.find("div", "idx2Submenu")
submenuItems = submenuFind.find_all("li", "GC34 idx2Sub")

for submenuItem in submenuItems:
    for link in submenuItem.select('a[href]'):
        all_Outlinks.append("https://mcavoyguns.co.uk/contents/en-uk/" + link['href'])
#print(all_Outlinks)

for a_link in all_Outlinks:
    res = requests.get(a_link)
    soup = BeautifulSoup(res.text, 'html.parser')
    pageLinkDivs = soup.find_all("div", "column full")
    for pageLinkDiv in pageLinkDivs:
        for pageLink in pageLinkDiv.select('a[href]'):
            all_links.append("https://mcavoyguns.co.uk/contents/en-uk/" + pageLink['href'])
#print(all_links)

for product_link in all_links:
    browser.get(product_link)
    time.sleep(5)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    model = soup.find("div", "GC65 ProductOptions")
    modelFind = soup.find('select', attrs={'name': re.compile('model')})
    modelList = [x['origvalue'][:14] for x in modelFind.find_all('option')[1:]]
    print(modelList)
The model print is still a bit messy, but I can clean it up once all the requirements are gathered.
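For example, a small clean-up sketch (this assumes the messiness is just stray whitespace and empty entries; adjust it to whatever the real output looks like):

# Hypothetical tidy-up of the list printed above.
clean_models = [m.strip() for m in modelList if m and m.strip()]
print(clean_models)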
I have this scraper I am trying to export as a CSV file in Google Colab. I receive the scraped information as string values, but I cannot convert them to a CSV. I want each scraped attribute ("title", "size", etc.) to populate a column in a CSV file. I have run the strings through BeautifulSoup to remove the HTML formatting. Please see my code below.
import pandas as pd
import time
import io
from io import StringIO
import csv
#from google.colab import drive
#drive.mount('drive')

#Use new library (kora.selenium) to run chromedriver
from kora.selenium import wd
#Import BeautifulSoup to parse HTML formatting
from bs4 import BeautifulSoup

wd.get("https://www.grailed.com/sold/EP8S3v8V_w") #Get webpage

ScrollNumber = round(200/40) + 1
for i in range(0, ScrollNumber):
    wd.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(2)

#--------------#

#Each new attribute has to be found with XPath because Grailed's website is rendered with JavaScript (React), not plain HTML.
#Only 39 results will show because the JS page is infinite scroll and selenium must be told to keep scrolling.
follow_loop = range(2, 200)
for x in follow_loop:
    #Title
    title = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    title += str(x)
    title += "]/a/div[3]/div[2]/p"
    title = wd.find_elements_by_xpath(title)
    title = str(title)
    #Price
    price = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    price += str(x)
    price += "]/div/div/p/span"
    price = wd.find_elements_by_xpath(price)
    price = str(price)
    #Size
    size = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    size += str(x)
    size += "]/a/div[3]/div[1]/p[2]"
    size = wd.find_elements_by_xpath(size)
    size = str(size)
    #Sold
    sold = "//*[@id='shop']/div/div/div[3]/div[2]/div/div["
    sold += str(x)
    sold += "]/a/p/span"
    sold = wd.find_elements_by_xpath(sold)
    sold = str(sold)
    #Clean HTML formatting using BeautifulSoup
    cleantitle = BeautifulSoup(title, "lxml").text
    cleanprice = BeautifulSoup(price, "lxml").text
    cleansize = BeautifulSoup(size, "lxml").text
    cleansold = BeautifulSoup(sold, "lxml").text
This was a lot of work lol
from selenium import webdriver
import time
import csv

driver = webdriver.Chrome()
driver.get("https://www.grailed.com/sold/EP8S3v8V_w")

scroll_count = round(200 / 40) + 1
for i in range(scroll_count):
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(2)

time.sleep(3)

titles = driver.find_elements_by_css_selector("p.listing-designer")
prices = driver.find_elements_by_css_selector("p.sub-title.sold-price")
sizes = driver.find_elements_by_css_selector("p.listing-size.sub-title")
sold = driver.find_elements_by_css_selector("div.-overlay")

data = [titles, prices, sizes, sold]
data = [list(map(lambda element: element.text, arr)) for arr in data]

with open('sold_shoes.csv', 'w') as file:
    writer = csv.writer(file)
    j = 0
    while j < len(titles):
        row = []
        for i in range(len(data)):
            row.append(data[i][j])
        writer.writerow(row)
        j += 1
I'm not sure why it puts a blank line between every row in the file, but I assume it's not a problem. Also, it's a naïve solution in that it assumes every list is the same length; consider using one list of parent elements and building each row from its child elements (a sketch of both fixes follows below). I just used Selenium without BeautifulSoup because it's easier for me, but you should learn BS too because it's faster for scraping than Selenium. Happy coding.
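A hedged sketch of both suggestions: open the file with newline='' (the csv module's documented way to stop the extra blank lines on Windows) and build each row from a single parent element instead of four parallel lists. The div.feed-item parent selector is a guess and needs checking against the real page:

import csv

# "div.feed-item" is a hypothetical selector for one listing card;
# inspect the page and substitute the real parent element.
cards = driver.find_elements_by_css_selector("div.feed-item")

def text_or_blank(card, selector):
    # Missing fields become empty strings instead of breaking row alignment.
    found = card.find_elements_by_css_selector(selector)
    return found[0].text if found else ""

with open('sold_shoes.csv', 'w', newline='') as file:  # newline='' stops the blank lines
    writer = csv.writer(file)
    writer.writerow(['title', 'price', 'size', 'sold'])
    for card in cards:
        writer.writerow([
            text_or_blank(card, "p.listing-designer"),
            text_or_blank(card, "p.sub-title.sold-price"),
            text_or_blank(card, "p.listing-size.sub-title"),
            text_or_blank(card, "div.-overlay"),
        ])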
Please help me with the code below. I want to print the data in table format, with 35 rows and 6 columns.
from bs4 import BeautifulSoup
import requests
#import urllib.request
from tabulate import tabulate
from selenium import webdriver  # for webdriver
from selenium.webdriver.chrome.options import Options  # for suppressing the browser


class States():
    def __init__(self):
        url = "https://www.mohfw.gov.in/"
        # self.res = requests.get(url)
        # self.soup = BeautifulSoup(self.res.text, 'lxml')
        self.op = webdriver.ChromeOptions()
        self.op.add_argument('headless')
        self.driver = webdriver.Chrome(executable_path=r"C:\web drivers\drivers\chromedriver_win32\chromedriver.exe", options=self.op)
        self.driver.get(url)
        self.driver.find_element_by_class_name("open-table").click()

    def get_data(self):
        print("S.No", "Name of State / UT", "Active Cases*", "Cured/Discharged/Migrated*", "Deaths**", "Total Confirmed cases*")
        self.base_table = self.driver.find_element_by_tag_name("table")
        table_row = 35
        table_cols = 6
        for i in range(1, table_row + 1):
            for j in range(1, table_cols + 1):
                print(self.base_table.find_element_by_xpath("//*[@id='state-data']/div/div/div/div/table/tbody/tr[" + str(i) + "]/td[" + str(j) + "]").text)


state = States()
state.get_data()
Could you please provide the URL for better understanding? If you are specifically looking to scrape table data from the web, the best way is to use BeautifulSoup: identify the class name, then simply loop through the rows and the individual cells. Have a look at the following snippet.
from bs4 import BeautifulSoup
import requests
import re

url = "https://www.imdb.com/india/top-rated-indian-movies/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=8a7876cd-2844-4017-846a-2c0876945b7b&pf_rd_r=JYVEVKT1J5S5HQZEVYN1&pf_rd_s=right-5&pf_rd_t=15506&pf_rd_i=boxoffice&ref_=chtbo_india_tr_rhs_1"

response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')

movie_rating = []
movie_name = []

#identifying the table using class name
imdb_table = soup.find('table', class_='chart full-width')
for imdb in imdb_table.find_all('tbody'):
    #find all rows together
    rows = imdb.find_all('tr')
    #simply loop through individual elements
    for row in rows:
        name = row.find('td', class_='titleColumn').text
        movie_name.append(re.sub('[^A-Za-z]+', ' ', name))
        rating = row.find('td', class_='ratingColumn imdbRating').text
        movie_rating.append(float(rating))
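Applied back to the table in the question, the same row-and-cell idea can be shortened with pandas.read_html on the Selenium page source. A sketch only; it assumes the COVID table is present in driver.page_source after the open-table click and that it is the first table on the page:

import pandas as pd

# Reuses the driver created inside the question's States class.
tables = pd.read_html(state.driver.page_source)
covid_table = tables[0]          # verify the index against the real page
print(covid_table.head(35))      # expecting roughly 35 rows x 6 columns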
I'm an absolute beginner experimenting with web scraping in Python.
I'm trying to extract the location of ATMs from this URL:
https://www.visa.com/atmlocator/mobile/index.jsp#(page:results,params:(query:'Tokyo,%20Japan'))
using the following code.
#Script to scrape locations and addresses from VISA's ATM locator

# import the necessary libraries (to be installed if not available):
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

#ChromeDriver
#(see https://chromedriver.chromium.org/getting-started as reference)
driver = webdriver.Chrome("C:/Users/DefaultUser/Local Settings/Application Data/Google/Chrome/Application/chromedriver.exe")

offices = []    #List to branches/ATM names
addresses = []  #List to branches/ATM locations

driver.get("https://www.visa.com/atmlocator/mobile/index.jsp#(page:results,params:(query:'Tokyo,%20Japan'))")
content = driver.page_source
soup = BeautifulSoup(content, features="lxml")

#the following code extracts all the content inside the tags displaying the information requested
for a in soup.findAll('li', attrs={'class': 'visaATMResultListItem'}):
    name = a.find('li', attrs={'class': 'data-label'})
    address = a.find('li', attrs={'class': 'data-label'})
    offices.append(name.text)
    addresses.append(address.text)

#next row defines the dataframe with the results of the extraction
df = pd.DataFrame({'Office': offices, 'Address': addresses})

#next row displays dataframe content
print(df)

#export data to .CSV file named 'branches.csv'
with open('branches.csv', 'a') as f:
    df.to_csv(f, header=True)
The script seems to work correctly at first, since ChromeDriver starts and shows the results in the browser as required, but no results are returned:
Empty DataFrame
Columns: [Office, Address]
Index: []
Process finished with exit code 0
Maybe I made a mistake in choosing the selectors?
Thank you very much for your help
The problem is with the locators; use:
for a in soup.findAll('li', attrs={'class': 'visaATMResultListItem'}):
    name = a.find('p', attrs={'class': 'visaATMPlaceName '})
    address = a.find('p', attrs={'class': 'visaATMAddress'})
    offices.append(name.text)
    addresses.append(address.text)
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import time
from bs4 import BeautifulSoup
import csv

options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
driver.get("https://www.visa.com/atmlocator/mobile/index.jsp#(page:results,params:(query:'Tokyo,%20JAPAN'))")
time.sleep(2)

soup = BeautifulSoup(driver.page_source, 'html.parser')

na = []
addr = []

for name in soup.findAll("a", {'class': 'visaATMPlaceLink'}):
    na.append(name.text)

for add in soup.findAll("p", {'class': 'visaATMAddress'}):
    addr.append(add.get_text(strip=True, separator=" "))

with open('out.csv', 'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(['Name', 'Address'])
    for _na, _addr in zip(na, addr):
        writer.writerow([_na, _addr])

driver.quit()
The output is written to out.csv.
I have a problem with writing the scraped data to a CSV file. While the pages load and the first part of the script works, the writing to CSV causes a problem.
My question is: how can I write the data (Name, Home State, and Backer State) to a CSV file? The following code only writes Category to the CSV file.
Code:
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time
from datetime import datetime
from collections import OrderedDict
import re

browser = webdriver.Firefox()
browser.get('https://www.kickstarter.com/discover?ref=nav')
categories = browser.find_elements_by_class_name('category-container')

category_links = []
for category_link in categories:
    #Each item in the list is a tuple of the category's name and its link.
    category_links.append((str(category_link.find_element_by_class_name('f3').text),
                           category_link.find_element_by_class_name('bg-white').get_attribute('href')))

scraped_data = []
now = datetime.now()
counter = 1

for category in category_links:
    browser.get(category[1])
    browser.find_element_by_class_name('sentence-open').click()
    time.sleep(2)
    browser.find_element_by_id('category_filter').click()
    time.sleep(2)

    for i in range(27):
        try:
            time.sleep(2)
            browser.find_element_by_id('category_' + str(i)).click()
            time.sleep(2)
        except:
            pass

    projects = []
    for project_link in browser.find_elements_by_class_name('clamp-3'):
        projects.append(project_link.find_element_by_tag_name('a').get_attribute('href'))

    for counter, project in enumerate(projects):
        page1 = urllib.request.urlopen(projects[counter])
        soup1 = BeautifulSoup(page1, "lxml")
        page2 = urllib.request.urlopen(projects[counter].split('?')[0] + '/community')
        soup2 = BeautifulSoup(page2, "lxml")
        time.sleep(2)
        print(str(counter) + ': ' + project + '\nStatus: Started.')
        project_dict = OrderedDict()
        project_dict['Category'] = category[0]
        browser.get(project)
        project_dict['Name'] = soup1.find(class_='type-24 type-28-sm type-38-md navy-700 medium mb3').text
        project_dict['Home State'] = soup1.find(class_='nowrap navy-700 flex items-center medium type-12').text
        try:
            project_dict['Backer State'] = soup2.find(class_='location-list-wrapper js-location-list-wrapper').text
        except:
            pass
        print('Status: Done.')
        counter += 1
        scraped_data.append(project_dict)

later = datetime.now()
diff = later - now
print('The scraping took ' + str(round(diff.seconds/60.0, 2)) + ' minutes, and scraped ' + str(len(scraped_data)) + ' projects.')

df = pd.DataFrame(scraped_data)
df.to_csv('kickstarter-data1.csv')