I'm an absolute beginner experimenting with web scraping in Python.
I'm trying to extract the location of ATMs from this URL:
https://www.visa.com/atmlocator/mobile/index.jsp#(page:results,params:(query:'Tokyo,%20Japan'))
using the following code.
#Script to scrape locations and addresses from VISA's ATM locator
# import the necessary libraries (to be installed if not available):
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
#ChromeDriver
#(see https://chromedriver.chromium.org/getting-started as reference)
driver = webdriver.Chrome("C:/Users/DefaultUser/Local Settings/Application Data/Google/Chrome/Application/chromedriver.exe")
offices = []   # list to store branch/ATM names
addresses = [] # list to store branch/ATM addresses
driver.get("https://www.visa.com/atmlocator/mobile/index.jsp#(page:results,params:(query:'Tokyo,%20Japan'))")
content = driver.page_source
soup = BeautifulSoup(content, features = "lxml")
#the following code extracts all the content inside the tags displaying the information requested
for a in soup.findAll('li', attrs={'class':'visaATMResultListItem'}):
    name = a.find('li', attrs={'class':'data-label'})
    address = a.find('li', attrs={'class':'data-label'})
    offices.append(name.text)
    addresses.append(address.text)
#next row defines the dataframe with the results of the extraction
df = pd.DataFrame({'Office':offices,'Address':addresses})
#next row displays dataframe content
print(df)
#export data to .CSV file named 'branches.csv'
with open('branches.csv', 'a') as f:
    df.to_csv(f, header=True)
The script seems to work correctly at first, since ChromeDriver starts and shows the results in the browser as expected, but no results are returned:
Empty DataFrame
Columns: [Office, Address]
Index: []
Process finished with exit code 0
Maybe I made a mistake in choosing the selectors?
Thank you very much for your help
The problem is with the locators; use:
for a in soup.findAll('li', attrs={'class':'visaATMResultListItem'}):
    name = a.find('p', attrs={'class':'visaATMPlaceName '})
    address = a.find('p', attrs={'class':'visaATMAddress'})
    offices.append(name.text)
    addresses.append(address.text)
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import time
from bs4 import BeautifulSoup
import csv
options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
driver.get("https://www.visa.com/atmlocator/mobile/index.jsp#(page:results,params:(query:'Tokyo,%20JAPAN'))")
time.sleep(2)
soup = BeautifulSoup(driver.page_source, 'html.parser')
na = []
addr = []
for name in soup.findAll("a", {'class': 'visaATMPlaceLink'}):
    na.append(name.text)

for add in soup.findAll("p", {'class': 'visaATMAddress'}):
    addr.append(add.get_text(strip=True, separator=" "))

with open('out.csv', 'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(['Name', 'Address'])
    for _na, _addr in zip(na, addr):
        writer.writerow([_na, _addr])
driver.quit()
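If the results take longer than two seconds to render, the fixed sleep can miss them. An explicit wait is more robust; here is a minimal sketch, assuming the visaATMResultListItem class from the question marks each rendered result (verify it in the live DOM):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
driver.get("https://www.visa.com/atmlocator/mobile/index.jsp#(page:results,params:(query:'Tokyo,%20JAPAN'))")

# wait up to 15 seconds for at least one result item to appear in the DOM
# ('visaATMResultListItem' is the class name used in the question)
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, "visaATMResultListItem"))
)

soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()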
I am new to Python. Does anyone know what the use of [1:] is in sum(int(td.text) for td in soup.select('td:last-child')[1:]), or of [0] or [1]? I saw it in many scraping examples with for loops. Also, while practicing I built the code below, but I'm not able to scrape all the data into the CSV file. Thanks in advance, and sorry for asking two questions at once.
import requests
from bs4 import BeautifulSoup
import csv
url= "https://iplt20.com/stats/2020/most-runs"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')
lst = []
table = soup.find('div', attrs={'class':'js-table'})

#for row in table.findAll('div', attrs={'class':'top-players__player-name'}):
#    score = {}
#    score['Player'] = row.a.text.strip()
#    lst.append(score)

for row in table.findAll(class_='top-players__m top-players__padded '):
    score = {}
    score['Matches'] = int(row.td.text)
    lst.append(score)

filename = 'iplStat.csv'
with open(filename, 'w', newline='') as f:
    w = csv.DictWriter(f, ['Player', 'Matches'])
    w.writeheader()
    for score in lst:
        w.writerow(score)

print(lst)
All of this is not even needed. Just use pandas:
import requests
import pandas as pd
url = "https://iplt20.com/stats/2020/most-runs"
r = requests.get(url)
df = pd.read_html(r.content)[0]
df.to_csv("iplStats.csv", index=False)
I'm trying to scrape two tables on separate pages after accessing the site through a login. I've tried a few different ways and can't figure it out.
The last attempt showed some promise but only the first data frame was appended to the list of data frames. Something like the following:
from selenium import webdriver
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup as BS
def text_to_chart(url, table):
    df_list = []
    driver = webdriver.Chrome(path)
    driver.get(login)
    driver.find_element_by_xpath(password block).send_keys(password)
    driver.find_element_by_xpath(username block).send_keys(username)
    driver.find_element_by_xpath(submit).click()
    time.sleep(10)
    df = pd.DataFrame()
    for url, table in zip(urls, tables):
        driver.get(url)
        time.sleep(10)
        soup = BS(driver.page_source, 'html')
        new_table = soup.find_all('table',
                                  attrs={'class': table})
        results_list = pd.read_html(str(new_table[0]))
        df = df.append(pd.DataFrame(results_list[0]))
    return df

def scrape(url, table):
    df_list = []
    df_list = df_list.append(text_to_chart(url, table))

scrape(url_list, table_list)
So, what should I do to scrape multiple pages?
I suggest storing the values in a list of dictionaries and then converting it to a DataFrame. That will be simpler and easier.
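For example, a minimal sketch of that idea (the field names here are placeholders, not the ones from your tables):

import pandas as pd

rows = []
# inside your scraping loop, collect one dict per record
rows.append({'name': 'example one', 'value': 1})
rows.append({'name': 'example two', 'value': 2})

# one conversion at the end instead of repeated DataFrame.append calls
df = pd.DataFrame(rows)
print(df)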
Solved! I made a few changes which resulted in one function that created my list of df's. Then I began the session, logged in, and called the function, saving the output to my variable df_list.
from selenium import webdriver
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup as BS
def text_to_chart(urls, tables):
    df = []
    for url, table in zip(urls, tables):
        driver.get(url)
        time.sleep(10)
        soup = BS(driver.page_source, 'html')
        new_table = soup.find_all('table',
                                  attrs={'class': table})
        results_list = pd.read_html(str(new_table[0]))
        df.append(pd.DataFrame(results_list[0]))
    return df
driver = webdriver.Chrome(path)
driver.get(login)
driver.find_element_by_xpath(password block).send_keys(password)
driver.find_element_by_xpath(username block).send_keys(username)
driver.find_element_by_xpath(submit).click()
time.sleep(10)
df_list = text_to_chart(url_list, table_list)
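If you later want all the tables in a single frame, the list returned above can be concatenated; a short sketch assuming df_list holds the DataFrames from text_to_chart (the output file name is just a placeholder):

import pandas as pd

# combine the per-page DataFrames into one table; ignore_index renumbers the rows
combined = pd.concat(df_list, ignore_index=True)
combined.to_csv("all_tables.csv", index=False)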
I'm trying to scrape NASDAQ-100 data from CNBC's website using BeautifulSoup, but when I try to convert the scraped data to a DataFrame, it comes out empty: Columns: [], Index: []
Below is my code :
# Importing Libraries
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
# Create parse tree for parsed pages
page=requests.get("https://www.cnbc.com/nasdaq-100")
#content=page.content
# Scrape data from specific <div> column
# Title for the data table -> NASDAQ-100
soup=BeautifulSoup(page.content,"html.parser")
l = []
title=soup.find("div",{"class":"PageHeader-main"}).find("h1").text
table=soup.find_all("table",{"class":"BasicTable-basicTable"})
for items in table:
    for i in range(len(items.find_all("tr"))-1):
        # Gather data
        d = {}
        d["stock_symbol"] = items.find_all("td", {"class":"BasicTable-symbol"})[i].find("a").text
        d["stock_name"] = items.find_all("td", {"class":"BasicTable-name"})[i].text
        d["price"] = items.find_all("td", {"class":"BasicTable-unchanged BasicTable-numData"})[i].text
        d["price_change"] = items.find_all("td", {"class":"BasicTable-quoteDecline"})[i].text
        d["percentage_change"] = items.find_all("td", {"class":"BasicTable-quoteDecline"})[i].text
        l.append(d)
df = pd.DataFrame(l)
print(df)
You are dealing with a website that uses JavaScript to render its data once the page loads, so we have two options right now:
1. Track the XHR request to the API where the data is retrieved, and fetch it directly.
2. Use the Selenium approach.
Both solutions are listed below.
import requests
import json
r = requests.get("https://quote.cnbc.com/quote-html-webservice/quote.htm?noform=1&partnerId=2&fund=1&exthrs=0&output=json&symbolType=issue&symbols=153171|172296|74548134|178129|90065764|185811|181702|3145559|8279577|8392868|196573|197784|177124|144094|205778|207106|208206|208526|217706|211573|217809|218647|25427545|223056|225584|226052|226354|90065765|227524|237331|240690|244210|253970|263397|248911|264170|256951|273612|24812378|274516|7186257|9079610|4038959|282500|21167615|282560|283581|284350|50675033|288727|288976|289807&requestMethod=extended").json()
data = json.dumps(r, indent=4)
print(data)
print(r.keys())
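Once you can see the keys, the per-symbol quotes can be flattened into a DataFrame. The key names below (QuickQuoteResult / QuickQuote) are an assumption about this endpoint's payload, so verify them against the print(r.keys()) output and adjust if the structure differs:

import pandas as pd

# continuing from r = requests.get(...).json() above
# NOTE: assumed nesting -- check r.keys() first and adjust if needed
quotes = r["QuickQuoteResult"]["QuickQuote"]
df = pd.json_normalize(quotes)
print(df.columns.tolist())   # inspect which fields came back
df.to_csv("nasdaq100_quotes.csv", index=False)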
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import pandas as pd
from time import sleep
options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
driver.get("https://www.cnbc.com/nasdaq-100")
sleep(2)
df = pd.read_html(driver.page_source)[0]
print(df)
df.to_csv("result.csv", index=False)
driver.quit()
I am designing a scraping project for my research but I am stuck on writing the scraped data to a CSV file. Please help me with that.
I have successfully scraped the data but I want to store it in a CSV file; my code is below.
I need to write code to pull all of the HTML from a website and then save it to a CSV file.
I believe I somehow need to turn the links into a list and then write the list, but I'm unsure how to do that.
This is what I have so far:
import requests
import time
from bs4 import BeautifulSoup
import csv
# Collect and parse first page
page = requests.get('https://www.myamcat.com/jobs')
soup = BeautifulSoup(page.content, 'lxml')
print("Wait Scraper is working on ")
time.sleep(10)
if(page.status_code != 200):
    print("Error in Scraping check the url")
else:
    print("Successfully scrape the data")
    time.sleep(10)
    print("Loading data in csv")
    file = csv.writer(open('dataminer.csv', 'w'))
    file.writerow(['ProfileName', 'CompanyName', 'Salary', 'Job', 'Location'])
    for pname in soup.find_all(class_="profile-name"):
        #print(pname.text)
        profname = pname.text
        file.writerow([profname, ])
    for cname in soup.find_all(class_="company_name"):
        print(cname.text)
    for salary in soup.find_all(class_="salary"):
        print(salary.text)
    for lpa in soup.find_all(class_="jobText"):
        print(lpa.text)
    for loc in soup.find_all(class_="location"):
        print(loc.text)
Make a dict, save the data into it, then save to CSV; check the code below!
import requests
import time
from bs4 import BeautifulSoup
import csv
# Collect and parse first page
page = requests.get('https://www.myamcat.com/jobs')
soup = BeautifulSoup(page.content, 'lxml')
data = []
print("Wait Scrapper is working on ")
if(page.status_code != 200):
    print("Error in Srapping check the url")
else:
    print("Successfully scrape the data")

for x in soup.find_all('div', attrs={'class':'job-page'}):
    data.append({
        'pname': x.find(class_="profile-name").text.encode('utf-8'),
        'cname': x.find(class_="company_name").text.encode('utf-8'),
        'salary': x.find(class_="salary").text.encode('utf-8'),
        'lpa': x.find(class_="jobText").text.encode('utf-8'),
        'loc': x.find(class_="location").text.encode('utf-8')})

print("Loading data in csv")
with open('dataminer.csv', 'w') as f:
    fields = ['salary', 'loc', 'cname', 'pname', 'lpa']
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data)
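One caveat: in Python 3 the .encode('utf-8') calls turn each value into a bytes object, and the csv module then writes them as literal b'...' strings. If that shows up in dataminer.csv, drop the encode calls and open the file with an explicit encoding instead; a small sketch of the write step under that change (same data list as above, but with plain str values):

with open('dataminer.csv', 'w', newline='', encoding='utf-8') as f:
    fields = ['salary', 'loc', 'cname', 'pname', 'lpa']
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data)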
Apart from what you have got in the other answer, you can scrape and write the content at the same time. I used .select() instead of .find_all() to achieve the same result.
import csv
import requests
from bs4 import BeautifulSoup
URL = "https://www.myamcat.com/jobs"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'lxml')
with open('myamcat_doc.csv', 'w', newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(['pname', 'cname', 'salary', 'loc'])
    for item in soup.select(".job-listing .content"):
        pname = item.select_one(".profile-name h3").get_text(strip=True)
        cname = item.select_one(".company_name").get_text(strip=True)
        salary = item.select_one(".salary .jobText").get_text(strip=True)
        loc = item.select_one(".location .jobText").get_text(strip=True)
        writer.writerow([pname, cname, salary, loc])
I am trying to download the data on this website
https://coinmunity.co/
...in order to manipulate it later in Python or Pandas.
I have tried to load it directly into Pandas via Requests, but it did not work, using this code:
res = requests.get("https://coinmunity.co/")
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')[0]
dfm = pd.read_html(str(table), header = 0)
dfm = dfm[0].dropna(axis=0, thresh=4)
dfm.head()
In most of the things I tried, I could only get the info from the headers, which seems to be the only table on this page that the code can see.
Seeing that this did not work, I tried to do the same scraping with Requests and BeautifulSoup, but it did not work either. This is my code:
import requests
from bs4 import BeautifulSoup
res = requests.get("https://coinmunity.co/")
soup = BeautifulSoup(res.content, 'lxml')
#table = soup.find_all('table')[0]
#table = soup.find_all('div', {'class':'inner-container'})
#table = soup.find_all('tbody', {'class':'_ngcontent-c0'})
#table = soup.find_all('table')[0].findAll('tr')
#table = soup.find_all('table')[0].find('tbody')#.find_all('tbody _ngcontent-c3=""')
table = soup.find_all('p', {'class':'stats change positiveSubscribers'})
You can see in the commented lines all the things I have tried, but nothing worked.
Is there any way to easily download that table for use in Pandas/Python, in the tidiest, easiest, and quickest possible way?
Thank you
Since the content is loaded dynamically after the initial request is made, you won't be able to scrape this data with requests. Here's what I would do instead:
from selenium import webdriver
import pandas as pd
import time
from bs4 import BeautifulSoup
driver = webdriver.Firefox()
driver.implicitly_wait(10)
driver.get("https://coinmunity.co/")
html = driver.page_source.encode('utf-8')
soup = BeautifulSoup(html, 'lxml')
results = []
for row in soup.find_all('tr')[2:]:
    data = row.find_all('td')
    name = data[1].find('a').text
    value = data[2].find('p').text
    # get the rest of the data you need about each coin here, then add it to the dictionary that you append to results
    results.append({'name': name, 'value': value})
df = pd.DataFrame(results)
df.head()
name value
0 NULS 14,005
1 VEN 84,486
2 EDO 20,052
3 CLUB 1,996
4 HSR 8,433
You will need to make sure that geckodriver is installed and that it is in your PATH. I just scraped the name of each coin and the value but getting the rest of the information should be easy.
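If geckodriver is not on your PATH, you can also point Selenium at it explicitly (Selenium 3 style, matching the code above); the path below is just a placeholder:

from selenium import webdriver

# hypothetical location -- replace with wherever you saved geckodriver
driver = webdriver.Firefox(executable_path="/path/to/geckodriver")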