Store web scraping data in MongoDB Atlas - Python

I'm a beginner. I successfully scraped data from a website and put it into a JSON file. I would like to put the data into MongoDB Atlas. I hid the 'user-agent' and my connection string. I created an account on Atlas and tried to find the relevant code to import the data into Atlas. How can I use the code at the bottom correctly?
import requests
from bs4 import BeautifulSoup
import json
import pymongo

mystocks = ['^GSPC', 'QQQ', 'TSLA', 'AAPL']
stockdata = []

def getData(symbol):
    headers = {
        'User-Agent': 'my-user-agent'
    }
    url = f'https://finance.yahoo.com/quote/{symbol}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    # created a dictionary
    stock = {
        'symbol': symbol,
        'price': soup.find('fin-streamer', {'class': "Fw(b) Fz(36px) Mb(-4px) D(ib)"}).text,
        'change': soup.find('fin-streamer', {'class': 'Fw(500) Pstart(8px) Fz(24px)'}).text,
        'change_percentage': soup.find('div', {'class': 'D(ib) Mend(20px)'}).find_all('span')[1].text,
    }
    return stock

for i in mystocks:
    stockdata.append(getData(i))
    print('Getting: ', i)

# with open('stockdata.json', 'w') as f:
#     json.dump(stockdata, f)
# print('Finish')

client = pymongo.MongoClient('my connection string')
db = client.db.stock
try:
    db.insert_many(stock)
    print(f'inserted {len(stock)} articles')
except:
    print('an error occurred quotes were not stored to db')
I am not sure how to use the code at the bottom.
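A minimal sketch of how that last block could be wired up, assuming the scraped dictionaries are collected in stockdata as above; the connection string and the database/collection names here are placeholders, not values from the original post:

import pymongo
from pymongo.errors import PyMongoError

# placeholder connection string - substitute your own Atlas URI
client = pymongo.MongoClient('my connection string')

# pick an explicit database and collection; client.db.stock would look for a
# database literally named "db" (the names below are illustrative)
collection = client['stocks']['quotes']

try:
    # insert_many expects the list of dictionaries built in the loop,
    # not the single `stock` dict that is local to getData()
    result = collection.insert_many(stockdata)
    print(f'inserted {len(result.inserted_ids)} documents')
except PyMongoError as e:
    print(f'quotes were not stored to db: {e}')

The two usual slips here are passing the single stock dict to insert_many instead of the stockdata list, and accessing client.db.stock, which points at a database literally named "db" rather than whatever database you created in Atlas.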

Related

How to scrape stock data incorporating pagination next tag using python bs4?

The code cannot get to the next page; it just repeats in an infinite loop. I am using the example from Oxylabs.
Could you tell me what I'm doing wrong? Thank you.
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

url = 'https://hnx.vn/en-gb/cophieu-etfs/chung-khoan-ny.html'

while True:
    response = requests.get(url)
    soup = bs(response.content, "lxml")
    symbols = soup.find_all('td', class_='STOCK_CODE')
    for s in symbols:
        symbol = s.find('a').text
        print(symbol)

    next_page = soup.select_one('span', id='next')
    if next_page:
        next_url = next_page.get('href')
        url = urljoin(url, next_url)
    else:
        break
    print(url)
The information you want for the other pages is being returned via another call. You need to recreate that call (use your browser's network tools to see what is happening).
The request requires a token that is returned when the homepage is requested. This needs to be provided when requesting the other pages.
For example:
from bs4 import BeautifulSoup as bs
import requests

session = requests.Session()

req_homepage = session.get('https://hnx.vn/en-gb/cophieu-etfs/chung-khoan-ny.html')
soup_homepage = bs(req_homepage.content, "lxml")
for meta in soup_homepage.find_all('meta'):
    if meta.get('name', None) == '__RequestVerificationToken':
        token = meta['content']

data = {
    "p_issearch": 0,
    "p_keysearch": "",
    "p_market_code": "",
    "p_orderby": "STOCK_CODE",
    "p_ordertype": "ASC",
    "p_currentpage": 2,
    "p_record_on_page": 10,
}

headers = {
    "Referer": "https://hnx.vn/en-gb/cophieu-etfs/chung-khoan-ny.html",
    "__RequestVerificationToken": token,
    "X-Requested-With": "XMLHttpRequest",
}

for page in range(1, 4):
    print(f"Page {page}")
    data['p_currentpage'] = page
    req = session.post('https://hnx.vn/ModuleIssuer/List/ListSearch_Datas', data=data, headers=headers)
    json_content = req.json()['Content']
    soup = bs(json_content, "lxml")
    for td in soup.find_all('td', class_='STOCK_CODE'):
        symbol = td.find('a').text
        print(' ', symbol)
This would give you the following output:
Page 1
AAV
ACM
ADC
ALT
AMC
AME
AMV
API
APP
APS
Page 2
ARM
ART
ATS
BAB
BAX
BBS
BCC
BCF
BDB
BED
Page 3
BII
BKC
BLF
BNA
BPC
BSC
BST
BTS
BTW
BVS

Fix BeautifulSoup code to get data from all pages and output into csv

Complete beginner. Please help. I've got this code, which worked when I did not try to output to .csv but instead had a print command there - so I didn't have the last 2 lines or anything related to variable 'data'. By 'worked' I mean it printed data from all 18 pages.
Now it outputs data into .csv but only from the first page (url).
I see that I'm not passing nexturl into the pandas at the end - because I don't know how to. Help greatly appreciated.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.marketresearch.com/search/results.asp?qtype=2&datepub=3&publisher=Technavio&categoryid=0&sortby=r'

def scrape_it(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    nexturl = soup.find_all(class_="standardLinkDkBlue")[-1]['href']
    stri = soup.find_all(class_="standardLinkDkBlue")[-1].string
    reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
    data = []

    for report in reports:
        data.append({
            'title': report.find('a', class_='linkTitle').text,
            'price': report.find('div', class_='resultPrice').text,
            'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
            'detail_link': report.a['href']
        })

    if 'next' not in stri:
        print("All pages completed")
    else:
        scrape_it(nexturl)

    return data

myOutput = pd.DataFrame(scrape_it(url))
myOutput.to_csv(f'results-tec6.csv', header=False)
Make data global so you keep appending to it during the loop rather than re-creating it on each recursive call. Then call your recursive function outside the DataFrame() call so you can pass data to pandas afterwards.
Finally, you can pass a cookie to get the maximum possible results per request, which reduces the number of requests.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.marketresearch.com/search/results.asp?qtype=2&datepub=3&publisher=Technavio&categoryid=0&sortby=r&page=1'
data = []

def scrape_it(url):
    page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
    soup = BeautifulSoup(page.text, 'html.parser')
    nexturl = soup.find_all(class_="standardLinkDkBlue")[-1]['href']
    stri = soup.find_all(class_="standardLinkDkBlue")[-1].string
    reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})

    for report in reports:
        data.append({
            'title': report.find('a', class_='linkTitle').text,
            'price': report.find('div', class_='resultPrice').text,
            'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
            'detail_link': report.a['href']
        })

    if 'next' not in stri:
        print("All pages completed")
    else:
        scrape_it(nexturl)

scrape_it(url)
myOutput = pd.DataFrame(data)
myOutput.to_csv('results-tec6.csv', header=False)
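For reference, the same idea can also be driven by a plain while loop instead of recursion, so data and the current page URL are handled in one place. This is only a sketch built from the same selectors and cookie as above:

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.marketresearch.com/search/results.asp?qtype=2&datepub=3&publisher=Technavio&categoryid=0&sortby=r&page=1'
data = []

while True:
    page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
    soup = BeautifulSoup(page.text, 'html.parser')

    # collect every result row on the current page
    for report in soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]}):
        data.append({
            'title': report.find('a', class_='linkTitle').text,
            'price': report.find('div', class_='resultPrice').text,
            'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
            'detail_link': report.a['href'],
        })

    # the last "standardLinkDkBlue" link is the pagination link
    last_link = soup.find_all(class_="standardLinkDkBlue")[-1]
    if 'next' not in (last_link.string or ''):
        break                      # no "next" link left, all pages done
    url = last_link['href']        # follow the next-page link

pd.DataFrame(data).to_csv('results-tec6.csv', header=False)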

Save dictionary in Excel with Python

I need your help to save the data in Excel. I've parsed a site and I need to write the dictionary into Excel.
from scrapingbee import ScrapingBeeClient
import requests
from bs4 import BeautifulSoup
import pandas as pd

SCRAPINGBEE_API_KEY = "bzzzz"
endpoint = "https://app.scrapingbee.com/api/v1"

pages = [
    'https://www.businesslist.com.ng/category/restaurants/1/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/2/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/3/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/4/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/5/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/6/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/7/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/8/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/9/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/10/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/11/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/12/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/13/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/14/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/15/city:lagos'
]

rest = []

#GET_LINKS
for url in pages[:1]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    body = soup.find_all('h4')
    for items in body:
        item = items.find('a').get('href')
        item_link = 'https://www.businesslist.com.ng' + item
        rest.append(item_link)

#GET_REST
for url in rest[:2]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    info = {}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    restaraunt_name = soup.find('b', {'id':'company_name'}).text
    info.update({'Restaraunt': restaraunt_name})
    location = soup.find('div', {'class':'text location'}).text.split('View Map')[0]
    info.update({'Location': location})
    phone = soup.find('div', {'class':'text phone'}).text[:11]
    info.update({'Phone': phone})
    web = soup.find('div', {'class':'text weblinks'}).text
    info.update({'web': web})
    df = pd.DataFrame(info)
    df.to_excel('./Lagos.xlsx')
I get the links to parse from the list 'rest', then get the data from each link. I want to save the items from every link into the dictionary 'info' and then write it all to an Excel file. But the code saves only one row to the file, not all of them. I've missed something.
You are overwriting df inside the loop and saving it to the same file on every iteration, so only the last restaurant ends up in the Excel file. Create an empty DataFrame outside the loop, append to it, and write it to Excel once the loop has finished.
Your altered code would look like this:
all_info = pd.DataFrame()

for url in rest[:2]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    info = {}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    restaraunt_name = soup.find('b', {'id':'company_name'}).text
    info.update({'Restaraunt': restaraunt_name})
    location = soup.find('div', {'class':'text location'}).text.split('View Map')[0]
    info.update({'Location': location})
    phone = soup.find('div', {'class':'text phone'}).text[:11]
    info.update({'Phone': phone})
    web = soup.find('div', {'class':'text weblinks'}).text
    info.update({'web': web})

    # one row per restaurant
    if len(all_info) == 0:
        all_info = pd.DataFrame(info, index=[0])
    else:
        all_info = all_info.append(pd.DataFrame(info, index=[0]), ignore_index=True)

all_info.to_excel('./Lagos.xlsx')
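Note that DataFrame.append was deprecated and then removed in pandas 2.0, so on a current pandas the same fix is to collect the per-restaurant dicts in a list and build the frame once after the loop (which is essentially what the next answer does); a tiny sketch under that assumption:

import pandas as pd

rows = []
# inside the loop, instead of the append() calls:
#     rows.append(info)

# after the loop, build the DataFrame in one go and write it out
all_info = pd.DataFrame(rows)
all_info.to_excel('./Lagos.xlsx')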
How about creating a list with all the data, then converting that to a dataframe and then outputting that to an Excel file.
from scrapingbee import ScrapingBeeClient
import requests
from bs4 import BeautifulSoup
import pandas as pd

SCRAPINGBEE_API_KEY = "zzzzzzzzz"
endpoint = "https://app.scrapingbee.com/api/v1"

pages = [
    'https://www.businesslist.com.ng/category/restaurants/1/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/2/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/3/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/4/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/5/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/6/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/7/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/8/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/9/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/10/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/11/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/12/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/13/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/14/city:lagos',
    'https://www.businesslist.com.ng/category/restaurants/15/city:lagos'
]

rest = []

#GET_LINKS
for url in pages[:1]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    body = soup.find_all('h4')
    for items in body:
        item = items.find('a').get('href')
        item_link = 'https://www.businesslist.com.ng' + item
        rest.append(item_link)

#GET_REST
data = []
for url in rest[:2]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    info = {}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    restaraunt_name = soup.find('b', {'id':'company_name'}).text
    info.update({'Restaraunt': restaraunt_name})
    location = soup.find('div', {'class':'text location'}).text.split('View Map')[0]
    info.update({'Location': location})
    phone = soup.find('div', {'class':'text phone'}).text[:11]
    info.update({'Phone': phone})
    web = soup.find('div', {'class':'text weblinks'}).text
    info.update({'web': web})
    data.append(info)

df = pd.DataFrame(data)
df.to_excel('./Lagos.xlsx')

Unable to scrape this site. How to scrape data from this site?

I am not able to scrape data from this site.
I tried with other sites and everything works fine with them...
from bs4 import BeautifulSoup
from urllib.request import urlopen
response = urlopen("https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1")
html = response.read()
parsed_html = BeautifulSoup(html, "html.parser")
containers = parsed_html.find_all("div", {"class" : "c2prKC"})
print(len(containers))
It looks like the page is rendered by JavaScript after loading. You can use Selenium to render the page and BeautifulSoup to get the elements.
from bs4 import BeautifulSoup
from selenium import webdriver
import time
driver = webdriver.Chrome()
driver.get("https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1")
time.sleep(5)
html = driver.page_source
parsed_html = BeautifulSoup(html, "html.parser")
containers = parsed_html.find_all("div", {"class" : "c2prKC"})
print(len(containers))
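As a side note on the Selenium approach, a fixed time.sleep(5) can be flaky; an explicit wait for the product cards is usually more reliable. A sketch under the assumption that the cards keep the c2prKC class:

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1")

# wait up to 15 seconds for at least one product card instead of sleeping blindly
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.c2prKC"))
)

parsed_html = BeautifulSoup(driver.page_source, "html.parser")
containers = parsed_html.find_all("div", {"class": "c2prKC"})
print(len(containers))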
The info you want is in a script tag. You can use a regex or loop over the script tags to get the right string to parse as JSON (with a small amendment).
import requests
import json
from bs4 import BeautifulSoup as bs
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0'
}

res = requests.get('https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1', headers=headers)
soup = bs(res.content, 'lxml')

for script in soup.select('script'):
    if 'window.pageData=' in script.text:
        script = script.text.replace('window.pageData=', '')
        break

items = json.loads(script)['mods']['listItems']

results = []
for item in items:
    #print(item)
    #extract other info you want
    row = [item['name'], item['priceShow'], item['productUrl'], item['ratingScore']]
    results.append(row)

df = pd.DataFrame(results, columns=['Name', 'Price', 'ProductUrl', 'Rating'])
print(df.head())
Regex version:
import requests
import re
import json
from bs4 import BeautifulSoup as bs
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0'
}

res = requests.get('https://www.daraz.com.np/catalog/?spm=a2a0e.searchlistcategory.search.2.3eac4b8amQJ0zd&q=samsung%20m20&_keyori=ss&from=suggest_normal&sugg=samsung%20m20_1_1', headers=headers)
soup = bs(res.content, 'lxml')

r = re.compile(r'window.pageData=(.*)')
data = soup.find('script', text=r).text
script = r.findall(data)[0]

items = json.loads(script)['mods']['listItems']

results = []
for item in items:
    row = [item['name'], item['priceShow'], item['productUrl'], item['ratingScore']]
    results.append(row)

df = pd.DataFrame(results, columns=['Name', 'Price', 'ProductUrl', 'Rating'])
print(df.head())
import requests
import json
from bs4 import BeautifulSoup as bs
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0'
}

res = requests.get('https://www.daraz.com.np/catalog/?q=camera&_keyori=ss&from=input&spm=a2a0e.searchlist.search.go.71a64360Kgxf1m', headers=headers)
soup = bs(res.content, 'lxml')

# collect the script tags and keep the one that holds window.pageData
containerSearch = soup.find_all('script')
scriptData = ''
for d in containerSearch:
    if 'window.pageData=' in str(d):
        scriptData = str(d).replace('window.pageData=', '')
        break

scriptData = scriptData.replace('<script>', '')
scriptData = scriptData.replace('</script>', '')

items = json.loads(scriptData)

name = items['mods']['listItems'][0]['name']
image = items['mods']['listItems'][0]['image']
price = items['mods']['listItems'][0]['price']
priceShow = items['mods']['listItems'][0]['priceShow']
ratingScore = items['mods']['listItems'][0]['ratingScore']
productUrl = items['mods']['listItems'][0]['productUrl']

print(name)
print(price)

isolate 'td a' tag based on class using beautiful soup

I'd like to write the URL links from this page into a file, but there are two 'td a' tags for each row of the table. I just want the one with class="pagelink" href="/search" etc.
I tried the following code, hoping to pick up only the ones where "class":"pagelink", but it produced an error:
AttributeError: 'Doctype' object has no attribute 'find_all'
Can anyone help please?
import requests
from bs4 import BeautifulSoup as soup
import csv

writer.writerow(['URL', 'Reference', 'Description', 'Address'])

url = "https://www.saa.gov.uk/search/?SEARCHED=1&ST=&SEARCH_TERM=city+of+edinburgh%2C+EDINBURGH&ASSESSOR_ID=&SEARCH_TABLE=valuation_roll_cpsplit&PAGE=0&DISPLAY_COUNT=1000&TYPE_FLAG=CP&ORDER_BY=PROPERTY_ADDRESS&H_ORDER_BY=SET+DESC&ORIGINAL_SEARCH_TERM=city+of+edinburgh&DRILL_SEARCH_TERM=BOSWALL+PARKWAY%2C+EDINBURGH&DD_TOWN=EDINBURGH&DD_STREET=BOSWALL+PARKWAY#results"
response = session.get(url)  # not used until after the iteration begins

html = soup(response.text, 'lxml')

for link in html:
    prop_link = link.find_all("td a", {"class": "pagelink"})
    writer.writerow([prop_link])
Iterating over your html variable yields its top-level nodes, and the first one is a Doctype object, which has no find_all.
You'll need to call find_all or select on the soup object itself to find the nodes that you want.
Example:
import requests
from bs4 import BeautifulSoup as soup
import csv

outputfilename = 'Ed_Streets2.csv'
#inputfilename = 'Edinburgh.txt'

baseurl = 'https://www.saa.gov.uk'

outputfile = open(outputfilename, 'w', newline='')
writer = csv.writer(outputfile)
writer.writerow(['URL', 'Reference', 'Description', 'Address'])

session = requests.session()

url = "https://www.saa.gov.uk/search/?SEARCHED=1&ST=&SEARCH_TERM=city+of+edinburgh%2C+EDINBURGH&ASSESSOR_ID=&SEARCH_TABLE=valuation_roll_cpsplit&PAGE=0&DISPLAY_COUNT=100&TYPE_FLAG=CP&ORDER_BY=PROPERTY_ADDRESS&H_ORDER_BY=SET+DESC&ORIGINAL_SEARCH_TERM=city+of+edinburgh&DRILL_SEARCH_TERM=BOSWALL+PARKWAY%2C+EDINBURGH&DD_TOWN=EDINBURGH&DD_STREET=BOSWALL+PARKWAY#results"
response = session.get(url)

html = soup(response.text, 'lxml')

prop_link = html.find_all("a", class_="pagelink button small")

for link in prop_link:
    prop_url = baseurl + (link["href"])
    print(prop_url)
    writer.writerow([prop_url, "", "", ""])
Try this.
You need to look for the links before starting the loop.
import requests
from bs4 import BeautifulSoup as soup
import csv

writer.writerow(['URL', 'Reference', 'Description', 'Address'])

url = "https://www.saa.gov.uk/search/?SEARCHED=1&ST=&SEARCH_TERM=city+of+edinburgh%2C+EDINBURGH&ASSESSOR_ID=&SEARCH_TABLE=valuation_roll_cpsplit&PAGE=0&DISPLAY_COUNT=1000&TYPE_FLAG=CP&ORDER_BY=PROPERTY_ADDRESS&H_ORDER_BY=SET+DESC&ORIGINAL_SEARCH_TERM=city+of+edinburgh&DRILL_SEARCH_TERM=BOSWALL+PARKWAY%2C+EDINBURGH&DD_TOWN=EDINBURGH&DD_STREET=BOSWALL+PARKWAY#results"
response = requests.get(url)  # not used until after the iteration begins

html = soup(response.text, 'lxml')

prop_link = html.find_all("a", {"class": "pagelink button small"})

for link in prop_link:
    if type(link) != type(None) and link.has_attr("href"):
        wr = link["href"]
        writer.writerow([wr])
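Both the question and the snippet above use writer without showing where it comes from; a minimal setup sketch (the filename here is just a placeholder):

import csv

# placeholder filename - change as needed
with open('saa_links.csv', 'w', newline='') as outputfile:
    writer = csv.writer(outputfile)
    writer.writerow(['URL', 'Reference', 'Description', 'Address'])
    # run the link-collecting loop from the answers above inside this block
    # so every writer.writerow() call lands in the open file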
