Web Scraping from an API Loop - python

I'm scraping data from the World Bank API for a paper, and I'm trying to build a loop that scrapes several different indicators, but I can't get it to work past a certain part of the code. I hope someone can help.
# Single request for one indicator
import requests
import json
import pandas as pd

indicator = 'SP.POP.TOTL?date=2000:2020'
url = "http://api.worldbank.org/v2/countries/all/indicators/%s&format=json&per_page=5000" % indicator
response = requests.get(url)
print(response)
result = response.content
result = json.loads(result)
pop_total_df = pd.DataFrame.from_dict(result[1])  # result[1] holds the data records; result[0] is metadata
This is the loop I'm trying to build, but I get an error in the last part of the code below:
# Indicator list
indicator = {'FP.CPI.TOTL.ZG?date=2000:2020', 'SP.POP.TOTL?date=2000:2020'}

# List of URLs with the indicators
url_list = []
for i in indicator:
    url = "http://api.worldbank.org/v2/countries/all/indicators/%s&format=json&per_page=5000" % i
    url_list.append(url)

result_list = []
for i in url_list:
    response = requests.get(i)
    print(response)
    result_list.append(response.content)

# Erroneous code
result_json = []
for i in range(3):
    result_json.append(json.loads(result_list[i]))

As you are making 2 requests (FP.CPI.TOTL.ZG?date=2000:2020 and SP.POP.TOTL?date=2000:2020), your result_list has length 2, so its valid indices are 0 and 1. Use range(2) or, better, range(len(result_list)) instead:
import requests, json

# Indicator list
indicator = {'FP.CPI.TOTL.ZG?date=2000:2020', 'SP.POP.TOTL?date=2000:2020'}

# List of URLs with the indicators
url_list = []
for i in indicator:
    url = "http://api.worldbank.org/v2/countries/all/indicators/%s&format=json&per_page=5000" % i
    url_list.append(url)

result_list = []
for i in url_list:
    response = requests.get(i)
    print(response)
    result_list.append(response.content)

# Fixed code
result_json = []
for i in range(len(result_list)):
    result_json.append(json.loads(result_list[i]))
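If you then want a DataFrame per indicator, as in the single-indicator snippet, here is a minimal follow-up sketch continuing from result_json above (it assumes each World Bank response keeps the [metadata, records] structure used earlier):

import pandas as pd

dataframes = []
for result in result_json:
    # result[0] is metadata, result[1] holds the indicator records
    dataframes.append(pd.DataFrame.from_dict(result[1]))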

Related

Using Beautiful Soup to get consolidated data from a list of URLs instead of just the first URL

I'm trying to get the data for three states, based on the same URL format.
states = ['123', '124', '125']
urls = []
for state in states:
    url = f'www.something.com/geo={state}'
    urls.append(url)
and from there I have three separate URLs, each containing a different state ID. However, when I get to processing them via BS, the output only shows data from state 123.
for url in urls:
    client = ScrapingBeeClient(api_key="API_KEY")
    response = client.get(url)
    doc = BeautifulSoup(response.text, 'html.parser')
Subsequently, I extracted the columns I wanted using this:
listings = doc.select('.is-9-desktop')

rows = []
for listing in listings:
    row = {}
    try:
        row['name'] = listing.select_one('.result-title').text.strip()
    except:
        print("no name")
    try:
        row['add'] = listing.select_one('.address-text').text.strip()
    except:
        print("no add")
    try:
        row['mention'] = listing.select_one('.review-mention-block').text.strip()
    except:
        pass
    rows.append(row)
But as mentioned, it only shows data for state 123. I'd hugely appreciate it if anyone could let me know where I went wrong, thank you!
EDIT
I added the URL output into a list, and was able to get the data for all three states.
doc = []
for url in urls:
    client = ScrapingBeeClient(api_key="API_KEY")
    response = client.get(url)
    docs = BeautifulSoup(response.text, 'html.parser')
    doc.append(docs)
However, when I ran it through BS, it resulted in the error message:
AttributeError: 'list' object has no attribute 'select'
Do I run it through another loop?
You don't need all of these loops. Just iterate over the states and append each listing to rows.
The most important thing is that rows = [] is placed outside the for loops so it doesn't get overwritten on every iteration.
Example
states = ['123', '124', '125']

rows = []
for state in states:
    url = f'www.something.com/geo={state}'
    client = ScrapingBeeClient(api_key="API_KEY")
    response = client.get(url)
    doc = BeautifulSoup(response.text, 'html.parser')
    listings = doc.select('.is-9-desktop')
    for listing in listings:
        row = {}
        try:
            row['name'] = listing.select_one('.result-title').text.strip()
        except:
            print("no name")
        try:
            row['add'] = listing.select_one('.address-text').text.strip()
        except:
            print("no add")
        try:
            row['mention'] = listing.select_one('.review-mention-block').text.strip()
        except:
            pass
        rows.append(row)
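If the rows should end up in a table afterwards, a minimal follow-up sketch using the rows list above (pandas is an assumption here, not part of the original answer):

import pandas as pd

df = pd.DataFrame(rows)  # one row per listing, with 'name', 'add' and 'mention' columns
print(df.head())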

Create a for loop to web scrape multiple pages from multiple URLs using BeautifulSoup

I am trying to scrape multiple pages from multiple URLs efficiently. I have been able to scrape multiple pages from one URL successfully, but I am unable to implement this for multiple URLs. Any help would be greatly appreciated. Thank you.
Current Loop Code:
BASE = 'https://www.unegui.mn'
URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page='
COLUMNS = ['Name', 'Date', 'Address', 'District', 'City', 'Price', 'Area_sqm', 'Rooms', 'Floor', 'Commission_year',
           'Building_floors', 'Garage', 'Balcony', 'Windows', 'Window_type', 'Floor_type', 'door_type', 'Leasing', 'Description', 'Link']

with requests.Session() as session:
    while True:
        (r := session.get(f'{URL}{page+1}')).raise_for_status()
        m = re.search(r'.*page=(\d+)$', r.url)
        if m and int(m.group(1)) == page:
            break
        page += 1
        print(f'Scraping page {page}')
Desired URL Loop:
The only thing that changes for each URL is the 1-r, 2-r, 3-r section. The total number of URLs is 5.
URL = [f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/1-r/?page=',
       f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/2-r/?page=',
       f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/3-r/?page=',
       f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/4-r/?page=',
       f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page=']
Full Code:
import requests
from bs4 import BeautifulSoup as BS
from datetime import datetime
import pandas as pd
import re
import csv

today = datetime.today().strftime('%y%m%d ')

def main():
    page = 0
    name = []
    date = []
    address = []
    district = []
    city = []
    price = []
    area_sqm = []
    rooms = []
    floor = []
    commission_year = []
    building_floors = []
    garage = []
    balcony = []
    windows = []
    window_type = []
    floor_type = []
    door_type = []
    leasing = []
    description = []
    link = []

    BASE = 'https://www.unegui.mn'
    URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page='
    COLUMNS = ['Name', 'Date', 'Address', 'District', 'City', 'Price', 'Area_sqm', 'Rooms', 'Floor', 'Commission_year',
               'Building_floors', 'Garage', 'Balcony', 'Windows', 'Window_type', 'Floor_type', 'door_type', 'Leasing', 'Description', 'Link']

    with requests.Session() as session:
        while True:
            (r := session.get(f'{URL}{page+1}')).raise_for_status()
            m = re.search(r'.*page=(\d+)$', r.url)
            if m and int(m.group(1)) == page:
                break
            page += 1
            print(f'Scraping page {page}')

            soup = BS(r.text, 'lxml')
            for tag in soup.findAll('div', class_='list-announcement-block'):
                _name = tag.find('a', attrs={'itemprop': 'name'})
                name.append(_name.get('content', 'N/A'))
                if (_link := _name.get('href', None)):
                    link.append(f'{BASE}{_link}')
                    (_r := session.get(link[-1])).raise_for_status()
                    _spanlist = BS(_r.text, 'lxml').find_all('span', class_='value-chars')
                    floor_type.append(_spanlist[0].get_text().strip())
                    balcony.append(_spanlist[1].get_text().strip())
                    garage.append(_spanlist[2].get_text().strip())
                    window_type.append(_spanlist[3].get_text().strip())
                    door_type.append(_spanlist[4].get_text().strip())
                    windows.append(_spanlist[5].get_text().strip())

                    _alist = BS(_r.text, 'lxml').find_all('a', class_='value-chars')
                    commission_year.append(_alist[0].get_text().strip())
                    building_floors.append(_alist[1].get_text().strip())
                    area_sqm.append(_alist[2].get_text().strip())
                    floor.append(_alist[3].get_text().strip())
                    leasing.append(_alist[4].get_text().strip())
                    district.append(_alist[5].get_text().strip())
                    address.append(_alist[6].get_text().strip())

                rooms.append(tag.find('div', attrs={'announcement-block__breadcrumbs'}).get_text().split('»')[1].strip())
                description.append(tag.find('div', class_='announcement-block__description').get_text().strip())
                date.append(tag.find('div', class_='announcement-block__date').get_text().split(',')[0].strip())
                city.append((tag.find('meta', attrs={'itemprop': 'areaServed'})).get('content'))
                if (_price := tag.find('meta', attrs={'itemprop': 'price'})) is None:
                    _price = tag.find('div', class_='announcement-block__price _premium')
                price.append(_price.get_text().strip() if _price else 'N/A')

    df = pd.DataFrame(zip(name, date, address, district, city,
                          price, area_sqm, rooms, floor, commission_year,
                          building_floors, garage, balcony, windows, window_type,
                          floor_type, door_type, leasing, description, link), columns=COLUMNS)
    return df

if __name__ == '__main__':
    df = main()
    df.to_csv(f'{today}HPD.csv', encoding='cp1251', errors='ignore', index=False)
You can combine for loops with Python's range() function.
The range() function provides a sequence of integers based upon the function's arguments.
range(start, stop[, step])
The start argument is the first value in the range. If range() is called with only one argument, then Python assumes start = 0.
The stop argument is the upper bound of the range. It is important to realize that this upper value is not included in the range.
Example:
for i in range(1, 6):
    BASE = 'https://www.unegui.mn'
    URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/{i}-r/?page='
    print(URL)
Output:
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/1-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/2-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/3-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/4-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page=
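To tie this back to the question, one option is to nest the existing paging loop inside that range() loop. Here is a sketch that keeps the structure of the question's code (not a tested drop-in; the per-page scraping body from the full code would go where the print is):

import re
import requests

BASE = 'https://www.unegui.mn'

with requests.Session() as session:
    for i in range(1, 6):  # the five listing URLs, 1-r through 5-r
        URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/{i}-r/?page='
        page = 0
        while True:  # the question's per-URL paging loop
            (r := session.get(f'{URL}{page+1}')).raise_for_status()
            m = re.search(r'.*page=(\d+)$', r.url)
            if m and int(m.group(1)) == page:
                break
            page += 1
            print(f'Scraping {URL}{page}')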

How to scrape multiple pages using the Apple iTunes API

I'm trying to scrape the iTunes API to get information for all of the podcasts available in the Apple iTunes store. Currently, I'm only able to pull 200 at a time. When I try to grab the next 200 podcasts in the list, I get the same 200 as before.
https://itunes.apple.com/search?term=podcast&limit=2
https://itunes.apple.com/search?term=podcast&limit=2&offset=1
Any suggestions would be appreciated.
import requests
import pandas as pd
import time
import json

url = 'https://itunes.apple.com/search?term=podcast&limit=2'
res = requests.get(url, headers={'User-agent': 'project'})
res.status_code

current_url = None
posts = []
the_offset = 0

for _ in range(2):
    if current_url == None:
        current_url = url
    else:
        current_url = url + '&offset={}'.format(the_offset)
    res = requests.get(current_url)
    if res.status_code != 200:
        print('Error', res.status_code)
        break
    the_offset += 1
    current_dict = res.json()
    current_posts = {k: v for (k, v) in current_dict.items()}
    posts.extend(current_posts['results'])
    print(current_url)
    time.sleep(3)
Try changing how the offset parameter advances - it should step by the page size (the limit), not by 1:
results = 100
limit = 10
pages = int(results / limit)

for i in range(pages):
    offset = i * limit   # skip the results already fetched
    request(offset)      # placeholder for your actual request call
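A fuller sketch against the question's endpoint (assuming the search endpoint honours the limit and offset parameters the way the question's URLs use them):

import time
import requests

url = 'https://itunes.apple.com/search'
limit = 200
posts = []

for page in range(5):  # fetch five pages of results
    params = {'term': 'podcast', 'limit': limit, 'offset': page * limit}
    res = requests.get(url, params=params, headers={'User-agent': 'project'})
    if res.status_code != 200:
        print('Error', res.status_code)
        break
    posts.extend(res.json()['results'])
    time.sleep(3)  # be polite between requests

print(len(posts))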

BeautifulSoup get links and info inside of them

I would like to scrape a website. The website has 10 complaint previews on each page. I wrote this script to get the links of the 10 complaints and some info inside each link. When I run the script, I get this error message: "RecursionError: maximum recursion depth exceeded".
Can someone tell me what the problem is? Thank you in advance!
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

# Create list objects for each information section
C_date = []
C_title = []
C_text = []
U_name = []
U_id = []
C_count = []
R_name = []
R_date = []
R_text = []

# Get 10 links for preview of complaints
def getLinks(url):
    response = get(url)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    c_containers = html_soup.find_all('div', class_='media')
    # Store wanted links in a list
    allLinks = []
    for link in c_containers:
        find_tag = link.find('a')
        find_links = find_tag.get('href')
        full_link = "".join((url, find_links))
        allLinks.append(full_link)
    # Get total number of links
    print(len(allLinks))
    return allLinks

def GetData(Each_Link):
    each_complaint_page = get(Each_Link)
    html_soup = BeautifulSoup(each_complaint_page.text, 'html.parser')
    # Get date of complaint
    dt = html_soup.main.find('span')
    date = dt['title']
    C_date.append(date)
    # Get Title of complaint
    TL = html_soup.main.find('h1', {'class': 'title'})
    Title = TL.text
    C_title.append(Title)
    # Get main text of complaint
    Tx = html_soup.main.find('div', {'class': 'description'})
    Text = Tx.text
    C_text.append(Text)
    # Get user name and id
    Uname = html_soup.main.find('span', {'class': 'user'})
    User_name = Uname.span.text
    User_id = Uname.attrs['data-memberid']
    U_name.append(User_name)
    U_id.append(User_id)
    # Get view count of complaint
    Vcount = html_soup.main.find('span', {'view-count-detail'})
    View_count = Vcount.text
    C_count.append(View_count)
    # Get reply for complaint
    Rpnm = html_soup.main.find('h4', {'name'})
    Reply_name = Rpnm.next
    R_name.append(Reply_name)
    # Get reply date
    Rpdt = html_soup.main.find('span', {'date-tips'})
    Reply_date = Rpdt.attrs['title']
    R_date.append(Reply_date)
    # Get reply text
    Rptx = html_soup.main.find('p', {'comment-content-msg company-comment-msg'})
    Reply_text = Rptx.text
    R_text.append(Reply_text)

link_list = getLinks('https://www.sikayetvar.com/arcelik')
for i in link_list:
    z = GetData(i)
    print(z)
PS: My next step will be to put all the information in a DataFrame.
Your GetData() method calls itself, with no base case: this causes infinite recursion:
def GetData(data):
    for i in GetData(data):
You're also calling response = get(i) but then ignoring the result... perhaps you meant to say
def GetData(link):
    i = get(link)
    ...
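For the poster's follow-up plan of putting everything into a DataFrame, a minimal sketch once the recursion is removed and the C_*, U_*, R_* lists from the question are filled (the column names here are illustrative, not from the original post):

import pandas as pd

df = pd.DataFrame({
    'date': C_date, 'title': C_title, 'text': C_text,
    'user_name': U_name, 'user_id': U_id, 'view_count': C_count,
    'reply_name': R_name, 'reply_date': R_date, 'reply_text': R_text,
})
print(df.head())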

Execute loop based on a list - obtain result for each page (subpage)

I am trying to obtain the number of pages for each URL from a list of URLs. My code works as long as I have only one URL; however, as soon as I try it with a list of URLs, I only get the result from one URL. I guess the problem is related to my loop. Given that I am new to Python and Beautiful Soup, I can't manage to spot the mistake myself.
base_url = 'https://www.holidaycheck.de'
main_page = 'https://www.holidaycheck.de/dh/hotels-tunesien/e10cef63-45d4-3511-92f1-43df5cbd9fe1?p={}'

urls = []

## Change URL into object (soup)
r = requests.get(main_page.format(0))
soup = BeautifulSoup(r.text, "html5lib")

# Get max page number
soup = BeautifulSoup(r.text, 'lxml')
data = soup.find_all('a', {'class': 'link'})

res = []
for i in data:
    res.append(i.text)  # writing each value to res list

res_int = []
for i in res:
    try:
        res_int.append(int(i))
    except:
        print("current value is not a number")

last_page = max(res_int)
#print(last_page)

for i in range(1, last_page):
    page = main_page.format(i)
    for link in soup.find_all('div', {'class': 'hotel-reviews-bar'}):
        urls = base_url + link.find('a').get('href') + "/-/p/{}"
        print(urls)
So far, everything works: I obtain the max page number and get all the URLs from each page. The problem lies in the code below (I believe):
for url in urls:  # to loop through the list of urls
    r = requests.get(url.format(0))
    soup = BeautifulSoup(r.text, 'lxml')
    daten = soup.find_all('a', {'class': 'link'})

    tes = []
    for z in daten:
        tes.append(z.text)  # writing each value to tes list
    print(tes)

    tes_int = []
    for z in tes:
        try:
            tes_int.append(int(z))
        except:
            print("current value is not a number")

    anzahl = max(tes_int)
    print(anzahl)
I am trying to apply the same concept as in the code above for each URL from the list urls, but instead of obtaining the max page number for each URL, I obtain 241 every time, as if I were caught in a loop...
Any thoughts on that? Help is highly appreciated.
You're setting urls to the last link generated by the loop.
To build a valid list of URLs, you need to replace the = assignment with append():
urls = []
for i in range(1, last_page):
    page = main_page.format(i)
    r = requests.get(page)                # these 2 rows added
    soup = BeautifulSoup(r.text, 'lxml')  # these 2 rows added
    for link in soup.find_all('div', {'class': 'hotel-reviews-bar'}):
        try:
            urls.append(base_url + link.find('a').get('href') + "/-/p/{}")
        except:
            print('no link available', i)
print(urls)
EDIT: Okay, as far as I can see, you have several issues in your code. Along with my initial fix, I'm outlining my understanding of how your code is meant to work:
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.holidaycheck.de'
main_page = 'https://www.holidaycheck.de/dh/hotels-tunesien/e10cef63-45d4-3511-92f1-43df5cbd9fe1?p={}'

## Change URL into object (soup)
r = requests.get(main_page.format(0))
soup = BeautifulSoup(r.text, "html5lib")

# Get max page number
soup = BeautifulSoup(r.text, 'lxml')
data = soup.find_all('a', {'class': 'link'})

res = []
for i in data:
    res.append(i.text)  # writing each value to res list

res_int = []
for i in res:
    try:
        res_int.append(int(i))
    except:
        print("current value is not a number")

last_page = max(res_int)
#print(last_page)

urls = []
for i in range(1, last_page):
    page = main_page.format(i)
    r = requests.get(page)                # these 2 rows added
    soup = BeautifulSoup(r.text, 'lxml')  # these 2 rows added
    for link in soup.find_all('div', {'class': 'hotel-reviews-bar'}):
        try:  # also adding try-except for escaping broken/unavailable links
            urls.append(base_url + link.find('a').get('href') + "/-/p/{}")
        except:
            print('no link available', i)

urls = list(set(urls))  # check and drop duplicates in the links list

for url in urls:  # to loop through the list of urls
    try:
        r = requests.get(url.format(0))
        print(url.format(0))
        soup = BeautifulSoup(r.text, 'lxml')
        daten = soup.find_all('a', {'class': 'link'})
    except:
        print('broken link')
        continue  # skip this URL so a stale or undefined 'daten' is not reused

    tes = []
    for z in daten:
        tes.append(z.text)  # writing each value to tes list
    # print(tes)

    tes_int = []
    for z in tes:
        try:
            tes_int.append(int(z))
        except:
            print("current value is not a number")

    try:
        anzahl = max(tes_int)
        print(anzahl)
    except:
        print('maximum cannot be calculated')
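If the goal is to keep each URL's page count rather than only print it, the per-URL logic can be pulled into a small helper. A sketch of one way to do it (not part of the original answer):

import requests
from bs4 import BeautifulSoup

def max_page_number(url):
    # Return the highest integer found among <a class="link"> texts, or None if there is none
    r = requests.get(url.format(0))
    soup = BeautifulSoup(r.text, 'lxml')
    numbers = []
    for a in soup.find_all('a', {'class': 'link'}):
        try:
            numbers.append(int(a.text))
        except ValueError:
            pass
    return max(numbers) if numbers else None

# usage: one entry per review URL instead of a series of printouts
# max_pages = {url: max_page_number(url) for url in urls}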
