Create a loop by iterating a string throughout the code - Python

I have the following code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import requests
from requests import get
date = []
tourney_round = []
result = []
winner_odds = []
loser_odds = []
surface = []
players_and_tourney = []
response = get('http://www.tennisexplorer.com/player/humbert-e2553/?annual=all')
page_html = BeautifulSoup(response.text, 'html.parser')
results2018_containers = page_html.find_all('div', id='matches-2018-1-data')
for container in results2018_containers:
    played_date_2018 = results2018_containers[0].findAll('td', class_='first time')
    for i in played_date_2018:
        date.append(i.text)
string_2018 = '2018'
date = [x + string_2018 for x in date]
for container in results2018_containers:
    rounds_2018 = results2018_containers[0].findAll('td', class_='round')
    for i in rounds_2018:
        tourney_round.append(i.text)
for container in results2018_containers:
    results_2018 = results2018_containers[0].findAll('td', class_='tl')
    for i in results_2018:
        result.append(i.text)
for container in results2018_containers:
    surfaces_2018 = results2018_containers[0].findAll('td', class_='s-color')
    for i in surfaces_2018:
        surface.append(i.find('span')['title'])
for container in results2018_containers:
    odds_2018 = results2018_containers[0].findAll('td', class_='course')
    winner_odds_2018 = odds_2018[0:][::2]
    for i in winner_odds_2018:
        winner_odds.append(i.text)
    loser_odds_2018 = odds_2018[1:][::2]
    for i in loser_odds_2018:
        loser_odds.append(i.text)
for container in results2018_containers:
    namesandtourney_2018 = results2018_containers[0].findAll('td', class_='t-name')
    for i in namesandtourney_2018:
        players_and_tourney.append(i.text)
from itertools import chain, groupby, repeat
chainer = chain.from_iterable
def condition(x):
    return x.startswith('\xa0')
elements = [list(j) for i, j in groupby(players_and_tourney, key=condition) if not i]
# create list of headers
headers = [next(j) for i, j in groupby(players_and_tourney, key=condition) if i]
# chain list of lists, and use repeat for headers
initial_df_2018 = pd.DataFrame({'Date': date,
                                'Surface': surface,
                                'Players': list(chainer(elements)),
                                'Tournament': list(chainer(repeat(i, j) for i, j in
                                                           zip(headers, map(len, elements)))),
                                'Round': tourney_round,
                                'Result': result,
                                'Winner Odds': winner_odds,
                                'Loser Odds': loser_odds})
initial_df_2018[['Winner', 'Loser']] = initial_df_2018['Players'].str.split(' - ', n=1, expand=True)
del initial_df_2018['Players']
initial_df_2018 = initial_df_2018[['Date','Surface','Tournament','Winner','Loser','Result','Winner Odds','Loser Odds']]
I want to create a loop that runs the code for every year starting from 2005. So basically, run the code in a loop, replacing 2018 throughout with each year between 2005 and 2018. If possible, the code would run first for the year 2018, then 2017, and so on down to 2005.
Edit: I added the code that I used to pull data for the year 2018, but I want a loop that will pull data for all the years that can be found on the page.

If I understood you correctly, you want to repeat the request you wrote for 2018 for every year between 2005 and 2018.
What I did was loop your code over the years in that range, replacing the id each time and adding all the data to a dictionary of lists.
response = get('http://www.example.com')
page_html = BeautifulSoup(response.text, 'html.parser')
date_dict = {}
for year in range(2019, 2004, -1):
    date = []
    string_id = "played-{}-data".format(year)
    results_containers = page_html.find_all('div', id=string_id)
    if not results_containers:  # find_all returns an empty list when nothing matches, never None
        continue
    for container in results_containers:
        played_date = results_containers[0].findAll('td', class_='plays')
        for i in played_date:
            date.append(i.text)
    if year not in date_dict:
        date_dict[year] = []
    date_dict[year] += date
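To sanity-check what came back, date_dict can be inspected per year (a minimal usage sketch, assuming the loop above has run):
for year, dates in sorted(date_dict.items(), reverse=True):
    print(year, len(dates), 'dates collected')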

You can store the year as an integer but still use it in a string.
for year in range(2018, 2004, -1):
    print(f"Happy New Year {year}")
Other ways to include a number in a string are "Happy New Year {}".format(year) or "it is now " + str(year) + " more text".
Also, I don't think you do, but if someone finds this and really wants to "iterate a string", Caesar ciphers are a good place to look.

There's no problem looping that, but you need to define how you want your results. I used a dictionary here, and I've turned your code into a function that I can call with variables:
def get_data(year):
    date = []
    response = get('http://www.example.com')
    page_html = BeautifulSoup(response.text, 'html.parser')
    results_containers = page_html.find_all('div', id='played-{}-data'.format(year))
    for container in results_containers:
        played_date = results_containers[0].findAll('td', class_='plays')
        for i in played_date:
            date.append(i.text)
    return date
Now all I have to do is create a range of possible years and call the function for each one; this can be done as simply as:
all_data = {year: get_data(year) for year in range(2018, 2004, -1)}
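If the goal is the full table rather than just the dates, the same pattern extends naturally: have the function build and return the per-year DataFrame, then concatenate. A sketch under the assumption that get_data(year) is expanded to return a DataFrame shaped like initial_df_2018 from the question:
import pandas as pd

# assumption: get_data(year) has been extended to return a per-year DataFrame
frames = [get_data(year) for year in range(2018, 2004, -1)]
all_years = pd.concat(frames, ignore_index=True)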

Just use a for loop over a range. Something like:
date = []
response = get('http://www.example.com')
page_html = BeautifulSoup(response.text, 'html.parser')
for year in range(2018, 2004, -1):
    year_id = 'played-{}-data'.format(year)
    results_containers = page_html.find_all('div', id=year_id)
    ...

Related

Create a for loop to webscrape multiple pages from multiple URLs using beautifulsoup

I am trying to scrape multiple pages from multiple URLs efficiently. I have been able to scrape multiple pages from one URL successfully, but have been unable to implement this for multiple URLs. Any help would be greatly appreciated. Thank you.
Current Loop Code:
BASE = 'https://www.unegui.mn'
URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page='
COLUMNS = ['Name', 'Date', 'Address', 'District', 'City', 'Price', 'Area_sqm', 'Rooms', 'Floor', 'Commission_year',
           'Building_floors', 'Garage', 'Balcony', 'Windows', 'Window_type', 'Floor_type', 'door_type', 'Leasing', 'Description', 'Link']
with requests.Session() as session:
    while True:
        (r := session.get(f'{URL}{page + 1}')).raise_for_status()
        m = re.search(r'.*page=(\d+)$', r.url)
        if m and int(m.group(1)) == page:
            break
        page += 1
        print(f'Scraping page {page}')
Desired URL Loop:
The only thing being changed for each URL is the 1-r, 2-r, 3-r section. The total number of URLs is 5.
URL = [f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/1-r/?page=',
       f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/2-r/?page=',
       f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/3-r/?page=',
       f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/4-r/?page=',
       f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page=']
Full Code:
import requests
from bs4 import BeautifulSoup as BS
from datetime import datetime
import pandas as pd
import re
import csv

today = datetime.today().strftime('%y%m%d ')

def main():
    page = 0
    name = []
    date = []
    address = []
    district = []
    city = []
    price = []
    area_sqm = []
    rooms = []
    floor = []
    commission_year = []
    building_floors = []
    garage = []
    balcony = []
    windows = []
    window_type = []
    floor_type = []
    door_type = []
    leasing = []
    description = []
    link = []
    BASE = 'https://www.unegui.mn'
    URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page='
    COLUMNS = ['Name', 'Date', 'Address', 'District', 'City', 'Price', 'Area_sqm', 'Rooms', 'Floor', 'Commission_year',
               'Building_floors', 'Garage', 'Balcony', 'Windows', 'Window_type', 'Floor_type', 'door_type', 'Leasing', 'Description', 'Link']
    with requests.Session() as session:
        while True:
            (r := session.get(f'{URL}{page + 1}')).raise_for_status()
            m = re.search(r'.*page=(\d+)$', r.url)
            if m and int(m.group(1)) == page:
                break
            page += 1
            print(f'Scraping page {page}')
            soup = BS(r.text, 'lxml')
            for tag in soup.findAll('div', class_='list-announcement-block'):
                _name = tag.find('a', attrs={'itemprop': 'name'})
                name.append(_name.get('content', 'N/A'))
                if (_link := _name.get('href', None)):
                    link.append(f'{BASE}{_link}')
                    (_r := session.get(link[-1])).raise_for_status()
                    _spanlist = BS(_r.text, 'lxml').find_all('span', class_='value-chars')
                    floor_type.append(_spanlist[0].get_text().strip())
                    balcony.append(_spanlist[1].get_text().strip())
                    garage.append(_spanlist[2].get_text().strip())
                    window_type.append(_spanlist[3].get_text().strip())
                    door_type.append(_spanlist[4].get_text().strip())
                    windows.append(_spanlist[5].get_text().strip())
                    _alist = BS(_r.text, 'lxml').find_all('a', class_='value-chars')
                    commission_year.append(_alist[0].get_text().strip())
                    building_floors.append(_alist[1].get_text().strip())
                    area_sqm.append(_alist[2].get_text().strip())
                    floor.append(_alist[3].get_text().strip())
                    leasing.append(_alist[4].get_text().strip())
                    district.append(_alist[5].get_text().strip())
                    address.append(_alist[6].get_text().strip())
                rooms.append(tag.find('div', attrs={'announcement-block__breadcrumbs'}).get_text().split('»')[1].strip())
                description.append(tag.find('div', class_='announcement-block__description').get_text().strip())
                date.append(tag.find('div', class_='announcement-block__date').get_text().split(',')[0].strip())
                city.append((tag.find('meta', attrs={'itemprop': 'areaServed'})).get('content'))
                if (_price := tag.find('meta', attrs={'itemprop': 'price'})) is None:
                    _price = tag.find('div', class_='announcement-block__price _premium')
                price.append(_price.get_text().strip() if _price else 'N/A')
    df = pd.DataFrame(zip(name, date, address, district, city,
                          price, area_sqm, rooms, floor, commission_year,
                          building_floors, garage, balcony, windows, window_type,
                          floor_type, door_type, leasing, description, link), columns=COLUMNS)
    return df

if __name__ == '__main__':
    df = main()
    df.to_csv(f'{today}HPD.csv', encoding='cp1251', errors='ignore', index=False)
You can combine for loops with Python's range() function.
The range() function provides a sequence of integers based upon the function's arguments.
range(start, stop[, step])
The start argument is the first value in the range. If range() is called with only one argument, then Python assumes start = 0.
The stop argument is the upper bound of the range. It is important to realize that this upper value is not included in the range.
Example:
for i in range(1, 6):
    BASE = 'https://www.unegui.mn'
    URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/{i}-r/?page='
    print(URL)
Output:
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/1-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/2-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/3-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/4-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page=
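Applied to the full code, this loop can wrap the existing pagination loop, with page reset for each district. A rough sketch reusing the question's own pagination check:
import re
import requests

BASE = 'https://www.unegui.mn'
with requests.Session() as session:
    for i in range(1, 6):  # districts 1-r through 5-r
        URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/{i}-r/?page='
        page = 0
        while True:
            (r := session.get(f'{URL}{page + 1}')).raise_for_status()
            m = re.search(r'.*page=(\d+)$', r.url)
            if m and int(m.group(1)) == page:
                break  # redirected back to the current page: no more pages
            page += 1
            print(f'Scraping district {i}, page {page}')
            # ... parse r.text with BeautifulSoup exactly as in main() ...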

Webscraping different URLs - limit

I have coded a web scraper for Auto Trader, but for some reason when iterating through URLs I can only ever get a maximum length of 1300 for my dataframe. There are 13 results per page, so is there some sort of significance to a limit of 100, or am I just doing something wrong? Any help would be greatly appreciated :)
I've attached my code below
# Import required libraries
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

# List of urls
path = 'https://www.autotrader.co.uk/car-search?advertClassification=standard&postcode=RH104JJ&make=&price-from=500&price-to=100000&onesearchad=Used&onesearchad=Nearly%20New&onesearchad=New&advertising-location=at_cars&is-quick-search=TRUE&page='
urls = []
for i in range(1, 500):
    urls.append(path + str(i))

# Lists to store the scraped data in
makes = []
prices = []
ratings = []
dates = []
types = []
miles = []
litres = []
bhps = []
transmissions = []
fuels = []
owners = []
attributes = [makes, ratings, dates, types, miles, litres, bhps, transmissions, fuels, owners]

# Iterate through urls
count = 0
for url in urls:
    count += 1
    if count % 10 == 0:
        print(count)
    # Attempt to connect to the url
    try:
        response = get(url)
    except Exception:
        print('oops')
        continue  # skip this url rather than reusing a stale response
    html_soup = BeautifulSoup(response.text, 'html.parser')
    # Get a list of individual cars and iterate through it
    car_containers = html_soup.find_all('li', class_='search-page__result')
    for container in car_containers:
        try:
            rating = container.find("div", {"class": "js-tooltip"}).find("div", {"class": "pi-indicator js-tooltip-trigger"}).text.strip()
        except AttributeError:
            rating = ''
        ratings.append(rating)
        make = container.h2.text.strip().title().split(' ')[0]
        makes.append(make)
        price = container.find("div", {"class": "vehicle-price"}).text[1:]
        prices.append(price)
        specs = container.find("ul", {"class": "listing-key-specs"}).find_all("li", recursive=True)
        for spec in specs:
            if spec.text.split(' ')[0].isdigit() and len(spec.text.split(' ')[0]) == 4:
                date = spec.text.split(' ')[0]
                dates.append(date)
            if 'mile' in str(spec):
                mile = spec.text.split(' ')[0]
                miles.append(mile)
            if 'l' in str(spec).lower() and str(spec.text)[:-1].replace('.', '').isnumeric() and not spec.text.split(' ')[0].isdigit():
                litre = spec.text[:-1]
                litres.append(litre)
            if any(x in str(spec).lower() for x in ['automatic', 'manual']):
                transmission = spec.text
                transmissions.append(transmission)
            if any(x in str(spec).lower() for x in ['bhp', 'ps']):
                bhp = spec.text
                bhps.append(bhp)
            if any(x in str(spec).lower() for x in ['petrol', 'diesel']):
                fuel = spec.text
                fuels.append(fuel)
            if 'owner' in str(spec):
                owner = spec.text
                owners.append(owner.split(' ')[0])
            typelist = ['hatchback', 'saloon', 'convertible', 'coupe', 'suv', 'mpv', 'estate', 'limousine', 'pickup']
            if any(x in str(spec).lower() for x in typelist):
                typ = spec.text
                types.append(typ)
        # Filling in empty spaces
        for attribute in attributes:
            if len(attribute) < len(prices):
                attribute.append('')

# Creating a dataframe from the lists
df = pd.DataFrame({'makes': makes,
                   'Price': prices,
                   'Rating': ratings,
                   'Year': dates,
                   'Type': types,
                   'Miles': miles,
                   'Litres': litres,
                   'BHP': bhps,
                   'Transmission': transmissions,
                   'Fuel': fuels,
                   'Owners': owners})
Maybe just use a URL shortener if the length of the URL is too long.
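For what it's worth, 1300 rows at 13 results per page is exactly 100 pages, which suggests the site simply stops serving listing pages after page 100 (a plausible server-side cap, not confirmed here). One way to check is to stop as soon as a page comes back empty:
for page_number, url in enumerate(urls, start=1):
    response = get(url)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    car_containers = html_soup.find_all('li', class_='search-page__result')
    if not car_containers:
        print('No results on page', page_number, '- stopping')
        break
    # ... parse car_containers as in the question ...
If the cap is real, the usual workaround is to narrow the search (e.g. by price band or make) so each query returns fewer results than the cap, then combine the scrapes.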

Troubles appending list to a DataFrame

I am having trouble dealing with multiple tags/attributes in one loop and appending them to the DataFrame. More specifically, it concerns the Place loop:
for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
    place = car_item.find('h3', {'class': 'heading'}).text.strip()
    places.append(place)
Appending it to the DataFrame yields only 1 result out of the expected 30.
Thank you in advance.
import requests
import bs4
import pandas as pd

frames = []
for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        sub_soup_txt = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = sub_soup_txt.find('div', {'id': 'car-attributes'})
        soup2 = sub_soup_txt.find('div', {'id': 'vip-seller'})
        tmp = []
        places = []
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).text
            value = car_item.find('span', {'class': 'value'}).text
            tmp.append([key, value])
        for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
            place = car_item.find('h3', {'class': 'heading'}).text.strip()
            places.append(place)
        frames.append(pd.DataFrame(tmp).set_index(0))

df_final = pd.concat((tmp_df for tmp_df in frames), axis=1, join='outer').reset_index()
df_final = df_final.T
df_final.columns = df_final.loc["index"].values
df_final.drop("index", inplace=True)
df_final.reset_index(inplace=True, drop=True)
df_final['Places'] = pd.Series(places)
df_final.to_csv('auto_database.csv')
As you are adding places to the final df, this line (currently sitting inside the for pagenumber in ... / for car in ... loops):
places = []
should go all the way up and out of the main for loop, next to frames:
frames = []
places = []
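In other words, the corrected structure looks roughly like this (only the placement of places = [] changes; everything else stays as in the question):
frames = []
places = []  # accumulates across all pages and cars
for pagenumber in range(0, 2):
    # ... fetch and parse the listing page as in the question ...
    for car in soup_table.findAll('li'):
        # ... fetch the detail page, build tmp and soup2 as in the question ...
        for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
            places.append(car_item.find('h3', {'class': 'heading'}).text.strip())
        frames.append(pd.DataFrame(tmp).set_index(0))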

BeautifulSoup get links and info inside of them

I would like to scrape a website. The website shows 10 complaint previews on each page. I wrote this script to get the links of the 10 complaints and some info inside each link. When I run the script I get this error message: "RecursionError: maximum recursion depth exceeded".
Can someone tell me what the problem is? Thank you in advance!!
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

# Create list objects for each information section
C_date = []
C_title = []
C_text = []
U_name = []
U_id = []
C_count = []
R_name = []
R_date = []
R_text = []

# Get 10 links for preview of complaints
def getLinks(url):
    response = get(url)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    c_containers = html_soup.find_all('div', class_='media')
    # Store wanted links in a list
    allLinks = []
    for link in c_containers:
        find_tag = link.find('a')
        find_links = find_tag.get('href')
        full_link = "".join((url, find_links))
        allLinks.append(full_link)
    # Get total number of links
    print(len(allLinks))
    return allLinks

def GetData(Each_Link):
    each_complaint_page = get(Each_Link)
    html_soup = BeautifulSoup(each_complaint_page.text, 'html.parser')
    # Get date of complaint
    dt = html_soup.main.find('span')
    date = dt['title']
    C_date.append(date)
    # Get title of complaint
    TL = html_soup.main.find('h1', {'class': 'title'})
    Title = TL.text
    C_title.append(Title)
    # Get main text of complaint
    Tx = html_soup.main.find('div', {'class': 'description'})
    Text = Tx.text
    C_text.append(Text)
    # Get user name and id
    Uname = html_soup.main.find('span', {'class': 'user'})
    User_name = Uname.span.text
    User_id = Uname.attrs['data-memberid']
    U_name.append(User_name)
    U_id.append(User_id)
    # Get view count of complaint
    Vcount = html_soup.main.find('span', {'view-count-detail'})
    View_count = Vcount.text
    C_count.append(View_count)
    # Get reply for complaint
    Rpnm = html_soup.main.find('h4', {'name'})
    Reply_name = Rpnm.next
    R_name.append(Reply_name)
    # Get reply date
    Rpdt = html_soup.main.find('span', {'date-tips'})
    Reply_date = Rpdt.attrs['title']
    R_date.append(Reply_date)
    # Get reply text
    Rptx = html_soup.main.find('p', {'comment-content-msg company-comment-msg'})
    Reply_text = Rptx.text
    R_text.append(Reply_text)

link_list = getLinks('https://www.sikayetvar.com/arcelik')
for i in link_list:
    z = GetData(i)
    print(z)
PS: My next step will be to put all information in a data frame
Your GetData() method calls itself, with no base case: this causes infinite recursion:
def GetData(data):
    for i in GetData(data):
You're also calling response = get(i) but then ignoring the result... perhaps you meant to say
def GetData(link):
    i = get(link)
    ...
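For the next step mentioned in the question, once GetData() runs cleanly the lists line up one entry per complaint and can be combined directly (a minimal sketch, assuming every complaint has a reply so all the lists are the same length; the column names are just suggestions):
import pandas as pd

df = pd.DataFrame({'date': C_date, 'title': C_title, 'text': C_text,
                   'user_name': U_name, 'user_id': U_id, 'view_count': C_count,
                   'reply_name': R_name, 'reply_date': R_date, 'reply_text': R_text})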

Python scrape, skipping a <tr> tag and row

Scraping a webpage and encountering an "IndexError: list index out of range".
I am pretty sure it's because a row in the table I am scraping is being used as a header - http://www.wsj.com/mdc/public/page/2_3022-mfsctrscan-moneyflow-20161205.html?mod=mdc_pastcalendar
from urllib2 import urlopen
import requests
from bs4 import BeautifulSoup
import re
import datetime

date = datetime.datetime.today()
url = "http://www.wsj.com/mdc/public/page/2_3022-mfsctrscan-moneyflow-20161205.html?mod=mdc_pastcalendar"
date_time = urlopen(url.format(date=date.strftime('%Y%m%d')))
address = url
print 'Retrieving information from: ' + address
print '\n'
soup = BeautifulSoup(requests.get(address).content, "lxml")
div_main = soup.find('div', {'id': 'column0'})
table_one = div_main.find('table')
rows = table_one.findAll('tr')
if len(soup.findAll('tr')) > 0:
    rows = rows[2:]
# print rows
for row in rows:
    cells = row.findAll('td')
    name = cells[0].get_text()
    last = cells[1].get_text()
    chg = cells[2].get_text()
    pct_chg = cells[3].get_text()
    money_flow = cells[4].get_text()
    tick_up = cells[5].get_text()
    tick_down = cells[6].get_text()
    up_down_Ratio = cells[7].get_text()
    money_flow = cells[8].get_text()
    tick_up = cells[9].get_text()
    tick_down = cells[10].get_text()
    up_down_Ratio = cells[11].get_text()
The intermediate rows with single cells, like "Dow Jones U.S. Total Stock Market Sectors", are the reason you are having this error.
Instead, why don't you pre-define a list of headers and dynamically create a dictionary from the values of the "data" rows, zipping them with the list of headers:
rows = soup.select('div#column0 table tr')[2:]
headers = ['name', 'last', 'chg', 'pct_chg',
           'total_money_flow', 'total_tick_up', 'total_tick_down', 'total_up_down_ratio',
           'block_money_flow', 'block_tick_up', 'block_tick_down', 'block_up_down_ratio']
for row in rows:
    # skip non-data rows
    if row.find("td", class_="pnum") is None:
        continue
    print(dict(zip(headers, [cell.get_text(strip=True) for cell in row.find_all('td')])))
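To keep the rows instead of just printing them, the same dicts can be collected into a list and handed to pandas (a small sketch on top of this answer's loop):
import pandas as pd

records = []
for row in rows:
    if row.find("td", class_="pnum") is None:
        continue  # skip the section-header rows
    records.append(dict(zip(headers, [cell.get_text(strip=True) for cell in row.find_all('td')])))
df = pd.DataFrame(records)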
div_main = soup.find('div', {'id': 'column0'})
table_one = div_main.find('table')

# to id the right row
def target_row(tag):
    is_row = len(tag.find_all('td')) > 5
    row_name = tag.name == 'tr'
    return is_row and row_name

rows = table_one.find_all(target_row)
for row in rows:
    cells = row.findAll('td')
    name = cells[0].get_text()
    last = cells[1].get_text()
    chg = cells[2].get_text()
    pct_chg = cells[3].get_text()
    money_flow = cells[4].get_text()
    tick_up = cells[5].get_text()
    tick_down = cells[6].get_text()
    up_down_Ratio = cells[7].get_text()
    money_flow = cells[8].get_text()
    tick_up = cells[9].get_text()
    tick_down = cells[10].get_text()
    up_down_Ratio = cells[11].get_text()
You can pass a function that returns a bool as find's parameter; this way, your code is much cleaner and more maintainable.
