Merge Python arrays in a loop? - python

I'm currently getting an output of A,A,B,B instead of A,B,A,B.
I really want to associate the values of each table header with each table data element (like a dictionary).
import requests
from bs4 import BeautifulSoup
courseCode = "IFB104"
page = requests.get("https://www.qut.edu.au/study/unit?unitCode=" + courseCode)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find_all(class_='table assessment-item')
numOfTables = 0
tableDataArray = []
for tbl in table:
    numOfTables = numOfTables + 1
    tableDataArray += [tbl.find_all('th'), tbl.find_all('td')]

If I understood correctly, you need to use a dict instead of a list:
import requests
from bs4 import BeautifulSoup
courseCode = "IFB104"
page = requests.get("https://www.qut.edu.au/study/unit?unitCode=" + courseCode)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find_all(class_='table assessment-item')
numOfTables = 0
tableFormatted1 = []
tableFormatted2 = {}
for tbl in table:
    numOfTables = numOfTables + 1
    keys = tbl.find_all('th')
    values = tbl.find_all('td')
    new_data = dict(zip(keys, values))

    # Method 1
    tableFormatted1.append(new_data)

    # Method 2
    for k, v in new_data.items():
        if k in tableFormatted2:
            tableFormatted2[k].append(v)
        else:
            tableFormatted2[k] = [v]
print('List of dictionaries')
print(tableFormatted1)
print('')
print('Dictionary with list')
print(tableFormatted2)
Edited:
Each iteration over tbl was overwriting the result of the previous one, so the data structure needed to change. I've provided two methods above.
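As a side note (not part of the original answer): the keys and values above are stored as bs4 Tag objects. If plain strings are easier to work with, a minimal sketch of the same idea using the cell text could look like this:
for tbl in table:
    # pair each header cell's text with the matching data cell's text
    keys = [th.get_text(strip=True) for th in tbl.find_all('th')]
    values = [td.get_text(strip=True) for td in tbl.find_all('td')]
    tableFormatted1.append(dict(zip(keys, values)))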

Related

Trying to get data from a table using beautifulsoup in python

I'm trying to get the "all splits" line of numbers from https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame (the HTML is shown in the picture). My code returns the 'all splits' text instead of the numbers I'm looking for. How do I go about changing the lookups in the GetStats function to get the numbers instead of the first-column descriptors?
import requests
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import csv

urls = []
data = []
for year in range(2003, 2005):
    for page in range(1, 9):
        url = f'http://www.espn.com/nba/hollinger/statistics/_/page/{page}/year/{year}/qualified/false'
        if url is not None:
            urls.append(url)

def GetData(url):
    names_list = []     # names of players
    pers = []           # player efficiency ratings
    playeridlist = []   # list of player ids to be used in making the new searchable stats url
    statsurls = []      # list of urls generated to get player stats
    # makes a pattern for the function to look for
    pattern = re.compile(r'playerId=(\d+)')
    # sets up soup
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'lxml')
    # finds player names and adds them to the list
    names = soup.find(lambda tag: tag.name == 'a' and 'playerId' in tag['href'])
    bodytext = names.text
    names_list.append(bodytext)
    # finds the player's efficiency rating and adds it to the list
    pertag = soup.find('td', class_='sortcell')
    per = pertag.text
    pers.append(per)
    # finds player id
    names = soup.find('a', href=pattern)
    player_id = names['href'].split('playerId=')[1]
    playeridlist.append(player_id)
    # uses player id to make a list of new urls for that player and get stats
    for player_id in playeridlist:
        statsurl = f"https://insider.espn.com/nba/player/splits/_/id/{player_id}/type/nba/year/{year}/category/perGame"
        if statsurl is not None:
            statsurls.append(statsurl)
    # parses stats to get stats
    def GetStats(statsurl):  # GO BACK AND MAKE A THREAD EXECUTER STATEMENT WITHIN GETDATA FUNCTION BELOW THIS!!!
        statsreq = requests.get(statsurl)
        statssoup = BeautifulSoup(statsreq.text, 'lxml')
        focusing_search = statssoup.find('tr', class_='Table__TR Table__TR--sm Table__even', attrs={'data-idx': '1'})
        playerstathtml = focusing_search.find('td', class_='Table__TD')
        stat_values = [playerstats.text for playerstats in playerstathtml]
        print(stat_values)
    GetStats("https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame")
    #name_and_stats_list = dict(map(lambda i, j: (i, j), names_list, pers))
    print(f"{bodytext}: {per}")
    print(player_id)

GetData('http://www.espn.com/nba/hollinger/statistics/_/page/1/year/2003/qualified/false')
To get the all_splits stats from
https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame
this is what I did:
I grabbed the table body using soup.select.
Then I grabbed the headings and relevant stats by iterating through the columns/rows.
The list comprehensions provide the text in list format, which is easy to convert to a DataFrame.
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame'
soup = BeautifulSoup(requests.get(url).content, "html.parser")
t = soup.select('main#fittPageContainer div.Table__Scroller > table > tbody')
headings = [h.text for h in t[0].find_next('tr').find_all('td')]
all_splits = [h.text for h in t[0].find_all('tr')[1].find_all('td')]
df = pd.DataFrame([all_splits], columns=headings)
print(df)
Output:

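If you later want every split row rather than only the 'All Splits' one, a small extension of the same idea should work, assuming the remaining rows of the table body follow the same layout:
rows = t[0].find_all('tr')
headings = [td.text for td in rows[0].find_all('td')]
# one record per remaining row, keyed by the headings
records = [[td.text for td in row.find_all('td')] for row in rows[1:]]
df_all = pd.DataFrame(records, columns=headings)
print(df_all)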
how to append multiple items to a list in python

I have a list that I want to append multiple items to using Python. When I try to add them, the program crashes and displays the error below.
I already tried the append() and extend() functions, with the same crash and error.
TypeError Traceback (most recent call last)
<ipython-input-24-1e9480855366> in <module>
121
122
--> 123 joineddd.extend(link,jobdesc,alldd)
124
125
TypeError: extend() takes exactly one argument (3 given)
code:
import time
import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(
    requests.get("https://www.bayt.com/en/international/jobs/executive-chef-jobs/").content,
    "lxml"
)
links = []
for a in soup.select("h2.m0.t-regular a"):
    if a['href'] not in links:
        links.append("https://www.bayt.com" + a['href'])

joineddd = []
for link in links:
    print(link)
    s = BeautifulSoup(requests.get(link).content, "lxml")
    jobdesc = s.select_one("div[class='card-content is-spaced'] p")
    print(jobdesc.text)
    alldt = [dt.text for dt in s.select("div[class='card-content is-spaced'] dt")]
    dt_Job_location = alldt[0]
    dt_Job_Company_Industry = alldt[1]
    dt_Job_Company_Type = alldt[2]
    if len(alldt[3]) > 0:
        dt_Job_Job_Role = alldt[3]
    elif len(dt_Job_Employment_Type) > 0:
        dt_Job_Employment_Type = alldt[4]
    alldd = [dd.text for dd in s.select("div[class='card-content is-spaced'] dd")]
    dd_job_location = alldd[0]
    dd_job_Company_Industry = alldd[1]
    dd_job_Company_Type = alldd[2]
    if len(alldd[3]) > 0:
        dd_job_Job_Role = alldd[3]
    elif len(dd_job_Employment_Type) > 0:
        dd_job_Employment_Type = alldd[4]
    print(f"{dt_Job_location}:{dd_job_location}\n{dt_Job_Company_Industry}:{dd_job_Company_Industry}\n\n")
    print("-" * 80)
    joineddd.extend(link,jobdesc,alldd)
expected result:
[link,description,location,Company_Industry,Company_Type,Job_Role,Employment_Type ]
You can use the list.extend method, but put square brackets around the items like so:
lst = [1, 2, 3]
lst.extend([4, 5])
print(lst)
>> returns [1, 2, 3, 4, 5]
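For contrast, append() adds its argument as a single element, which is why it did not give a flat result either:
lst = [1, 2, 3]
lst.append([4, 5])
print(lst)
>> returns [1, 2, 3, [4, 5]]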
For extend() you need to provide a single iterable (list/tuple/dict/set) to add multiple items, like:
joineddd.extend([link,jobdesc,alldd])
import time
import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(
    requests.get("https://www.bayt.com/en/international/jobs/executive-chef-jobs/").content,
    "lxml"
)
links = []
for a in soup.select("h2.m0.t-regular a"):
    if a['href'] not in links:
        links.append("https://www.bayt.com" + a['href'])

joineddd = []
for link in links:
    print(link)
    s = BeautifulSoup(requests.get(link).content, "lxml")
    jobdesc = s.select_one("div[class='card-content is-spaced'] p")
    print(jobdesc.text)
    alldt = [dt.text for dt in s.select("div[class='card-content is-spaced'] dt")]
    dt_Job_location = alldt[0]
    dt_Job_Company_Industry = alldt[1]
    dt_Job_Company_Type = alldt[2]
    if len(alldt[3]) > 0:
        dt_Job_Job_Role = alldt[3]
    elif len(dt_Job_Employment_Type) > 0:
        dt_Job_Employment_Type = alldt[4]
    alldd = [dd.text for dd in s.select("div[class='card-content is-spaced'] dd")]
    dd_job_location = alldd[0]
    dd_job_Company_Industry = alldd[1]
    dd_job_Company_Type = alldd[2]
    if len(alldd[3]) > 0:
        dd_job_Job_Role = alldd[3]
    elif len(dd_job_Employment_Type) > 0:
        dd_job_Employment_Type = alldd[4]
    print(f"{dt_Job_location}:{dd_job_location}\n{dt_Job_Company_Industry}:{dd_job_Company_Industry}\n\n")
    print("-" * 80)
    joineddd.extend([link, jobdesc, alldd])
dataframe (wrapping each column in pd.Series pads the shorter one with NaN):
import pandas as pd
import numpy as np
array1 = ["value1", "value2"]
array2 = ["value1"]
df = dict(A=np.array(array1), B=np.array(array2))
_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df.items()]))
print(_df)
_df.to_csv("filename.csv", index=False, encoding="utf-8")
output:
        A       B
0  value1  value1
1  value2     NaN
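One more observation (not part of the answers above): extend([link, jobdesc, alldd]) still leaves alldd nested as a single element inside joineddd. If the goal is the flat row shown in the expected result, a sketch along these lines may be closer, assuming jobdesc should be stored as its text:
# one flat row per job posting: link, description, then each dd value
row = [link, jobdesc.text] + alldd
joineddd.append(row)
That way joineddd ends up as a list of rows, one per posting.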

Troubles appending list to a DataFrame

I'm having trouble dealing with multiple tags/attributes in one loop and appending them to the DataFrame. More specifically, it concerns the Place loop:
for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
    place = car_item.find('h3', {'class': 'heading'}).text.strip()
    places.append(place)
Appending it to the DataFrame yields only 1 result out of the expected 30.
Thank you in advance.
import requests
import bs4
import pandas as pd

frames = []
for pagenumber in range(0, 2):
    url = 'https://www.marktplaats.nl/l/auto-s/p/'
    txt = requests.get(url + str(pagenumber))
    soup = bs4.BeautifulSoup(txt.text, 'html.parser')
    soup_table = soup.find('ul', 'mp-Listings mp-Listings--list-view')
    for car in soup_table.findAll('li'):
        link = car.find('a')
        sub_url = 'https://www.marktplaats.nl/' + link.get('href')
        sub_soup = requests.get(sub_url)
        sub_soup_txt = bs4.BeautifulSoup(sub_soup.text, 'html.parser')
        soup1 = sub_soup_txt.find('div', {'id': 'car-attributes'})
        soup2 = sub_soup_txt.find('div', {'id': 'vip-seller'})
        tmp = []
        places = []
        for car_item in soup1.findAll('div', {'class': 'spec-table-item'}):
            key = car_item.find('span', {'class': 'key'}).text
            value = car_item.find('span', {'class': 'value'}).text
            tmp.append([key, value])
        for car_item in soup2.findAll('ul', {'class': 'seller-info-links'}):
            place = car_item.find('h3', {'class': 'heading'}).text.strip()
            places.append(place)
        frames.append(pd.DataFrame(tmp).set_index(0))

df_final = pd.concat((tmp_df for tmp_df in frames), axis=1, join='outer').reset_index()
df_final = df_final.T
df_final.columns = df_final.loc["index"].values
df_final.drop("index", inplace=True)
df_final.reset_index(inplace=True, drop=True)
df_final['Places'] = pd.Series(places)
df_final.to_csv('auto_database.csv')
As you are adding places to the final df, this line (which currently sits inside the for pagenumber in ... / for car in ... loops)
places = []
should move all the way up, out of the main for loop, next to
frames = []
places = []
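A minimal sketch of that change (only the lines that move; the rest of the scraping loop stays exactly as in the question):
frames = []
places = []  # initialised once, so every car's place accumulates across pages
for pagenumber in range(0, 2):
    # ... the rest of the loop, unchanged ...
With places created only once, all 30 collected values are still there when pd.Series(places) is built at the end.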

Saving multiple data frames from loop

I have been searching for a solution to my problem, but all the answers I find use print() at the end rather than saving the data frames as I would like to.
Below I have an (almost) functioning piece of code that prints 3 separate tables. How do I save these three tables in 3 separate data frames named matches_october, matches_november and matches_december?
The last line in my code is not working as I want it to. I hope it is clear what I would like the code to do (saving a data frame at the end of each of the 3 rounds of the loop).
import pandas as pd
import requests
from bs4 import BeautifulSoup
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'
for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    print(df)
    matches + valid_pages = df[0]
You could write a case for each month, but that's not very robust (and it's rather ugly).
if i == 'october':
    matches_october = pd.read_html(str(table))
if i == 'november':
    # so on and so forth
A more elegant solution is to use a dictionary. Before the loop, declare matches = {}. Then, in each iteration:
matches[i] = pd.read_html(str(table))
Then you can access the October matches DataFrame via matches['october'].
You can't compose variable names using +; try using a dict instead:
import pandas as pd
import requests
from bs4 import BeautifulSoup
matches = {} # create an empty dict
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'
for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    print(df)
    matches[i] = df[0]  # store it in the dict
Thanks guys. That worked! :)
import pandas as pd
import requests
from bs4 import BeautifulSoup
matches = {} # create an empty dict
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'
for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    matches[i] = df[0]  # store it in the dict
matches_october = matches['october']
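If the goal is to also save each month's table to disk rather than only keep it in the dict, one option (a sketch; the file names are just an example) is to write every entry to its own CSV:
for month, month_df in matches.items():
    # writes matches_october.csv, matches_november.csv, matches_december.csv
    month_df.to_csv('matches_{}.csv'.format(month), index=False)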

I am trying to scrape data but it gets data only of 10 pages whereas there are 26 pages

import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.flipkart.com/search?as=on&as-pos=1_1_ic_lapto&as-show=on&otracker=start&page=1&q=laptop&sid=6bo%2Fb5g&viewType=list")
c = r.content
soup = BeautifulSoup(c, "html.parser")
all = soup.find_all("div", {"class": "col _2-gKeQ"})
page_nr = soup.find_all("a", {"class": "_33m_Yg"})[-1].text
print(page_nr, "number of pages were found")
#all[0].find("div",{"class":"_1vC4OE _2rQ-NK"}).text

l = []
base_url = "https://www.flipkart.com/search?as=on&as-pos=1_1_ic_lapto&as-show=on&otracker=start&page=1&q=laptop&sid=6bo%2Fb5g&viewType=list"
for page in range(0, int(page_nr) * 10, 10):
    print()
    r = requests.get(base_url + str(page) + ".html")
    c = r.content
    #c = r.json()["list"]
    soup = BeautifulSoup(c, "html.parser")
    for item in all:
        d = {}
        # price
        d["Price"] = item.find("div", {"class": "_1vC4OE _2rQ-NK"}).text
        # Name
        d["Name"] = item.find("div", {"class": "_3wU53n"}).text
        for li in item.find_all("li", {"class": "_1ZRRx1"}):
            if " EMI" in li.text:
                d["EMI"] = li.text
            else:
                d["EMI"] = None
        for li1 in item.find_all("li", {"class": "_1ZRRx1"}):
            if "Special " in li1.text:
                d["Special Price"] = li1.text
            else:
                d["Special Price"] = None
        for val in item.find_all("li", {"class": "tVe95H"}):
            if "Display" in val.text:
                d["Display"] = val.text
            elif "Warranty" in val.text:
                d["Warrenty"] = val.text
            elif "RAM" in val.text:
                d["Ram"] = val.text
        l.append(d)

import pandas
df = pandas.DataFrame(l)
This might work for standard pagination:
i = 1
items_parsed = set()
loop = True
base_url = "https://www.flipkart.com/search?as=on&as-pos=1_1_ic_lapto&as-show=on&otracker=start&page={}&q=laptop&sid=6bo%2Fb5g&viewType=list"
while True:
    page = requests.get(base_url.format(i))
    items = ...  # select #yourelements# from the fetched page here
    if not items:
        break
    for item in items:
        # Scrape your item; once the scrape has succeeded, put the url of the
        # parsed item into url_parsed (details below the code), for example:
        url_parsed = your_stuff(item)
        if url_parsed in items_parsed:
            loop = False
        items_parsed.add(url_parsed)
    if not loop:
        break
    i += 1
I formatted your URL so that ?page=X is filled in by base_url.format(i); that way it can iterate until no items are found on a page, OR, as sometimes happens, until the site sends you back to page 1 once you go past max_page + 1.
If, beyond the maximum page, you get the same items you already parsed on the first page, you can declare a set(), add the URL of every item you parse, and then check whether it has already been parsed.
Note that this is just an idea.
Since the page number in the URL is almost in the middle, I'd apply a similar change to your code:
base_url = "https://www.flipkart.com/search?as=on&as-pos=1_1_ic_lapto&as-show=on&otracker=start&page="
end_url = "&q=laptop&sid=6bo%2Fb5g&viewType=list"
for page in range(1, int(page_nr) + 1):
    r = requests.get(base_url + str(page) + end_url)
You only have access to the first 10 pages from the initial URL.
You can make a loop from "&page=1" to "&page=26".
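For example, a sketch of that loop (assuming the result-card class names from the question are still valid; the query parameters are reordered so the page number can simply be appended at the end), with the parsing moved inside the loop so each page's items are actually re-selected:
l = []
base_url = "https://www.flipkart.com/search?as=on&as-pos=1_1_ic_lapto&as-show=on&otracker=start&q=laptop&sid=6bo%2Fb5g&viewType=list&page="
for page in range(1, 27):  # pages 1 to 26
    r = requests.get(base_url + str(page))
    soup = BeautifulSoup(r.content, "html.parser")
    # re-select the result cards on every page instead of reusing the first page's list
    for item in soup.find_all("div", {"class": "col _2-gKeQ"}):
        d = {}
        d["Name"] = item.find("div", {"class": "_3wU53n"}).text
        d["Price"] = item.find("div", {"class": "_1vC4OE _2rQ-NK"}).text
        l.append(d)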
