Save a dictionary to Excel with Python

I need your help saving data to Excel. I've parsed a site and now need to write a dictionary to an Excel file.
from scrapingbee import ScrapingBeeClient
import requests
from bs4 import BeautifulSoup
import pandas as pd
SCRAPINGBEE_API_KEY = "bzzzz"
endpoint = "https://app.scrapingbee.com/api/v1"
pages = [
'https://www.businesslist.com.ng/category/restaurants/1/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/2/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/3/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/4/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/5/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/6/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/7/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/8/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/9/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/10/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/11/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/12/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/13/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/14/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/15/city:lagos'
]
rest = []
#GET_LINKS
for url in pages[:1]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    body = soup.find_all('h4')
    for items in body:
        item = items.find('a').get('href')
        item_link = 'https://www.businesslist.com.ng' + item
        rest.append(item_link)
#GET_REST
for url in rest[:2]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    info = {}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    restaraunt_name = soup.find('b', {'id':'company_name'}).text
    info.update({'Restaraunt':restaraunt_name})
    location = soup.find('div', {'class':'text location'}).text.split('View Map')[0]
    info.update({'Location':location})
    phone = soup.find('div', {'class':'text phone'}).text[:11]
    info.update({'Phone':phone})
    web = soup.find('div', {'class':'text weblinks'}).text
    info.update({'web':web})
    df = pd.DataFrame(info)
    df.to_excel('./Lagos.xlsx')
I take each link from the list 'rest', fetch its page, and store the scraped fields in the dictionary 'info', which I then want to save to an Excel file. But the code saves only one row to the file instead of all of them. I must have missed something.

You are building df inside the loop and overwriting it under the same name on every iteration, so only the last row ends up in the Excel file. Create an empty DataFrame outside the loop, accumulate the rows inside it, and write it to Excel once the loop has finished.
Your altered code would look like this:
all_info = pd.DataFrame()
for url in rest[:2]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    info = {}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    restaraunt_name = soup.find('b', {'id':'company_name'}).text
    info.update({'Restaraunt':restaraunt_name})
    location = soup.find('div', {'class':'text location'}).text.split('View Map')[0]
    info.update({'Location':location})
    phone = soup.find('div', {'class':'text phone'}).text[:11]
    info.update({'Phone':phone})
    web = soup.find('div', {'class':'text weblinks'}).text
    info.update({'web':web})
    # Wrap info in a list so it becomes exactly one row, then append it to the
    # accumulated frame. DataFrame.append was removed in pandas 2.0, so pd.concat
    # is used here instead.
    all_info = pd.concat([all_info, pd.DataFrame([info])], ignore_index=True)

all_info.to_excel('./Lagos.xlsx')

How about creating a list with all the data, converting that to a DataFrame, and then writing it to an Excel file?
from scrapingbee import ScrapingBeeClient
import requests
from bs4 import BeautifulSoup
import pandas as pd
SCRAPINGBEE_API_KEY = "zzzzzzzzz"
endpoint = "https://app.scrapingbee.com/api/v1"
pages = [
'https://www.businesslist.com.ng/category/restaurants/1/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/2/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/3/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/4/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/5/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/6/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/7/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/8/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/9/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/10/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/11/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/12/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/13/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/14/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/15/city:lagos'
]
rest = []
#GET_LINKS
for url in pages[:1]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    body = soup.find_all('h4')
    for items in body:
        item = items.find('a').get('href')
        item_link = 'https://www.businesslist.com.ng' + item
        rest.append(item_link)
#GET_REST
data = []
for url in rest[:2]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    info = {}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    restaraunt_name = soup.find('b', {'id':'company_name'}).text
    info.update({'Restaraunt':restaraunt_name})
    location = soup.find('div', {'class':'text location'}).text.split('View Map')[0]
    info.update({'Location':location})
    phone = soup.find('div', {'class':'text phone'}).text[:11]
    info.update({'Phone':phone})
    web = soup.find('div', {'class':'text weblinks'}).text
    info.update({'web':web})
    data.append(info)

df = pd.DataFrame(data)
df.to_excel('./Lagos.xlsx')
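One small follow-up on the final write (an optional tweak, not part of the original answer): pandas includes the DataFrame's numeric index as the first column of the sheet unless you pass index=False.

# Optional: drop the numeric row index from the Excel output.
df.to_excel('./Lagos.xlsx', index=False)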

Related

How to get content by table row and append "None" if not applicable?

I have a problem when iterating through many links. I scrape by column CSS selector, but it seems that not every player has a rating on every page. How do I get a "None" appended to the home_rating list when a specific player row in the starting eleven has no rating available?
I basically need to scrape every column entry per row. Thanks a lot for your support.
import re

import requests
from bs4 import BeautifulSoup

gamedays_url = range(1, 35)
url_list = []
daylinks = []

for gameday in gamedays_url:
    url = "https://www.transfermarkt.de/premier-league/spieltag/wettbewerb/L1/plus/?saison_id=2018&spieltag=" + str(gameday)
    url_list.append(url)
    response = requests.get(url, headers={'User-Agent': 'Custom5'})

gameLinks = []
for i in range(len(url_list)):
    page = url_list
    tree = requests.get(page[i], headers={'User-Agent': 'Custom5'})
    soup_2 = BeautifulSoup(tree.content, 'html.parser')
    links_2 = soup_2.find_all("a", {"class": "liveLink"}, href=re.compile("spielbericht"))
    for j in range(len(links_2)):
        gameLinks.append(links_2[j].get('href').split('/')[4])

for j in range(len(gameLinks)):
    gameLinks[j] = "https://www.transfermarkt.de/spiele/aufstellung/spielbericht/" + gameLinks[j]

home_id = []
home_name = []
homerating = []

for p in range(len(gameLinks)):
    page = gameLinks[p]
    response = requests.get(page, headers={'User-Agent': 'Custom5'})
    lineup_data = response.text
    soup = BeautifulSoup(lineup_data, 'html.parser')
    test = soup.find('div', class_='responsive-table')
    for homeid in test.find_all('a', href=re.compile('profil/spieler')):
        home_id.append(homeid.get('href').split('/')[4])
    for homename in test.find_all('a', href=re.compile('profil/spieler')):
        home_name.append(homename.get('href').split('/')[1])
    for grade in test.find_all('span', class_=None):
        homerating.append(grade.text.split()[0])
        homerating.append(None)
Check whether your selected element is available and scrape its text, else set it to None:
row.select_one('span:not([class])').get_text(strip=True) if row.select('span:not([class])') else None
Also, try working with structured dicts instead of separate lists.
Example
import requests
from bs4 import BeautifulSoup
data = []
for gameday in range(1, 3):
    url = "https://www.transfermarkt.de/premier-league/spieltag/wettbewerb/L1/plus/?saison_id=2018&spieltag=" + str(gameday)
    response = requests.get(url, headers={'User-Agent': 'Custom5'})
    soup = BeautifulSoup(response.content, 'html.parser')

    for a in soup.select('a.liveLink[href*="spielbericht"]'):
        report_url = 'https://www.transfermarkt.de/spiele/aufstellung/spielbericht/' + a.get('href').split('/')[-1]
        response = requests.get(report_url, headers={'User-Agent': 'Custom5'})
        soup = BeautifulSoup(response.text, 'html.parser')

        for row in soup.select_one('table.items').select('tr:has(table)'):
            data.append({
                'home_id': row.select_one('a').get('href').split('/')[-1],
                'home_name': row.select_one('a img').get('title'),
                'home_rating': row.select_one('span:not([class])').get_text(strip=True) if row.select('span:not([class])') else None
            })
data
Output
[...{'home_id': '45672', 'home_name': 'Kevin Trapp', 'home_rating': '3,4'},{'home_id': '256866', 'home_name': 'Carlos Salcedo', 'home_rating': None},{'home_id': '58178', 'home_name': 'David Abraham', 'home_rating': '3,4'}, {'home_id': '146258', 'home_name': 'Jetro Willems', 'home_rating': '5,5'},...]
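If you want the scraped rows as a table, here is a minimal follow-up sketch (assuming pandas is installed; the ratings use a German decimal comma, so they are converted to floats and missing ratings become NaN):

import pandas as pd

df = pd.DataFrame(data)
# '3,4' -> 3.4; rows whose home_rating is None end up as NaN.
df['home_rating'] = pd.to_numeric(
    df['home_rating'].str.replace(',', '.', regex=False), errors='coerce'
)
print(df.head())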

Multiple POST requests within the same session are not working while web scraping an ASP.NET site

I am trying to scrape timetable data from my university's webpage (link provided in the code). On the page I have to select a course from a drop-down list and then the year of study (1-4); after selecting both, the timetable is shown. I have tried implementing this in Python. requests, urllib, and BeautifulSoup have helped me scrape normal pages before, but I am not able to do so in this scenario. Please help me with this.
from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import *
import requests
import json
if __name__ == "__main__":
url = "http://www.timetable.ul.ie/UA/CourseTimetable.aspx"
with requests.Session() as session:
r = session.get(url)
cookies = r.cookies
soup = BeautifulSoup(r.text, 'html.parser')
viewstate = soup.select("#__VIEWSTATE")[0]['value']
viewstategen = soup.select("#__VIEWSTATEGENERATOR")[0]['value']
eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
headers = {'User-Agent':'Mozilla/5.0','Referer': 'https://www.timetable.ul.ie/UA/CourseTimetable.aspx'}
data = {
'__EVENTTARGET': 'ctl00$HeaderContent$CourseDropdown',
'__VIEWSTATE': viewstate,
'__VIEWSTATEGENERATOR': viewstategen,
'__EVENTVALIDATION': eventvalidation,
'ctl00$HeaderContent$CourseDropdown':'LM338-Master+of+Science+in+Software+Engineering'
}
r1 = requests.post(url=url, data=json.dumps(data), cookies=cookies, headers = headers)
soup1 = BeautifulSoup(r1.text, 'html.parser')
viewstate= soup1.select("#__VIEWSTATE")[0]['value']
viewstategen = soup1.select("#__VIEWSTATEGENERATOR")[0]['value']
eventvalidation = soup1.select("#__EVENTVALIDATION")[0]['value']
data = {
'__EVENTTARGET': 'ctl00$HeaderContent$CourseYearDropdown',
'__VIEWSTATE': viewstate,
'__VIEWSTATEGENERATOR': viewstategen,
'__EVENTVALIDATION': eventvalidation,
'ctl00$HeaderContent$CourseYearDropdown': '1'
}
r2 = requests.post(url=url, data=json.dumps(data), cookies=cookies, headers = headers)
soup2 = BeautifulSoup(r2.text, 'html.parser')
print(soup2)```
The output I am getting now is just the full list of courses in the university, i.e. the contents of the drop-down list.
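One likely culprit, offered as an assumption rather than something the answer below states explicitly: an ASP.NET WebForms postback expects ordinary form-encoded fields, but the code above sends data=json.dumps(data), i.e. a JSON string. A minimal corrective sketch for that line (the answer below simply posts the dict through the session) would be:

# Let requests form-encode the dict and reuse the session's cookies.
r1 = session.post(url, data=data, headers=headers)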
This example gets the timetable for the first course and the first year. You can adapt it to get timetables for all courses and all years by looping over the courses list (see the sketch after the sample output):
import requests
from bs4 import BeautifulSoup
url1 = "https://www.timetable.ul.ie/UA/Default.aspx"
url2 = "https://www.timetable.ul.ie/UA/CourseTimetable.aspx"
with requests.session() as s:
    s.get(url1)  # load cookies
    soup = BeautifulSoup(s.get(url2).content, "html.parser")

    data = {
        "__EVENTTARGET": "ctl00$HeaderContent$CourseDropdown",
        "__EVENTARGUMENT": "",
        "__LASTFOCUS": "",
        "ctl00$HeaderContent$CourseDropdown": "",  # <-- this will be filled from the `courses` list below
    }
    for inp in soup.select("input[value]"):
        data[inp["name"]] = inp["value"]

    courses = [
        opt["value"]
        for opt in soup.select("#HeaderContent_CourseDropdown option")
        if opt["value"] != "-1"
    ]

    # get timetable for first course:
    data["ctl00$HeaderContent$CourseDropdown"] = courses[0]
    soup = BeautifulSoup(s.post(url2, data=data).content, "html.parser")

    # uncomment this to print all courses:
    # print(*courses, sep="\n")

    years = [
        opt["value"]
        for opt in soup.select("#HeaderContent_CourseYearDropdown option")
        if opt["value"] != "-1"
    ]

    # get timetable for first year:
    data["ctl00$HeaderContent$CourseYearDropdown"] = years[0]
    for inp in soup.select("input[value]"):
        data[inp["name"]] = inp["value"]
    soup = BeautifulSoup(s.post(url2, data=data).content, "html.parser")

    # print some data:
    print(
        soup.select_one("table#MainContent_CourseTimetableGridView").get_text(
            strip=True, separator="\n"
        )
    )
Prints:
Monday
Tuesday
Wednesday
Thursday
Friday
Saturday
09:00 - 10:00
AC4002 - TUT - 3J
EGAN VANESSA MS
Wks:1-9,11-13
Online Yr 1
09:00 - 10:00
EC4102 - TUT - 3J
TA1ECO
...
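A hedged sketch of that adaptation, reusing the names from the answer above (s, url2, data, courses) and following the same post pattern; it is untested against the live site, so treat it as an outline rather than a verified implementation:

# (This loop belongs inside the same `with requests.session() as s:` block as above.)
# Loop over every course, re-read the year drop-down for each one, and
# fetch the timetable for every year of that course.
for course in courses:
    data["ctl00$HeaderContent$CourseDropdown"] = course
    course_soup = BeautifulSoup(s.post(url2, data=data).content, "html.parser")

    years = [
        opt["value"]
        for opt in course_soup.select("#HeaderContent_CourseYearDropdown option")
        if opt["value"] != "-1"
    ]

    for year in years:
        # refresh the hidden ASP.NET fields returned with the last response
        for inp in course_soup.select("input[value]"):
            data[inp["name"]] = inp["value"]
        data["ctl00$HeaderContent$CourseYearDropdown"] = year
        year_soup = BeautifulSoup(s.post(url2, data=data).content, "html.parser")

        table = year_soup.select_one("table#MainContent_CourseTimetableGridView")
        if table:  # some course/year combinations may have no timetable
            print(course, year)
            print(table.get_text(strip=True, separator="\n"))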

BeautifulSoup for loop extracts only first-page data

I have a txt file with 2 URLs in it:
https://www.kununu.com/de/volkswagen/kommentare
https://www.kununu.com/de/audi/kommentare
I want to extract some data from all pages of those URLs with BeautifulSoup. The code below extracts that data, but only for the first page. I must be missing something; can you update the code so it extracts from all pages?
import re

import requests
import pandas as pd
from bs4 import BeautifulSoup

firma = []
lineList2 = [line.rstrip('\n') for line in open(r"C:/myfolder/555.txt")]
print(lineList2)

for url in lineList2:
    with requests.Session() as session:
        session.headers = {
            'x-requested-with': 'XMLHttpRequest'
        }
        page = 1
        while True:
            print(f"Processing page {page}..")
            url = f'{url}/{page}'
            response = session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            articles = soup.find_all('article')
            print("Number of articles: " + str(len(articles)))
            for article in articles:
                try:
                    firmaText = article.find('div', text=re.compile(r'Firma')).find_next('div').text.strip()
                    firma.append(firmaText)
                except:
                    firma.append('N/A')
            page += 1
            pagination = soup.find_all('div', {'class': 'paginationControl'})
            if not pagination:
                break

df = pd.DataFrame({
    'Company': firma
})
print(df)
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

firma = []
lineList2 = []
with open('555.txt', 'r') as file:
    lines = file.readlines()
    for line in lines:
        lineList2.append(line.strip('\n'))
print(lineList2)

for lurl in lineList2:
    with requests.Session() as session:
        session.headers = {
            'x-requested-with': 'XMLHttpRequest'
        }
        page = 1
        while True:
            print("in while")
            print(f"Processing page {page}..")
            # Build the page URL from the unchanged base URL `lurl` each time;
            # the original code reassigned `url` itself, so the page number was
            # appended to an already-modified URL on every iteration.
            url = f'{lurl}/{page}'
            print(url)
            response = session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            articles = soup.find_all('article')
            print("Number of articles: " + str(len(articles)))
            for article in articles:
                try:
                    firmaText = article.find('div', text=re.compile(r'Firma')).find_next('div').text.strip()
                    firma.append(firmaText)
                except:
                    firma.append('N/A')
            page += 1
            pagination = soup.find_all('div', {'class': 'paginationControl'})
            if not pagination:
                break

df = pd.DataFrame({
    'Company': firma
})
print(df)

Why am I still getting results as HTML using BeautifulSoup?

I am writing a simple scraper for job postings. My function extract_fulltext is responsible for returning the full job description, but I still get HTML tags in the response, and in this case it raises InvalidSchema("No connection adapters were found for '%s'" % url). The full traceback is here: https://gist.github.com/SkyBulk/c6df488ef53ae6bc62c86670cfbd09ec
def extract_fulltext(url):
    html = requests.get(url)
    job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html.text))
    ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' + urllib.parse.quote(job_ids)
    ajax_content = requests.get(ajax_url)
    soup = BeautifulSoup(ajax_content.text, "lxml")
    text = soup.getText()
    return soup
response = requests.get(url, headers=self.headers)
data = response.text
soup = get_soup(data)
html = soup.find_all(name="div", attrs={"class": "row"})

for page in html:
    print(page)
    prefix = ['30', 'monaten', 'meses', 'luni', 'mois', 'month', 'months', 'maanden',
              'mesi', 'mies.', 'm\u00e5nader', '\u043c\u0435\u0441\u044f\u0446\u0435\u0432']
    date_str = extract_date(page)
    s_date = date_str.replace('+', '')
    match = [prefix_match for prefix_match in prefix if prefix_match in s_date]
    if len(match) > 0:
        pass
    elif "NOT_FOUND" in s_date:
        pass
    else:
        self.data_extracted['jobs'].append({
            'job_title': extract_job_title(page),
            'company': extract_company(page),
            'city': extract_location(page),
            'date': extract_date(page),
            'cleared': extract_fulltext(page),
            'url': [self.urls[country] + extract_link(page)]
        })
I expect output like {"job_id": "description"}, but the actual output is an error.
You can build your approach on this solution:
import requests, json
from bs4 import BeautifulSoup

req = requests.get('https://www.indeed.com/rpc/jobdescs?jks=80635306093cf18a,7496998d9ee18bdc')
data = json.loads(req.text)
for id in data.keys():
    soup = BeautifulSoup(data[id], 'html.parser')
    print(soup.text)
Simply use .get_text():
def extract_fulltext(url):
    html = requests.get(url)
    job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html.text))
    ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' + urllib.parse.quote(job_ids)
    ajax_content = requests.get(ajax_url)
    soup = BeautifulSoup(ajax_content.text, "lxml")
    text = soup.get_text()
    return text
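A hedged note on the InvalidSchema error from the question: requests raises it when the value it is given is not an http(s) URL it can handle, and here extract_fulltext(page) receives a parsed <div> element rather than a link. Building the job URL first, reusing the same expression the question already uses for its 'url' field (the exact behaviour of extract_link is assumed here), avoids that:

# Sketch: pass a URL string to extract_fulltext, not a bs4 Tag.
job_url = self.urls[country] + extract_link(page)
description = extract_fulltext(job_url)  # then use description for the 'cleared' field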

How to scrape a web table with multiple pages using R or Python

I want to scrape a website to gather data for a data-mining study. The data is in a big table spread over 43 pages, and some stocks are hidden behind the expand menu on the far right-hand side.
The web page is below:
http://data.10jqka.com.cn/market/longhu/yyb/
import bs4
import requests

url = r"http://data.10jqka.com.cn/market/longhu/yyb/"
response = requests.get(url)
if response.status_code == 200:
    content = response.content
    soup = bs4.BeautifulSoup(content, 'html.parser')
    table_results = soup.findAll("table", {"class": "m_table"})
    for item in table_results:
        company_name = item.findAll("td", {"class": "tl"})[0].text.strip()
        detail = item.findAll("td", {"class": "tc"})[0].text.strip()
        c_rise = item.findAll("td", {"class": "c_rise"})[0].text.strip()
        c_fall = item.findAll("td", {"class": "c_fall"})[0].text.strip()
        cur = item.findAll("td", {"class": "cur"})[0].text.strip()
        lhb_stocklist = item.findAll("div", {"class": "lhb_stocklist"})[0].text.strip()
        print(company_name, detail, c_rise, c_fall, lhb_stocklist)
A solution based on requests, BeautifulSoup, and lxml:
import json
import requests
from bs4 import BeautifulSoup
URL = 'http://data.10jqka.com.cn/interface/market/longhuyyb/stocknum/desc/%d/20'
# configure end_page as needed, or parse http://data.10jqka.com.cn/market/longhu/yyb/ to adapt it automatically
end_page = 2
result = []
for page_idx in range(1, end_page + 1):
    print('Extracting page', page_idx)
    raw_response = requests.get(URL % page_idx)
    page_content = json.loads(raw_response.text)['data']
    html = BeautifulSoup(page_content, 'lxml')
    for row in html.tbody.find_all('tr'):
        company = row.find(class_='tl').text
        detail_link = row.find(class_='tl').a['href']
        buy = float(row.find(class_='c_rise').text)
        sell = float(row.find(class_='c_fall').text)
        stock_cnt = int(row.find(class_='cur').text)
        stocks = []
        for a in row.find(class_='lhb_stocklist_box hide').p.find_all('a'):
            stocks.append((a.text, a['href']))
        result.append({
            'company': company,
            'detail_link': detail_link,
            'buy': buy,
            'sell': sell,
            'stock_cnt': stock_cnt,
            'stocks': stocks,
        })

print('Company number:', len(result))
I put all the data into a list of dictionaries for easy access. You can modify the code to write directly to a CSV or any other format you need; a sketch follows below.
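For example, a minimal way to dump result to CSV (a sketch assuming pandas is available; the 'longhu.csv' filename is just an example, and the nested stocks list is stored as-is in a single column):

import pandas as pd

# Flatten the list of dictionaries into a table and write it out.
# utf-8-sig keeps the Chinese company names readable when opened in Excel.
pd.DataFrame(result).to_csv('longhu.csv', index=False, encoding='utf-8-sig')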
