import csv
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.cbssports.com/nba/stats/playersort/nba/year-2019-season-preseason-category-scoringpergame")
soup = BeautifulSoup(page.content, 'html.parser')
for record in soup.find_all('tr'):
try:
print(record.contents[0].text)
print(record.contents[6].text)
print(record.contents[7].text)
print(record.contents[8].text)
print(record.contents[9].text)
print(record.contents[10].text)
print(record.contents[12].text)
print(record.contents[13].text)
print(record.contents[14].text)
print(record.contents[15].text)
except:
pass
print('\n')
def scrape_data(url):
response = requests.get("https://www.cbssports.com/nba/stats/playersort/nba/year-2019-season-preseason-category-scoringpergame", timeout=10)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find_all('table')[1]
rows = table.select('tbody > tr')
header = [th.text.rstrip() for th in rows[1].find_all('th')]
with open('statsoutput.csv', 'w') as csv_file:
writer = csv.writer(csv_file)
writer.writerow(header)
for row in rows[1:]:
data = [th.text.rstrip() for th in row.find_all('td')]
writer.writerow(data)
if __name__=="__main__":
url = "https://www.cbssports.com/nba/stats/playersort/nba/year-2019-season-preseason-category-scoringpergame"
scrape_data(url)
iv'e been trying to export the stats from this web page to a csv file.
when i'm running my code, the first part works fine and retrieves the data i want.
but the function can't export it in to a csv file and iv'e been getting this error:
table = soup.find_all('table')[1]
IndexError: list index out of range
and i'm not really sure why.
You're getting this error because this site has just one <table /> html element. So soupe.find_all() is returning a list with length 1. You might solve this error doing soupe.find_all('table')[0] or, in a clean way, soup.table.
I also checked and tested your code and recommend this:
table = soup.table
rows = table.find_all('tr')
Everything will works fine after these changes. You can check this code runing here. Hope it helps.
Related
I am scraping links from multiple pages under multiple searches and want to output scraped results into multiple .csv files. The table shows the .csv file which lists both my source urls and desired output file names:
url
outputfile
https://www.marketresearch.com/search/results.asp?categoryid=230&qtype=2&publisher=IDCs&datepub=0&submit2=Search
outputPS1xIDC.csv
https://www.marketresearch.com/search/results.asp?categoryid=90&qtype=2&publisher=IDC&datepub=0&submit2=Search
outputPS2xIDC.csv
https://www.marketresearch.com/search/results.asp?categoryid=233&qtype=2&publisher=IDC&datepub=0&submit2=Search
outputPS3xIDC.csv
https://www.marketresearch.com/search/results.asp?categoryid=169&qtype=2&publisher=IDC&datepub=0&submit2=Search
outputPS4xIDC.csv
Now, with the code below, I managed to read the urls in sequence and the rest of the code also works well (when I specify the output filename directly). However, it only outputs the last of the 4 pages in the list, so it overwrites the result each time. What I actually want for it is to output the results from the first url to the first outputfile, second to second, etc.
(Of course my actual list of source URLs is much longer than these 4).
Please help, especially with the last line, as clearly just writing [outputs] there doesn't work.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
with open('inputs.csv', newline='') as csvfile:
reader = csv.DictReader(csvfile)
urls = [row["url"] for row in reader]
outputs = [row["outputfile"] for row in reader]
data = []
for url in urls:
def scrape_it(url):
page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
soup = BeautifulSoup(page.text, 'html.parser')
nexturl = soup.find_all(class_="standardLinkDkBlue")[-1]['href']
stri = soup.find_all(class_="standardLinkDkBlue")[-1].string
reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
for report in reports:
data.append({
'title': report.find('a', class_='linkTitle').text,
'price': report.find('div', class_='resultPrice').text,
'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
'detail_link': report.a['href']
})
if 'next' not in stri:
print("All pages completed")
else:
scrape_it(nexturl)
scrape_it(url)
myOutput = pd.DataFrame(data)
myOutput.to_csv([outputs], header=False) #works (but only for the last url) if instead of [outputs] I have f'filename.csv'
I don't have Pandas, and I don't really want to run your input, but a couple of things jump out a me when I look at your code:
It looks like you are not looping over url and output together. It looks like you loop over all the URLs, and then after all those loops you write once.
Likewise, data is just having the HTML table data appended and appended, it's never reset for each individual URL.
Without being able to run this, I recommend something like this. The scraping is fully encapsulated and separate from the loop, and as such you can now more clearly see the flow of inputs and outputs:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
def scrape_it(url, data):
page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
soup = BeautifulSoup(page.text, 'html.parser')
nexturl = soup.find_all(class_="standardLinkDkBlue")[-1]['href']
stri = soup.find_all(class_="standardLinkDkBlue")[-1].string
reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
for report in reports:
data.append({
'title': report.find('a', class_='linkTitle').text,
'price': report.find('div', class_='resultPrice').text,
'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
'detail_link': report.a['href']
})
if 'next' in stri:
data = scrape_it(nexturl, data)
return data
with open('inputs.csv', newline='') as csvfile:
reader = csv.DictReader(csvfile)
urls = [row["url"] for row in reader]
outputs = [row["outputfile"] for row in reader]
for (url, output) in zip(urls, outputs): # work on url and output together
data = scrape_it(url, [])
myOutput = pd.DataFrame(data)
myOutput.to_csv(output, header=False)
Everything works as expected when I'm using a single URL for the URL variable to scrape, but not getting any results when attempting to read links from a csv. Any help is appreciated.
Info about the CSV:
One column with a header called "Links"
300 rows of links with no space, commoa, ; or other charters before/after the links
One link in each row
import requests # required to make request
from bs4 import BeautifulSoup # required to parse html
import pandas as pd
import csv
with open("urls.csv") as infile:
reader = csv.DictReader(infile)
for link in reader:
res = requests.get(link['Links'])
#print(res.url)
url = res
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
email_elm0 = soup.find_all(class_= "app-support-list__item")[0].text.strip()
email_elm1 = soup.find_all(class_= "app-support-list__item")[1].text.strip()
email_elm2 = soup.find_all(class_= "app-support-list__item")[2].text.strip()
email_elm3 = soup.find_all(class_= "app-support-list__item")[3].text.strip()
final_email_elm = (email_elm0,email_elm1,email_elm2,email_elm3)
print(final_email_elm)
df = pd.DataFrame(final_email_elm)
#getting an output in csv format for the dataframe we created
#df.to_csv('draft_part2_scrape.csv')
The problem lies in this part of the code:
with open("urls.csv") as infile:
reader = csv.DictReader(infile)
for link in reader:
res = requests.get(link['Links'])
...
After the loop is executed, res will have the last link. So, this program will only scrape the last link.
To solve this problem, store all the links in a list and iterate that list to scrape each of the link. You can store the scraped result in a seperate dataframe and concatenate them at the end to store in a single file:
import requests # required to make request
from bs4 import BeautifulSoup # required to parse html
import pandas as pd
import csv
links = []
with open("urls.csv") as infile:
reader = csv.DictReader(infile)
for link in reader:
links.append(link['Links'])
dfs = []
for url in links:
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
email_elm0 = soup.find_all(class_="app-support-list__item")[0].text.strip()
email_elm1 = soup.find_all(class_="app-support-list__item")[1].text.strip()
email_elm2 = soup.find_all(class_="app-support-list__item")[2].text.strip()
email_elm3 = soup.find_all(class_="app-support-list__item")[3].text.strip()
final_email_elm = (email_elm0, email_elm1, email_elm2, email_elm3)
print(final_email_elm)
dfs.append(pd.DataFrame(final_email_elm))
#getting an output in csv format for the dataframe we created
df = pd.concat(dfs)
df.to_csv('draft_part2_scrape.csv')
I am designing scraping project for my research but i am stuck in to write scrape data in csv. Please help me for that?
i have successfully scrape data but i want to store it in csv here below is my code
need to write code to pull all of the html from a website then save it to a csv file.
I believe I somehow need to turn the links into a list and then write the list, but I'm unsure how to do that.
This is what I have so far:
import requests
import time
from bs4 import BeautifulSoup
import csv
# Collect and parse first page
page = requests.get('https://www.myamcat.com/jobs')
soup = BeautifulSoup(page.content, 'lxml')
print("Wait Scraper is working on ")
time.sleep(10)
if(page.status_code != 200):
print("Error in Scraping check the url")
else:
print("Successfully scrape the data")
time.sleep(10)
print("Loading data in csv")
file = csv.writer(open('dataminer.csv', 'w'))
file.writerow(['ProfileName', 'CompanyName', 'Salary', 'Job', 'Location'])
for pname in soup.find_all(class_="profile-name"):
#print(pname.text)
profname = pname.text
file.writerow([profname, ])
for cname in soup.find_all(class_="company_name"):
print(cname.text)
for salary in soup.find_all(class_="salary"):
print(salary.text)
for lpa in soup.find_all(class_="jobText"):
print(lpa.text)
for loc in soup.find_all(class_="location"):
print(loc.text)
Make a dict and save the data into it then save to csv, check below code!
import requests
import time
from bs4 import BeautifulSoup
import csv
# Collect and parse first page
page = requests.get('https://www.myamcat.com/jobs')
soup = BeautifulSoup(page.content, 'lxml')
data = []
print("Wait Scrapper is working on ")
if(page.status_code != 200):
print("Error in Srapping check the url")
else:
print("Successfully scrape the data")
for x in soup.find_all('div',attrs={'class':'job-page'}):
data.append({
'pname':x.find(class_="profile-name").text.encode('utf-8'),
'cname':x.find(class_="company_name").text.encode('utf-8'),
'salary':x.find(class_="salary").text.encode('utf-8'),
'lpa':x.find(class_="jobText").text.encode('utf-8'),
'loc':x.find(class_="location").text.encode('utf-8')})
print("Loading data in csv")
with open('dataminer.csv', 'w') as f:
fields = ['salary', 'loc', 'cname', 'pname', 'lpa']
writer = csv.DictWriter(f, fieldnames=fields)
writer.writeheader()
writer.writerows(data)
Apart from what you have got in other answer, you can scrape and write the content at the same time as well. I used .select() instead of .find_all() to achieve the same.
import csv
import requests
from bs4 import BeautifulSoup
URL = "https://www.myamcat.com/jobs"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'lxml')
with open('myamcat_doc.csv','w',newline="",encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(['pname','cname','salary','loc'])
for item in soup.select(".job-listing .content"):
pname = item.select_one(".profile-name h3").get_text(strip=True)
cname = item.select_one(".company_name").get_text(strip=True)
salary = item.select_one(".salary .jobText").get_text(strip=True)
loc = item.select_one(".location .jobText").get_text(strip=True)
writer.writerow([pname,cname,salary,loc])
I've made a web scraper that scrapes data from pages that look like this (it scrapes the tables): https://www.techpowerup.com/gpudb/2/
The problem is that my program, for some reason, is only scraping the values, and not the subheaders. For instance, (click on the link), it only scrapes the "R420", "130nm", "160 million", etc. but not the "GPU Name", "Process Size", "Transistors" etc.
What do I add to the code to get it to scrape the subheaders? Here's my code:
import csv
import requests
import bs4
url = "https://www.techpowerup.com/gpudb/2"
#obtain HTML and parse through it
response = requests.get(url)
html = response.content
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
soup = bs4.BeautifulSoup(html, "lxml")
tables = soup.findAll("table")
#reading every value in every row in each table and making a matrix
tableMatrix = []
for table in tables:
list_of_rows = []
for row in table.findAll('tr'):
list_of_cells = []
for cell in row.findAll('td'):
text = cell.text.replace(' ', '')
list_of_cells.append(text)
list_of_rows.append(list_of_cells)
tableMatrix.append((list_of_rows, list_of_cells))
#(YOU CAN PROBABLY IGNORE THIS)placeHolder used to avoid duplicate data from appearing in list
placeHolder = 0
excelTable = []
for table in tableMatrix:
for row in table:
if placeHolder == 0:
for entry in row:
excelTable.append(entry)
placeHolder = 1
else:
placeHolder = 0
excelTable.append('\n')
for value in excelTable:
print value
print '\n'
#create excel file and write the values into a csv
fl = open(str(count) + '.csv', 'w')
writer = csv.writer(fl)
for values in excelTable:
writer.writerow(values)
fl.close()
if you check the page source, those cells are header cells. So they are not using TD tags but TH tags. you may want to update your loop to include TH cells alongside TD cells.
I am grabbing an HTML table with this code :
import csv
import urllib2
from bs4 import BeautifulSoup
with open('listing.csv', 'wb') as f:
writer = csv.writer(f)
for i in range(39):
url = "file:///C:/projects/HTML/Export.htm".format(i)
u = urllib2.urlopen(url)
try:
html = u.read()
finally:
u.close()
soup=BeautifulSoup(html)
for tr in soup.find_all('tr')[2:]:
tds = tr.find_all('td')
row = [elem.text.encode('utf-8') for elem in tds]
writer.writerow(row)
Everything works perfectly, but I am trying to grab column 9 Href URL. It is currently giving me the txt value but not the URL.
Also, I have two tables in my HTML, anyway to skip the first table and just build the csv file using the second table?
Any help is very welcomed as I am new to Python and need this for a project I am automating a daily conversion.
Many thanks!
You should access the href attribute of the a tag within the 8th td tag:
import csv
import urllib2
from bs4 import BeautifulSoup
records = []
for index in range(39):
url = get_url(index) # where is the formatting in your example happening?
response = urllib2.urlopen(url)
try:
html = response.read()
except Exception:
raise
else:
my_parse(html)
finally:
try:
response.close()
except (UnboundLocalError, NameError):
raise UnboundLocalError
def my_parse(html):
soup = BeautifulSoup(html)
table2 = soup.find_all('table')[1]
for tr in table2.find_all('tr')[2:]:
tds = tr.find_all('td')
url = tds[8].a.get('href')
records.append([elem.text.encode('utf-8') for elem in tds])
# perhaps you want to update one of the elements of this last
# record with the found url now?
# It's more efficient to write only once
with open('listing.csv', 'wb') as f:
writer = csv.writer(f)
writer.writerows(records)
I have taken the liberty to define a function get_url based on the index because your example rereads the same file every time, which is something I guess you don't actually want. I'll leave the implementation to you. Also, I've added some better exception handling.
At the same time, I've shown how you can access the 2nd table from that webpage's tables.
Was fully able to get it working with the following code:
import csv
import urllib2
from bs4 import BeautifulSoup
#Grab second table from HTML
def my_parse(html):
soup = BeautifulSoup(html)
table2 = soup.find_all('table')[1]
for tr in table2.find_all('tr')[2:]:
tds = tr.find_all('td')
url = tds[8].a.get('href')
tds[8].a.replaceWith(url)
records.append([elem.text.encode('utf-8') for elem in tds])
records = []
#Read HTML file into memory
for index in range(39):
url = "file:///C:/projects/HTML/Export.htm".format(index)
response = urllib2.urlopen(url)
try:
html = response.read()
except Exception:
raise
else:
my_parse(html)
finally:
try:
response.close()
except (UnboundLocalError, NameError):
raise UnboundLocalError
#Writing CSV file
with open('listing.csv', 'wb') as f:
writer = csv.writer(f)
writer.writerows(records)
Many thanks for all the help!!!!!