I'm new to Python and BeautifulSoup. I would like to scrape multiple pages into a CSV file, but when I run the code below for these 3 links, only the last one ends up in the CSV.
How can I fix my issue?
## importing bs4, requests, fake_useragent and csv modules
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import csv

## create a list with the URLs
urls = [
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

## initializing the UserAgent object
user_agent = UserAgent()

## starting the loop
for url in urls:
    ## getting the response from the page using the get method of the requests module
    page = requests.get(url, headers={"user-agent": user_agent.chrome})
    ## storing the content of the page in a variable
    html = page.content
    ## creating a BeautifulSoup object
    soup = BeautifulSoup(html, "html.parser")
    table = soup.findAll("table", {"class": "table"})[0]
    rows = table.findAll("tr")

with open("test.csv", "wt+", newline="") as f:
    writer = csv.writer(f)
    for row in rows:
        csv_row = []
        for cell in row.findAll(["td", "th"]):
            csv_row.append(cell.get_text())
        writer.writerow(csv_row)
Thanks a lot !
To simplify reading the rows, you could also give pandas a shot:
import csv
import requests
from bs4 import BeautifulSoup
import pandas as pd

urls = [
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

all_data = []
for url in urls:
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")

    table = soup.findAll("table", {"class": "table"})[0]
    df_table = pd.read_html(str(table))[0]

    # add a column with additional info
    df_table['hit'] = soup.find("span", {"class": "c"}).text.strip()

    # store the table in a list of tables
    all_data.append(df_table)

# concat the tables and export them to csv
pd.concat(all_data).to_csv('test.csv', index=False)
In your code, you don't accumulate the rows anywhere, so only the values from your last URL get written to the CSV file. This example will write the values from all three URLs:
import csv
import requests
from bs4 import BeautifulSoup

urls = [
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

all_data = []
for url in urls:
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")

    table = soup.findAll("table", {"class": "table"})[0]

    # here I store all rows to list `all_data`
    for row in table.findAll('tr'):
        tds = [cell.get_text(strip=True, separator=' ') for cell in row.findAll(["td", "th"])]
        all_data.append(tds)
        print(*tds)

# write list `all_data` to CSV
with open("test.csv", "wt+", newline="") as f:
    writer = csv.writer(f)
    for row in all_data:
        writer.writerow(row)
This writes test.csv with rows from all three URLs.
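If you also need to know which page each row came from, one small variation (a sketch, not part of the original answer) is to append the source URL as an extra column before storing each row:

import csv
import requests
from bs4 import BeautifulSoup

# same three ScanSante URLs as in the answer above
urls = [
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

all_data = []
for url in urls:
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.findAll("table", {"class": "table"})[0]
    for row in table.findAll('tr'):
        tds = [cell.get_text(strip=True, separator=' ') for cell in row.findAll(["td", "th"])]
        tds.append(url)  # extra column recording which page the row came from
        all_data.append(tds)

with open("test.csv", "w", newline="") as f:
    csv.writer(f).writerows(all_data)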
I'm relatively new to web scraping. I used Selenium and Beautiful Soup to scrape data, but I haven't been able to get the table data from the following link. Can someone help me get the table data, or suggest any way to download the CSV file in Python?
print("Start")

from nsetools import Nse
import pandas as pd
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import urllib.request

nse_web = "https://www.nseindia.com/market-data/new-52-week-high-low-equity-market"

req = urllib.request.Request(
    nse_web,
    data=None,
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'}
)

#f = urllib.request.urlopen(req)
#nse_web = "https://www.nseindia.com/market-data/new-52-week-high-low-equity-market"
time.sleep(5)
html = urlopen(req)
print("open URL")
time.sleep(10)

bsObj = BeautifulSoup(html, features="lxml")
print("before_table")
time.sleep(15)

data = []
table = bsObj.find('table', attrs={'class': 'common_table customHeight-table tableScroll alt_row w-100'})
print(table)
table_body = table.find('tbody')
print(table_body)

rows = table_body.find_all('tr')
print(rows)
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele])  # Get rid of empty values

print(data)
print("process complete")
It seems that you are already able to get the data from the table as a list of rows. To write a CSV file, you can use Python's csv module.
import csv

# open in text mode with newline='' (csv.writer expects str, not bytes, in Python 3)
with open('result.csv', 'w', newline='') as csv_file:
    csv_obj = csv.writer(csv_file)
    for row in data:
        csv_obj.writerow(row)
You will then find result.csv in your current directory.
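If you also want a header row in the CSV, here is a minimal sketch (not from the original answer; it assumes the column names live in the table's th cells, and `table` and `data` are the variables from the question's code):

import csv

# Assumption: the header cells of the scraped table are <th> tags.
header = [th.text.strip() for th in table.find_all('th')]

with open('result.csv', 'w', newline='') as csv_file:
    csv_obj = csv.writer(csv_file)
    if header:
        csv_obj.writerow(header)
    for row in data:
        csv_obj.writerow(row)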
I am new to Python and learning data analysis. I am trying to scrape data from this web page: https://bitinfocharts.com/dogecoin/address/DN5Hp2kCkvCsdwr5SPmwHpiJgjKnC5wcT7
I am able to scrape data with simple websites but I think since BitInfoCharts has tables it may be a more complex HTML setup than the tutorials I am following.
My goal is to scrape the data from the table, which includes Block, Time, Amount, Balance, etc., and have it in a CSV file. I previously tried using pandas but found that it was difficult to select the data I want from the HTML.
To do this, I think that what I need to do is get the header/table information from the "class="table abtb tablesorter tablesorter-default"" element and then pull the information from each object inside it that has "class="trb"". The number of class="trb" rows changes from page to page (for example, one person may have 7 transactions and another may have 40). I am not exactly sure, though, as this is new territory for me.
I would really appreciate any help.
import requests
from bs4 import BeautifulSoup as bs
url = 'https://bitinfocharts.com/dogecoin/address/DN5Hp2kCkvCsdwr5SPmwHpiJgjKnC5wcT7'
headers = {"User-Agent":"Mozilla/5.0"}
r = requests.get(url, headers=headers)
soup = bs(r.content)
table = soup.find_all("table_maina")
print(table)
If you decide to parse the table manually, the following extracts the rows and writes them to a CSV file:
import csv
import requests
from bs4 import BeautifulSoup as bs

url = 'https://bitinfocharts.com/dogecoin/address/DN5Hp2kCkvCsdwr5SPmwHpiJgjKnC5wcT7'
headers = {"User-Agent": "Mozilla/5.0"}
r = requests.get(url, headers=headers)
soup = bs(r.content, 'lxml')

table = soup.find(id="table_maina")

headers = []
datarows = []
for row in table.find_all('tr'):
    heads = row.find_all('th')
    if heads:
        headers = [th.text for th in heads]
    else:
        datarows.append([td.text for td in row.find_all('td')])

fcsv = csv.writer(open('x.csv', 'w', newline=''))
fcsv.writerow(headers)
fcsv.writerows(datarows)
There is only one table element with the id 'table_maina', so you should call find() rather than find_all(). Also, you need to specify the 'table' tag as the first argument of find().
Try:
table = soup.find('table', id='table_maina')
for tr in table.find_all('tr', class_='trb'):
    print(tr.text)
Output:
4066317 2022-01-17 15:41:22 UTC2022-01-17 15:41:22 UTC-33,000,000 DOGE (5,524,731.65 USD)220,000,005.04121223 DOGE$36,831,545 # $0.167$-28,974,248
4063353 2022-01-15 11:04:46 UTC2022-01-15 11:04:46 UTC+4,000,000 DOGE (759,634.87 USD)253,000,005.04121223 DOGE$48,046,907 # $0.19$-23,283,618
...
Next, to output each row into a CSV file, try this:
import csv
import requests
from bs4 import BeautifulSoup

url = 'https://bitinfocharts.com/dogecoin/address/DN5Hp2kCkvCsdwr5SPmwHpiJgjKnC5wcT7'
headers = {"User-Agent": "Mozilla/5.0"}

r = requests.get(url, headers=headers, verify=False)
soup = BeautifulSoup(r.content, "html.parser")
table = soup.find("table", id='table_maina')

with open('out.csv', 'w', newline='') as fout:
    csv_writer = csv.writer(fout)
    csv_writer.writerow(['Block', 'Time', 'Amount', 'Balance', 'Price', 'Profit'])
    for tr in table.find_all('tr', class_='trb'):
        tds = tr.find_all('td')
        csv_writer.writerow([x.text for x in tds])
Output:
Block,Time,Amount,Balance,Price,Profit
4066317 2022-01-17 15:41:22 UTC,2022-01-17 15:41:22 UTC,"-33,000,000 DOGE (5,524,731.65 USD)","220,000,005.04121223 DOGE","$36,831,545 # $0.167","$-28,974,248"
...
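If you would rather let pandas handle the parsing and the CSV export (the same pd.read_html trick used in the ScanSante answer earlier on this page), here is a minimal sketch, not taken from the original answers:

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://bitinfocharts.com/dogecoin/address/DN5Hp2kCkvCsdwr5SPmwHpiJgjKnC5wcT7'
headers = {"User-Agent": "Mozilla/5.0"}

r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, "html.parser")
table = soup.find("table", id='table_maina')

# pd.read_html parses the HTML table (header row included) into a DataFrame
df = pd.read_html(str(table))[0]
df.to_csv('out.csv', index=False)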
I have a list of URLs in a CSV file that I want to scrape content from. The CSV has 200-plus URLs. The code that I'm running picks the first URL and then fails. Here is the code:
import csv
from selenium import webdriver

with open('Godzilla1.csv', 'w') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["Title", "Content"])

f = open("links.csv")
urls = [url.strip() for url in f.readlines()]

driver = webdriver.Firefox()

for url in urls:
    driver.get(url)
    titles = driver.find_elements_by_xpath('//h2[@class="entry-title"]')
    contents = driver.find_elements_by_class_name("et_pb_post")
    num_page_items = len(titles)
    with open('Godzilla1.csv', 'a') as f:
        for i in range(num_page_items):
            f.write(titles[i].text + "," + contents[i].text + "\n")

# Clean up (close browser once completed task).
driver.close()
When that code runs, the error reported is:
f.write(titles[i].text + "," + contents[i].text + "\n")
IndexError: list index out of range
The problem is that while you get 2 items in titles, there's only 1 element in contents. So as you iterate to the 2nd item in titles, contents is out of range (hence the error). It appears the title content repeats twice, so rather than getting all elements with the entry-title class, just get the first element.
You're also going to run into issues using the comma as your delimiter, since there are commas in the content. Can I suggest just using pandas?
import pandas as pd
from selenium import webdriver

f = open("links.csv")
urls = [url.strip() for url in f.readlines()]

driver = webdriver.Firefox()

rows = []
for url in urls:
    driver.get(url)
    title = driver.find_element_by_xpath('//h2[@class="entry-title"]')
    content = driver.find_element_by_class_name("et_pb_post")

    row = {'Title': title.text,
           'Content': content.text}
    rows.append(row)

# Clean up (close browser once completed task).
driver.close()

df = pd.DataFrame(rows)
df.to_csv('Godzilla1.csv', index=False)
There is also the option to avoid Selenium and simply use requests and BeautifulSoup:
import pandas as pd
import requests
from bs4 import BeautifulSoup

f = open("links.csv")
urls = [url.strip() for url in f.readlines()]

headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Mobile Safari/537.36'}

rows = []
for url in urls:
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    title = soup.find('h2', {'class': "entry-title"})
    content = soup.find('div', {'class': 'entry-content'}).find('p')

    post_meta = soup.find('p', {'class': 'post-meta'})
    try:
        category = post_meta.find('a', {'rel': 'category tag'}).text.strip()
    except:
        category = ''

    row = {'Title': title.text,
           'Content': content.text,
           'Category': category}
    print(row)
    rows.append(row)

df = pd.DataFrame(rows)
df.to_csv('Godzilla1.csv', index=False)
A year ago I learned some Python in one of my classes, but I haven't had to use it much since then, so this may or may not be a simple question.
I'm trying to scrape the top lifetime grossing films table from Box Office Mojo, and I want to grab the rank, title, and gross for the top 10 films of the 2010s. I've been playing around in Python and can get the entire table in, but I don't know how to manipulate it from there, let alone write out a CSV file. Any guidance/tips?
Here is what will print the entire table for me (the first few lines are copied from an old web-scraping assignment to get me started):
import bs4
import requests
from bs4 import BeautifulSoup as soup

url = "https://www.boxofficemojo.com/chart/top_lifetime_gross/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

page_html = requests.get(url, headers=headers)
page_soup = soup(page_html.text, "html.parser")

boxofficemojo_table = page_soup.find("div", {"class": "a-section imdb-scroll-table-inner"})
complete_table = boxofficemojo_table.get_text()
print(complete_table)
1. You can use pd.read_html for this:
import pandas as pd

Data = pd.read_html(r'https://www.boxofficemojo.com/chart/top_lifetime_gross/')
for data in Data:
    data.to_csv('Data.csv', ',')
2. Using bs4:
import pandas as pd
from bs4 import BeautifulSoup
import requests

URL = r'https://www.boxofficemojo.com/chart/top_lifetime_gross/'
print('\n>> Extracting Data using Beautiful Soup for: ' + URL)

try:
    res = requests.get(URL)
except Exception as e:
    print(repr(e))

print('\n<> URL present status Code = ', (res.status_code))

soup = BeautifulSoup(res.text, "lxml")
table = soup.find('table')

list_of_rows = []
for row in table.findAll('tr'):
    list_of_cells = []
    for cell in row.findAll(["td"]):
        text = cell.text
        list_of_cells.append(text)
    list_of_rows.append(list_of_cells)

for item in list_of_rows:
    ' '.join(item)

Data = pd.DataFrame(list_of_rows)
Data.dropna(axis=0, how='all', inplace=True)

print(Data.head(10))
Data.to_csv('Table.csv')
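Neither snippet narrows things down to the stated goal (rank, title, and gross for the top 10 films of the 2010s). As a rough sketch of that last step, assuming pd.read_html parses the chart into columns named 'Rank', 'Title', 'Lifetime Gross' and 'Year' (those names are a guess at the current table layout, not something taken from the code above):

import pandas as pd
import requests

url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/'
headers = {'User-Agent': 'Mozilla/5.0'}

# Fetch with a browser-like User-Agent (as in the question) and let pandas parse the table.
resp = requests.get(url, headers=headers)
df = pd.read_html(resp.text)[0]

# Assumed column names: 'Rank', 'Title', 'Lifetime Gross', 'Year'.
# Keep films released in the 2010s, then take the first 10 rows
# (the chart is already sorted by lifetime gross).
top_2010s = df[df['Year'].between(2010, 2019)].head(10)
top_2010s[['Rank', 'Title', 'Lifetime Gross']].to_csv('top10_2010s.csv', index=False)

If the real column headers differ, print df.columns first and adjust the names.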
A few months into Python, and I'm having trouble scraping some information from tables using BeautifulSoup; any help would be appreciated. I am not getting any error codes, just no data back from the table.
import bs4 as bs
import requests

resp = requests.get('https://www.thestreet.com/markets/gainers.html')
soup = bs.BeautifulSoup(resp.text, "lxml")
table = soup.find('table', {'id': 'nyseData'})

tickers = []

for row in table.findAll('tr')[1:]:
    ticker = row.findAll('td')[1].text
    tickers.append(ticker)
Any help is much appreciated!
You are running into a problem with the page not allowing certain user agents to access the site. This can be fixed by setting a user-agent string in your request headers.
Your code with the user-agent added:
import bs4 as bs
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

resp = requests.get('https://www.thestreet.com/markets/gainers.html', headers=headers)
soup = bs.BeautifulSoup(resp.text, 'lxml')
table = soup.find('table', {'id': 'nyseData'})

tickers = []

for row in table.findAll('tr')[1:]:
    ticker = row.findAll('td')[1].text
    tickers.append(ticker)

print(tickers)
Output:
[u'QUOT', u'BCEI', u'ATEN', u'SKX', u'FBK', u'FBM', u'CGI', u'SDRL', u'ELLI', u'CELP', u'SXCP', u'CUB', u'GLF', u'SID', u'HBM', u'NE', u'CBG', u'PJT', u'VVI', u'ARL']