How to download files based on a list of links - python

I am currently learning web scraping and Python. I want to write code that downloads a list of .xls data files from a list of links that I have created. Each of these links downloads a data file that corresponds to the FDI flows of a country.
My problem is that, with the way the code is currently written, the content of the last URL in my list overwrites all the previous files. The files are named correctly, but they all contain the data for the last country in the list. As an example, I am only taking the last three countries in the data.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
page = requests.get("https://unctad.org/en/Pages/DIAE/FDI%20Statistics/FDI-Statistics-Bilateral.aspx")
soup = BeautifulSoup(page.text, 'html.parser')
countries_list = soup.select('[id=FDIcountriesxls] option[value]')
links = [link.get('value') for link in countries_list[203:-1]] #sample of countries
countries = [country.text for country in countries_list[203:-1]] #sample of countries
links_complete = ["https://unctad.org" + link for link in links]
for link in links_complete:
    for country in countries:
        r = requests.get(link)
        with open(country + '.xls', 'wb') as file:
            file.write(r.content)
What this gets me is three files, all named after the three countries but containing the data for the last (Zambia).
Can anyone help with this?
Thanks.

That's because you don't need a double loop.
In the inner "countries" loop you reopen and rewrite every country's file ('wb') for each link, so at the end only the data of the last link is left in all of them.
To solve your problem you can loop over countries_list directly:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
page = requests.get("https://unctad.org/en/Pages/DIAE/FDI%20Statistics/FDI-Statistics-Bilateral.aspx")
soup = BeautifulSoup(page.text, 'html.parser')
countries_list = soup.select('[id=FDIcountriesxls] option[value]')
for opt in countries_list:
    value = opt.get('value')
    if value:
        link = "https://unctad.org" + value
        country = opt.get_text()
        r = requests.get(link)
        with open(country + '.xls', 'wb') as file:
            file.write(r.content)
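Alternatively, if you want to keep the two lists you already built, a minimal sketch (assuming links_complete and countries line up one-to-one and in the same order, as in your code) is to pair them with zip so each link is downloaded once and written to its matching file:
# Sketch assuming links_complete and countries correspond one-to-one, as built above
for link, country in zip(links_complete, countries):
    r = requests.get(link)
    with open(country + '.xls', 'wb') as file:
        file.write(r.content)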

Related

Trying to use BeautifulSoup to scrape Yelp ratings and export to CSV, but the CSV ONLY has the review comments and not the rating or ID

I'm trying to scrape 100 reviews/ratings on a Yelp restaurant for an assignment using BeautifulSoup. I'm specifically looking for:
Review Comment
Review ID
Review Rating
I'm pretty new to Python and I feel like I've missed something extremely obvious
Here's what I've got so far:
from bs4 import BeautifulSoup
import urllib.request
url = 'https://www.yelp.com/biz/ichiran-times-square-new-york-4?osq=Ichiban+Ramen'
ourUrl = urllib.request.urlopen(url)
soup = BeautifulSoup(ourUrl, 'html.parser')
type(soup)
print(soup.prettify())
for i in soup.find_all('div', {'class': " arrange-unit__373c0__3XPkE arrange-unit-fill__373c0__38Zde border-color--default__373c0__r305k"}):
    ID.append(i.find("div").get("aria-label"))
soup.find('p', {'class': "comment__373c0__Nsutg css-n6i4z7"})
i = soup.find('p', {'class': "comment__373c0__Nsutg css-n6i4z7"})
i.text
review = []
rating = []
ID = []
for x in range(0, 10):
    url = "https://www.yelp.com/biz/ichiran-times-square-new-york-4?osq=Ichiban+Ramen="+str(10*x)
    ourUrl = urllib.request.urlopen(url)
    soup = BeautifulSoup(ourUrl, 'html.parser')
    #for i in soup,
    for i in soup.find_all('div', {'class': " i-stars__373c0___sZu0 i-stars--regular-5__373c0__20dKs border-color--default__373c0__1yxBb overflow--hidden__373c0__1TJqF"}):
        per_rating = i.text
        rating.append(per_rating)
    for i in soup.find_all('span', {'class': " arrange-unit__373c0__3XPkE arrange-unit-fill__373c0__38Zde border-color--default__373c0__r305k"}):
        ID.append(i.find("div").get("aria-label"))
    for i in soup.find_all('p', {'class': "comment__373c0__Nsutg css-n6i4z7"}):
        per_review = i.text
        review.append(per_review)
len(review)
len(review)
Here's my attempt at exporting to csv where I get review text ONLY and nothing else:
with open('Review.csv', 'a', encoding='utf-8') as f:
    for each in review:
        f.write(each + '\n')
Edit - Updated
The issue actually looks to be due to not targeting the correct tags in the HTML.
# Import regex package
import re
# Narrow down the section that you are searching in to avoid erroneous elements
child = soup.find('div', {'class': 'css-79elbk border-color--default__373c0__1ei3H'})
for x in child.find_all('span', {'class': "fs-block css-m6anxm"}):
    # Ignore the titular "Username"
    if x.text != 'Username':
        ID.append(x.text)
for x in child.find_all('div', {'class': re.compile(r'i-stars.+')}):
    rating.append(x.get('aria-label'))
for x in child.find_all('p', {'class': 'comment__373c0__Nsutg css-n6i4z7'}):
    comment = x.find('span', {'class': 'raw__373c0__tQAx6'})
    review.append(comment.text)
The ID needed to target a more specific element ('class': "fs-block css-m6anxm"), and the rating class differs depending on how many stars the review has, so a regex is used to match any class beginning with i-stars.
Original Answer
I believe your issue is that you are only looping through review when you also need to loop through ID and rating:
# Create new_line to work around f-strings not allowing '\' inside the braces
new_line = '\n'
with open('Review.csv', 'a', encoding='utf-8') as f:
    for i in range(len(review)):
        f.write(f'{review[i]},{ID[i]},{rating[i]}{new_line}')
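One caveat with writing the line yourself: review text often contains commas, which would shift the columns in the resulting file. A sketch using the standard csv module (same review, ID, and rating lists assumed) handles the quoting for you:
import csv

# Sketch: csv.writer quotes any field that itself contains a comma
with open('Review.csv', 'a', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    for row in zip(review, ID, rating):
        writer.writerow(row)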
You could also take a look at the Pandas package in order to achieve this.
You can create a dataframe and then export it as a number of different file types, including CSV, for example:
# Import Pandas package
import pandas as pd
# Store list values, along with column headings, in a dictionary
d = {'review_comment': review, 'review_id': ID, 'review_rating': rating}
# Create dataframe from the dictionary
df = pd.DataFrame(data=d)
# Export the dataframe as a CSV
df.to_csv('desired/save/location.csv', index=False)

How to extract HTML text output as a list for each input from a list using Python web scraping. I have written code, but it gives only the first entry's output

I am new to Python and programming. I am trying to extract PubChem IDs from a database called IMPPAT (https://cb.imsc.res.in/imppat/home). I have a list of chemical IDs from the database for a herb, where following each chemical ID hyperlink gives details on its PubChem ID and SMILES data.
I have written a script in Python that takes each chemical ID as input, looks for the PubChem ID on the HTML page, and prints the output to a text file using web scraping.
I am finding it difficult to get all the data as output. I am pretty sure there is some error in the for loop, as it prints only the first output many times instead of a different output for each input.
Please help with this.
Also, I don't know how to save a file where each input is printed side by side with its corresponding output. Please help.
import requests
import xmltodict
from pprint import pprint
import time
from bs4 import BeautifulSoup
import json
import pandas as pd
import os
from pathlib import Path
from tqdm.notebook import tqdm
cids = 'output.txt'
df = pd.read_csv(cids, sep='\t')
df
data = []
for line in df.iterrows():
    out = requests.get(f'https://cb.imsc.res.in/imppat/Phytochemical-detailedpage-auth/CID%{line}')
    soup = BeautifulSoup(out.text, "html.parser")
    if soup.status_code == 200:
        script_data = soup.find('div', {'class': 'views-field views-field-Pubchem-id'}).find('span', {'class': 'field-content'}).find('h3')
        #print(script_data.text)
        for text in script_data:
            texts = script_data.get_text()
            print(text)
            data.append(text)
print(data)
The input file consists of:
cids
0 3A155934
1 3A117235
2 3A12312921
3 3A12303662
4 3A225688
5 3A440966
6 3A443160
There are a few things you need to correct in your code.
Incorrect indentation of the out variable.
The status code should be checked on the response object, i.e., out, not soup.
You don't need the second loop, as each response contains only a single PubChem ID, which you are already collecting in the script_data variable.
Lastly, you can use pandas to associate each chemical ID with its PubChem ID and then write the result to a CSV file.
Refer to the code below for the complete result.
Code
import requests
import xmltodict
from pprint import pprint
import time
from bs4 import BeautifulSoup
import json
import pandas as pd
import os
from pathlib import Path
from tqdm.notebook import tqdm
cids = 'output.txt'
df = pd.read_csv(cids, sep='\t')
pubchem_id = []
for line in df.iterrows():
    out = requests.get(f'https://cb.imsc.res.in/imppat/Phytochemical-detailedpage-auth/CID%{line}')
    if out.status_code == 200:
        soup = BeautifulSoup(out.text, "html.parser")
        script_data = soup.find('div', {'class': 'views-field views-field-Pubchem-id'}).find('span', {'class': 'field-content'}).find('h3').getText()
        script_data = script_data.replace('PubChem Identifier:', '')
        pubchem_id.append(script_data)
# As you have not mentioned the column index of cids, I am assuming it should be the first column
df1 = pd.DataFrame({"chemical_id": df.iloc[:, 0].tolist(), "pubchem_id": pubchem_id})
print(df1)
# uncomment the line below to write the dataframe into a csv file & replace 'filename' with the complete filepath
# df1.to_csv('filename.csv')
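One thing to watch with this approach: if any request comes back with a non-200 status, pubchem_id ends up shorter than df and the DataFrame construction raises a length-mismatch error. A minimal sketch of one way to keep the two columns aligned is to append None for failed lookups:
# Sketch: append None when a page cannot be fetched so both columns stay the same length
pubchem_id = []
for line in df.iterrows():
    out = requests.get(f'https://cb.imsc.res.in/imppat/Phytochemical-detailedpage-auth/CID%{line}')
    if out.status_code == 200:
        soup = BeautifulSoup(out.text, "html.parser")
        h3 = soup.find('div', {'class': 'views-field views-field-Pubchem-id'}).find('span', {'class': 'field-content'}).find('h3')
        pubchem_id.append(h3.getText().replace('PubChem Identifier:', ''))
    else:
        pubchem_id.append(None)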

How can I get the data of this table from HackerRank and filter it by country of origin and score, and then export it as a csv file?

I'm learning web scraping in Python and I decided to test my skills on the HackerRank Leaderboard page, so I wrote the code below, expecting no errors, before adding the country restriction to the tester function and then exporting my csv file.
But then the Python console replied:
AttributeError: 'NoneType' object has no attribute 'find_all'
The error above corresponds to line 29 of my code (for i in table.find_all({'class':'ellipsis'}):), so I decided to come here to ask for assistance. I'm afraid there could be more syntax or logic errors, so it's better to clear up my doubts by getting feedback from experts.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from time import sleep
from random import randint
pd.set_option('display.max_columns', None)
#Declaring a variable for looping over all the pages
pages = np.arange(1, 93, 1)
a = pd.DataFrame()
#loop cycle
for url in pages:
    #get html for each new page
    url = 'https://www.hackerrank.com/leaderboard?page='+str(url)
    page = requests.get(url)
    sleep(randint(3,10))
    soup = BeautifulSoup(page.text, 'lxml')
    #get the table
    table = soup.find('header', {'class':'table-header flex'})
    headers = []
    #get the headers of the table and delete the "white space"
    for i in table.find_all({'class':'ellipsis'}):
        title = i.text.strip()
        headers.append(title)
    #set the headers to columns in a new dataframe
    df = pd.DataFrame(columns=headers)
    rows = soup.find('div', {'class':'table-body'})
    #get the rows of the table but omit the first row (which are headers)
    for row in rows.find_all('table-row-wrapper')[1:]:
        data = row.find_all('table-row-column ellipsis')
        row_data = [td.text.strip() for td in data]
        length = len(df)
        df.loc[length] = row_data
    #set the data of the Txn Count column to float
    Txn = df['SCORE'].values
    #combine all the data rows in one single dataframe
    a = a.append(pd.DataFrame(df))
def tester(mejora):
    mejora = mejora[(mejora['SCORE']>2250.0)]
    return mejora.to_csv('new_test_Score_Count.csv')
tester(a)
Do you guys have any ideas or suggestions that could fix the problem?
The error states that your table element is None. I'm guessing here, but you can't get the table from the page with bs4 because it is loaded afterwards with JavaScript. I would recommend using Selenium for this instead.
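A minimal sketch of that suggestion, assuming Selenium and a matching Chrome driver are installed (the URL and class name come from the question; the fixed sleep is a simplification, an explicit wait would be more robust):
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()
driver.get('https://www.hackerrank.com/leaderboard?page=1')
time.sleep(5)  # crude wait for the JavaScript-rendered table to load

# Hand the rendered HTML to BeautifulSoup so the rest of the parsing code can stay the same
soup = BeautifulSoup(driver.page_source, 'lxml')
table = soup.find('header', {'class': 'table-header flex'})
driver.quit()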

How to Loop and Save Data from Each Iteration

I am trying to learn how to scrape data from a webpage in Python and am running into trouble with how to structure my nested loops. I received some assistance with how I was scraping in this question (How to pull links from within an 'a' tag). I am trying to have that code iterate through different weeks (and eventually years) of webpages. What I currently have is below, but it is not iterating through the two weeks I would like it to and saving them off.
import requests, re, json
import pandas as pd
from bs4 import BeautifulSoup
weeks=['1','2']
data = pd.DataFrame(columns=['Teams','Link'])
scripts_head = soup.find('head').find_all('script')
all_links = {}
for i in weeks:
    r = requests.get(r'https://www.espn.com/college-football/scoreboard/_/year/2018/seasontype/2/week/'+i)
    soup = BeautifulSoup(r.text, 'html.parser')
    for script in scripts_head:
        if 'window.espn.scoreboardData' in script.text:
            json_scoreboard = json.loads(re.search(r'({.*?});', script.text).group(1))
            for event in json_scoreboard['events']:
                name = event['name']
                for link in event['links']:
                    if link['text'] == 'Gamecast':
                        gamecast = link['href']
                        all_links[name] = gamecast
            #Save data to dataframe
            data2 = pd.DataFrame(list(all_links.items()), columns=['Teams','Link'])
        #Append new data to existing data
        data = data.append(data2, ignore_index=True)
#Save dataframe with all links to csv for future use
data.to_csv(r'game_id_data.csv')
Edit: To add some clarification, it is creating duplicates of the data from one week and repeatedly appending them to the end. I also edited the code to include the proper libraries; it should be able to be copied, pasted, and run in Python.
The problem is in your loop logic:
if 'window.espn.scoreboardData' in script.text:
    ...
    data2 = pd.DataFrame(list(all_links.items()), columns=['Teams','Link'])
#Append new data to existing data
data = data.append(data2, ignore_index=True)
Your indentation on the last line is wrong. As given, you append data2 regardless of whether you have new scoreboard data. When you don't, you skip the if body and simply append the previous data2 value.
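A minimal sketch of the fix is simply to move the append into the if body (same snippet as above, indentation corrected), so data2 is only appended when new scoreboard data was actually parsed:
if 'window.espn.scoreboardData' in script.text:
    ...
    data2 = pd.DataFrame(list(all_links.items()), columns=['Teams','Link'])
    #Append new data to existing data, now only when scoreboard data was found
    data = data.append(data2, ignore_index=True)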
So the workaround I came up with is below. I am still getting duplicate game IDs in my final dataset, but at least I am looping through the entire desired set and getting all of them. Then at the end I dedupe.
import requests, re, json
from bs4 import BeautifulSoup
import csv
import pandas as pd
years=['2015','2016','2017','2018']
weeks=['1','2','3','4','5','6','7','8','9','10','11','12','13','14']
data = pd.DataFrame(columns=['Teams','Link'])
all_links = {}
for year in years:
    for i in weeks:
        r = requests.get(r'https://www.espn.com/college-football/scoreboard/_/year/'+ year + '/seasontype/2/week/'+i)
        soup = BeautifulSoup(r.text, 'html.parser')
        scripts_head = soup.find('head').find_all('script')
        for script in scripts_head:
            if 'window.espn.scoreboardData' in script.text:
                json_scoreboard = json.loads(re.search(r'({.*?});', script.text).group(1))
                for event in json_scoreboard['events']:
                    name = event['name']
                    for link in event['links']:
                        if link['text'] == 'Gamecast':
                            gamecast = link['href']
                            all_links[name] = gamecast
                #Save data to dataframe
                data2 = pd.DataFrame(list(all_links.items()), columns=['Teams','Link'])
                #Append new data to existing data
                data = data.append(data2, ignore_index=True)
#Save dataframe with all links to csv for future use
data_test = data.drop_duplicates(keep='first')
data_test.to_csv(r'all_years_deduped.csv')
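As a side note, DataFrame.append was deprecated and removed in pandas 2.0, so this pattern fails on current pandas. A sketch of the usual replacement is to collect the per-week frames in a list and concatenate once at the end:
# Sketch: same scraping loop as above, only the accumulation changes
frames = []
# ... inside the loop, replace data = data.append(data2, ignore_index=True) with:
#     frames.append(data2)
# After the loops finish, build one DataFrame, dedupe, and save
data = pd.concat(frames, ignore_index=True).drop_duplicates(keep='first')
data.to_csv(r'all_years_deduped.csv')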

Web scraping a table from multiple pages of a search and creating a pandas dataframe

I got this code working for the first page; I needed the user agent as it didn't work otherwise.
The problem I have is that the search brings up the first page, but the second page has "page=2" in the URL and so on, so I need to scrape all of the pages (or as many as needed) from the search:
"https://www.vesselfinder.com/vessels?page=2&minDW=20000&maxDW=300000&type=4"
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
site= "https://www.vesselfinder.com/vessels?type=4&minDW=20000&maxDW=300000"
hdr = {'User-Agent': 'Chrome/70.0.3538.110'}
req = Request(site,headers=hdr)
page = urlopen(req)
import pandas as pd
import numpy as np
soup = BeautifulSoup(page, 'lxml')
type(soup)
rows = soup.find_all('tr')
print(rows[:10])
for row in rows:
    row_td = row.find_all('td')
print(row_td)
type(row_td)
str_cells = str(row_td)
cleantext = BeautifulSoup(str_cells, "lxml").get_text()
print(cleantext)
import re
list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    clean = re.compile('<.*?>')
    clean2 = (re.sub(clean, '', str_cells))
    list_rows.append(clean2)
print(clean2)
type(clean2)
df = pd.DataFrame(list_rows)
df.head(10)
df1 = df[0].str.split(',', expand=True)
df1.head(10)
The output is a Pandas DataFrame.
I need to scrape all pages to output one large dataframe.
Okay, so this problem ended up getting stuck in my head, so I worked it out.
import pandas as pd
import requests
hdr={'User-Agent':'Chrome/70.0.3538.110'}
table_dfs={}
for page_number in range(951):
    http = "https://www.vesselfinder.com/vessels?page={}&minDW=20000&maxDW=300000&type=4".format(page_number+1)
    url = requests.get(http, headers=hdr)
    table_dfs[page_number] = pd.read_html(url.text)
It will return the first column (vessel) as NaN values. That's the column for the image; ignore it if you don't need it.
The next column is called 'built'; it has the ship's name and the type of ship in it. You'll need to .split() to separate them, and then you can replace the vessel column with the ship's name (a sketch of that split follows below).
If it works for you, I'd love to boost my reputation with a nice green check mark.
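A sketch of that split, where the column names ('Built', 'Vessel', 'Type') are assumptions based on the description above, and the exact delimiter depends on how read_html renders the cell, so adjust the split accordingly:
# Sketch only: take the first table pd.read_html found on the first page
df = table_dfs[0][0]
parts = df['Built'].astype(str).str.split(n=1, expand=True)  # assumed whitespace-separated
df['Vessel'] = parts[0]   # ship's name
df['Type'] = parts[1]     # type of ship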
rows = soup.find_all('tr')
print(rows[:10])
for row in rows:
    row_td = row.find_all('td')
print(row_td)
type(row_td)
^this code above is the same thing as
urls=['some list of urls you want to scrape']
table_dfs= [pd.read_html(url) for url in urls]
you can crawl through the urls you're looking for and apply that, and then if you want to do something with/to the tables you can just go:
for table in table_dfs:
    table + 'the thing you want to do'
Note that the in-line for loop builds table_dfs as a list. That means you might not be able to discern which URL a table came from if the scrape is big enough. Pieca seemed to have a solution that could be used to iterate over the website's URLs and create a dictionary key. Note that this solution may not apply to every website.
url_list = {page_number: "https://www.vesselfinder.com/vessels?page={}&minDW=20000&maxDW=300000&type=4".format(page_number) for page_number in range(1, 953)}
table_dfs = {}
for page_number in url_list:
    # fetch with requests so the User-Agent header is applied, then parse the HTML text
    response = requests.get(url_list[page_number], headers=hdr)
    table_dfs[page_number] = pd.read_html(response.text)
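Since the goal in the question is one large dataframe exported to CSV, a possible final step (a sketch, assuming the table_dfs dictionary built above) is to flatten the per-page lists that pd.read_html returns and concatenate everything:
# Sketch: each value in table_dfs is the list of tables pd.read_html found on that page
all_tables = [table for tables in table_dfs.values() for table in tables]
big_df = pd.concat(all_tables, ignore_index=True)
big_df.to_csv('vessels_all_pages.csv', index=False)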
