Loop through csv and pull from web - python

I could use some help; I am stuck. I want to read a .csv file ("IMDb_id.csv") that has one column (Imdb_link) containing random IMDb title URLs. I want to iterate through the rows, grab each URL, and then scrape that URL for the title, image, genre, actors and director. I then want to download the image, save it under the name of the IMDb title, and put the actors and directors in one .csv. The code below works and brings down one poster, and puts the actors and directors in their own .csv files, but it doesn't iterate through the .csv or combine the results into one .csv.
print("Current Working Directory " , os.getcwd())
os.chdir('/Users/Desktop/PROJECT MOVIE ANALITICA')
df_IMDb_id_URL = pd.read_csv("IMDb_id.csv")
#print(df_IMDb_id_URL.head(3))
#df_IMDb_id =[]
for column,row in df_IMDb_id_URL.iteritems():
#print(index)
Movie_URL = row
print(Movie_URL)
r = requests.get('https://www.imdb.com/title/tt0037800')
r_unparsed = r.text
start = time.time()
Movie_IMDb_Page = BeautifulSoup(r_unparsed,'lxml')
end = time.time()
for index in Movie_IMDb_Page.find_all("script",type="application/ld+json"):
result_dictionary = json.loads(index.contents[0])
Image_URL = result_dictionary['image']
Movie_ID = result_dictionary['url']
Image_ID_Name = re.sub('/|title', '', Movie_ID)
title = Movie_IMDb_Page.title.text
description = Movie_IMDb_Page.find('div','summary_text').text.strip()
url_response = urllib.request.urlopen(Image_URL)
print(url_response)
# Python Requests Tutorial- Request Web Pages, Download Images, POST Data, Read JSON, and More.mp4
print(Image_URL) #Print URL
Image_URL_request = requests.get(Image_URL)
try:
os.mkdir(os.path.join(os.getcwd(), "POSTER"))
except:
pass
os.chdir(os.path.join(os.getcwd(), "POSTER"))
with open(Image_ID_Name + '.jpg', 'wb') as f:
f.write(Image_URL_request.content)
Actors_List =[]
Directors_List =[]
Creators_List =[]
Genre_List =[]
Movie_ID = result_dictionary['url']
Actors = result_dictionary['actor']
Directors = result_dictionary['director']
Creators = result_dictionary['creator']
Genres = result_dictionary['genre']
print(re.sub('/|title', '', Movie_ID))
print (Movie_ID)
#print (res_str_ID)
#print (Actors)
for index in Actors:
Actors_List.append(str(index[u'name']))
for index in Directors:
Directors_List.append(str(index[u'name']))
#for index in Creators:
# Creators_List.append(str(index[u'name']))
#Method PANDAS
df = pd.DataFrame(Actors_List)
#df.to_csv('Actors_List.csv')
df.to_csv('Actors_List.csv', index=False, header=False) # removes the headers from csv and saves
#Method PANDAS
df = pd.DataFrame(Directors_List)
#df.to_csv('Actors_List.csv')
df.to_csv('Directors_List.csv', index=False, header=False) # removes the headers from csv and saves
print(result_dictionary['contentRating'])
print(Genres)
print(Actors_List)
print(Directors_List)
print(Creators)
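For reference, here is a minimal sketch (not the asker's code) of the two missing pieces: iterating the rows with iterrows() and collecting every movie's actors and directors into a single combined CSV. It assumes the column really is named Imdb_link as stated and that each page exposes one ld+json block; it also hedges for the case where actor/director comes back as a single dict rather than a list.

import json

import pandas as pd
import requests
from bs4 import BeautifulSoup

def as_list(value):
    # ld+json fields may hold one dict or a list of dicts; normalize to a list
    return value if isinstance(value, list) else [value]

df = pd.read_csv("IMDb_id.csv")
records = []
for _, row in df.iterrows():  # iterrows() yields one row per movie link
    page = requests.get(row["Imdb_link"])
    soup = BeautifulSoup(page.text, "lxml")
    tag = soup.find("script", type="application/ld+json")
    info = json.loads(tag.contents[0])
    records.append({
        "movie": info["url"],
        "actors": "; ".join(a["name"] for a in as_list(info.get("actor", []))),
        "directors": "; ".join(d["name"] for d in as_list(info.get("director", []))),
    })

# one combined CSV instead of separate Actors_List.csv / Directors_List.csv
pd.DataFrame(records).to_csv("actors_directors.csv", index=False)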

Related

Add suffix on duplicates in pandas dataframe Python

I am writing a script to download images. I'm reading an Excel file as a pandas dataframe:
Column A - url links
Column B - Name
Downloaded images will take the name from Column B, for example "A.jpeg".
There will be duplicates in Column B [Name]; in that case I would like to add a suffix to the image name, so the output will be:
A.jpeg
A-1.jpeg
...
import requests
import pandas as pd

df = pd.read_excel(r'C:\Users\exdata1.xlsx')

for index, row in df.iterrows():
    url = row['url']
    file_name = url.split('/')
    r = requests.get(url)
    file_name = (row['name'] + ".jpeg")
    if r.status_code == 200:
        with open(file_name, "wb") as f:
            f.write(r.content)
    print(file_name)
I have been trying cumcount but can't really seem to get it to work.
Appreciate all the help I can get.
You can try:
import requests
import pandas as pd

df = pd.read_excel(r"C:\Users\exdata1.xlsx")

cnt = {}
for index, row in df.iterrows():
    name = row["name"]
    if name not in cnt:
        cnt[name] = 0
        name = f"{name}.jpeg"
    else:
        cnt[name] += 1
        name = f"{name}-{cnt[name]}.jpeg"
    url = row["url"]
    r = requests.get(url)
    if r.status_code == 200:
        with open(name, "wb") as f:
            f.write(r.content)
    print(name)
This will download the files as A.jpeg, A-1.jpeg, A-2.jpeg, ...
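Since the question mentions cumcount, here is a sketch of that approach: number the duplicate names up front with groupby().cumcount(), then download. Column names url and name are taken from the question.

import pandas as pd
import requests

df = pd.read_excel(r"C:\Users\exdata1.xlsx")

# cumcount() numbers repeated names 0, 1, 2, ... within each group of equal names
dup = df.groupby("name").cumcount()
df["file_name"] = df["name"] + dup.map(lambda n: "" if n == 0 else f"-{n}") + ".jpeg"

for _, row in df.iterrows():
    r = requests.get(row["url"])
    if r.status_code == 200:
        with open(row["file_name"], "wb") as f:
            f.write(r.content)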

How to extract text from multiple pdf in a location with specific line and store in Excel?

I have 100 PDFs stored in one location and I want to extract text from them and store it in Excel. (An image of a sample PDF page was attached to the question.) From page 1 I want: bid no, end date, item category, and organisation name. Also needed: OEM Average Turnover (Last 3 Years), Years of Past Experience required, MSE Exemption for Years of Experience and Turnover, Startup Exemption for Years of Experience and Turnover, Estimated Bid Value, EMD Required, and Consignee address.
Tika is one of the Python packages that you can use to extract the data from your PDF files.
In the example below I'm using Tika and regular expressions to extract these five data elements:
bid no
end date
item category
organisation name
total quantity
import re as regex
from tika import parser

parse_entire_pdf = parser.from_file('2022251527199.pdf', xmlContent=True)
for key, values in parse_entire_pdf.items():
    if key == 'content':
        bid_number = regex.search(r'(Bid Number:)\W(GEM\W\d{4}\W[A-Z]\W\d+)', values)
        print(bid_number.group(2))
        # GEM/2022/B/1916455

        bid_end_date = regex.search(r'(Bid End Date\WTime)\W(\d{2}-\d{2}-\d{4}\W\d{2}:\d{2}:\d{2})', values)
        print(bid_end_date.group(2))
        # 21-02-2022 15:00:00

        org_name = regex.search(r'(Organisation Name)\W(.*)', values)
        print(org_name.group(2))
        # State Election Commission (sec), Gujarat

        item_category = regex.search(r'(Item Category)\W(.*)', values)
        print(item_category.group(2))
        # Desktop Computers (Q2) , Computer Printers (Q2)

        total_quantity = regex.search(r'(Total Quantity)\W(\d+)', values)
        print(total_quantity.group(2))
        # 18
Here is one way to write out the extracted data to a CSV file:
import csv
import re as regex
from tika import parser

document_elements = []

# processing 2 documents
documents = ['202225114747453.pdf', '2022251527199.pdf']
for doc in documents:
    parse_entire_pdf = parser.from_file(doc, xmlContent=True)
    for key, values in parse_entire_pdf.items():
        if key == 'content':
            bid_number = regex.search(r'(Bid Number:)\W(GEM\W\d{4}\W[A-Z]\W\d+)', values)
            bid_end_date = regex.search(r'(Bid End Date\WTime)\W(\d{2}-\d{2}-\d{4}\W\d{2}:\d{2}:\d{2})', values)
            org_name = regex.search(r'(Organisation Name)\W(.*)', values)
            item_category = regex.search(r'(Item Category)\W(.*)', values)
            total_quantity = regex.search(r'(Total Quantity)\W(\d+)', values)
            document_elements.append([bid_number.group(2),
                                      bid_end_date.group(2),
                                      org_name.group(2),
                                      item_category.group(2),
                                      total_quantity.group(2)])

with open("out.csv", "w", newline="") as f:
    headerList = ['bid_number', 'bid_end_date', 'org_name', 'item_category', 'total_quantity']
    writer = csv.writer(f)
    writer.writerow(headerList)
    writer.writerows(document_elements)
Here is the additional code that you asked for in the comments.
import csv
import os
import re as regex
from tika import parser

document_elements = []
image_directory = "pdf_files"
image_directory_abspath = os.path.abspath(image_directory)

for dirpath, dirnames, filenames in os.walk(image_directory_abspath):
    for filename in [f for f in filenames if f.endswith(".pdf")]:
        parse_entire_pdf = parser.from_file(os.path.join(dirpath, filename), xmlContent=True)
        for key, values in parse_entire_pdf.items():
            if key == 'content':
                bid_number = regex.search(r'(Bid Number:)\W(GEM\W\d{4}\W[A-Z]\W\d+)', values)
                bid_end_date = regex.search(r'(Bid End Date\WTime)\W(\d{2}-\d{2}-\d{4}\W\d{2}:\d{2}:\d{2})', values)
                org_name = regex.search(r'(Organisation Name)\W(.*)', values)
                item_category = regex.search(r'(Item Category)\W(.*)', values)
                total_quantity = regex.search(r'(Total Quantity)\W(\d+)', values)
                document_elements.append([bid_number.group(2),
                                          bid_end_date.group(2),
                                          org_name.group(2),
                                          item_category.group(2),
                                          total_quantity.group(2)])

with open("out.csv", "w", newline="") as f:
    headerList = ['bid_number', 'bid_end_date', 'org_name', 'item_category', 'total_quantity']
    writer = csv.writer(f)
    writer.writerow(headerList)
    writer.writerows(document_elements)
SPECIAL NOTE: I noted that some PDFs don't have an org_name, so you will have to figure out how to handle those, with either an N/A, None, or Null.
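One way to do that (a sketch, not part of the original answer) is to wrap each lookup in a small helper so a failed regex.search() yields a placeholder instead of raising AttributeError on .group(); regex and values here are the names from the code above.

def extract(pattern, text, default="N/A"):
    # return group(2) of the match, or the default when the field is absent
    match = regex.search(pattern, text)
    return match.group(2) if match else default

org_name = extract(r'(Organisation Name)\W(.*)', values)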
If you want to extract data from PDF tables to Excel, you can use tabula (https://tabula.technology/). It's actually pretty good for this kind of thing.
The following code might help you get started:
import os
import pandas as pd
import tabula

pdf_folder = 'C:\\PDF extract\\pdf\\'
paths = [pdf_folder + fn for fn in os.listdir(pdf_folder) if fn.endswith('.pdf')]

for path in paths:
    listdf = tabula.read_pdf(path, encoding='latin1', pages='all', nospreadsheet=True, multiple_tables=True)
    path = path.replace('pdf', 'csv')
    df_concat = pd.concat(listdf)
    df_concat.to_csv(path, index=False)
sourced from: looping through pdf files with tabulizer in python

How to iterate a list of URLs using requests.get?

I have this code:

import csv
import requests
from io import BytesIO, TextIOWrapper
from zipfile import ZipFile

url = "https://www.reformagkh.ru/opendata/export/"
regions = ["150", "101"]

csv_files = []
for region in regions:
    result = requests.get(url, params={"t": region})
    zf = ZipFile(BytesIO(result.content))
    for filename in zf.namelist():
        if filename.endswith(".csv"):
            file = zf.open(filename)
            csv_files.append(file)
    if len(csv_files) == 1:
        reader = csv.reader(TextIOWrapper(file, 'utf-8'))
        for row in reader:
            print(row)
    else:
        print("Error")
I have two links, each pointing to a zip archive with CSV files inside, and I need to open and read them. The main question is how to work with a list of URLs and open them one by one. When I try to debug and fix it, I get a 400 error and a problem with the loop. Could somebody advise me how to handle it? These are the links I need to open and handle:
['https://www.reformagkh.ru/opendata/export/150',
'https://www.reformagkh.ru/opendata/export/101']
You need to prepare the URL in the loop instead of passing region as params.
Use f-strings to prepare the URL on Python 3.6+:

for region in regions:
    url_cur = f"{url}{region}"
    result = requests.get(url_cur)

Use format() if you are using a Python version older than 3.6:

for region in regions:
    url_cur = "{}{}".format(url, region)
    result = requests.get(url_cur)
You also need to create the csv_files list newly for each url.
The complete code would be:
url = "https://www.reformagkh.ru/opendata/export/"
regions = ["150", "101"]
for region in regions:
cur_url = f"{url}{region}"
result = requests.get(cur_url)
zf = ZipFile(BytesIO(result.content))
csv_files = [] # create a new list everytime
for filename in zf.namelist():
if filename.endswith(".csv"):
file = zf.open(filename)
csv_files.append(file)
if len(csv_files) == 1:
reader = csv.reader(TextIOWrapper(file, 'utf-8'))
for row in reader:
print(row)
else:
print("Error")
regions = ["150", "101"]

csv_files = []
for region in regions:
    url = "https://www.reformagkh.ru/opendata/export/%s" % region
    result = requests.get(url)
    zf = ZipFile(BytesIO(result.content))
    for filename in zf.namelist():
        if filename.endswith(".csv"):
            file = zf.open(filename)
            csv_files.append(file)
    if len(csv_files) == 1:
        reader = csv.reader(TextIOWrapper(file, 'utf-8'))
        for row in reader:
            print(row)
    else:
        print("Error")
I think it is much easier with %s. I often use the same method.
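As a variation (a sketch against the same URLs), you can read every CSV inside each downloaded zip as it arrives, instead of collecting open handles and only reading when exactly one was found:

import csv
import requests
from io import BytesIO, TextIOWrapper
from zipfile import ZipFile

base = "https://www.reformagkh.ru/opendata/export/"
for region in ["150", "101"]:
    resp = requests.get(base + region)
    resp.raise_for_status()  # surface HTTP errors such as the 400 early
    with ZipFile(BytesIO(resp.content)) as zf:
        for name in zf.namelist():
            if name.endswith(".csv"):
                with zf.open(name) as fh:
                    for row in csv.reader(TextIOWrapper(fh, "utf-8")):
                        print(row)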

Python nested for loop ordering

I am having issues getting a nested for loop to output individual CSV files for an API call. The API call is paginated, so we have to query the API multiple times and append the data; we also have to loop through every exchange.
The way the code is now, it only outputs the last page of data for a couple of exchanges, and the following exchanges just have 'name' in the CSV, no other data...
from pycoingecko import CoinGeckoAPI
cg = CoinGeckoAPI()
import pandas as pd
import time

## grab a list of all the exchanges listed on CG
ex_list = cg.get_exchanges_list()
# normalise the json
df = pd.json_normalize(ex_list)
# output to csv
#df.to_csv('exchange_list.csv', encoding='utf-8', index=False)
# make a list with just one column
id_list = df['id'].to_list()

def read_exchange_tickers():
    for x in id_list:
        for i in range(1, 10):
            appended_data = []
            data = cg.get_exchanges_tickers_by_id(x, page=str(i))
            appended_data.append(data)
            #time.sleep(10)
        # define path + filename
        path = 'ticker_lists/'
        filename = path + x + '_' + '.csv'
        appended_data = pd.json_normalize(appended_data, record_path=['tickers'], meta=['name'])
        appended_data.to_csv(filename, encoding='utf-8', index=False)
        time.sleep(10)

read_exchange_tickers()
You should collect all data for each id and then save the data to file.
def read_exchange_tickers():
    for x in id_list:
        appended_data = []
        # collect all the data for current id
        for i in range(1, 10):
            data = cg.get_exchanges_tickers_by_id(x, page=str(i))
            appended_data.append(data)
        # save the data to csv
        path = 'ticker_lists/'
        filename = path + x + '_' + '.csv'
        appended_data = pd.json_normalize(appended_data, record_path=['tickers'], meta=['name'])
        appended_data.to_csv(filename, encoding='utf-8', index=False)
        time.sleep(10)
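A possible refinement (a sketch, assuming each response is a dict with a 'tickers' list, as the record_path above implies): normalize each page as it arrives and stop paging once an empty page comes back, rather than always requesting pages 1 through 9.

import time

import pandas as pd

def read_exchange_tickers():
    for x in id_list:
        pages = []
        for i in range(1, 10):
            data = cg.get_exchanges_tickers_by_id(x, page=str(i))
            if not data.get('tickers'):  # no more pages for this exchange
                break
            pages.append(pd.json_normalize(data, record_path=['tickers'], meta=['name']))
        if pages:
            pd.concat(pages).to_csv('ticker_lists/' + x + '_.csv',
                                    encoding='utf-8', index=False)
        time.sleep(10)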

Python - Web Scraping - BeautifulSoup & CSV

I am hoping to extract the change in cost of living of one city measured against many other cities. I plan to list the cities I would like to compare in a CSV file and use this list to create the web link that takes me to the website with the information I am looking for.
Here is the link to an example: http://www.expatistan.com/cost-of-living/comparison/phoenix/new-york-city
Unfortunately I am running into several challenges. Any assistance with the following challenges is greatly appreciated!
1. The output only shows the percentage, but no indication whether it is more expensive or cheaper. For the example listed above, my output based on the current code shows 48%, 129%, 63%, 43%, 42%, and 42%. I tried to correct for this by adding an 'if-statement' to add a '+' sign if it is more expensive, or a '-' sign if it is cheaper. However, this 'if-statement' is not functioning correctly.
2. When I write the data to a CSV file, each of the percentages is written to a new row. I can't seem to figure out how to write it as a list on one line.
3. (related to item 2) When I write the data to a CSV file for the example listed above, the data is written in the format listed below. How can I correct the format and have the data written in the preferred format listed below (also without the percentage sign)?
CURRENT CSV FORMAT (Note: 'if-statement' not functioning correctly):
City,Food,Housing,Clothes,Transportation,Personal Care,Entertainment
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,8,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,1,2,9,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,6,3,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,3,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,2,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,2,%
PREFERRED CSV FORMAT:
City,Food,Housing,Clothes,Transportation,Personal Care,Entertainment
new-york-city, 48,129,63,43,42,42
Here is my current code:
import requests
import csv
from bs4 import BeautifulSoup

# Read text file
Textfile = open("City.txt")
Textfilelist = Textfile.read()
Textfilelistsplit = Textfilelist.split("\n")

HomeCity = 'Phoenix'

i = 0
while i < len(Textfilelistsplit):
    url = "http://www.expatistan.com/cost-of-living/comparison/" + HomeCity + "/" + Textfilelistsplit[i]
    page = requests.get(url).text
    soup_expatistan = BeautifulSoup(page)

    # Prepare CSV writer.
    WriteResultsFile = csv.writer(open("Expatistan.csv", "w"))
    WriteResultsFile.writerow(["City", "Food", "Housing", "Clothes", "Transportation", "Personal Care", "Entertainment"])

    expatistan_table = soup_expatistan.find("table", class_="comparison")
    expatistan_titles = expatistan_table.find_all("tr", class_="expandable")
    for expatistan_title in expatistan_titles:
        percent_difference = expatistan_title.find("th", class_="percent")
        percent_difference_title = percent_difference.span['class']
        if percent_difference_title == "expensiver":
            WriteResultsFile.writerow(Textfilelistsplit[i] + '+' + percent_difference.span.string)
        else:
            WriteResultsFile.writerow(Textfilelistsplit[i] + '-' + percent_difference.span.string)
    i += 1
Answers:
Question 1: the class of the span is a list, you need to check if expensiver is inside this list. In other words, replace:
if percent_difference_title == "expensiver"
with:
if "expensiver" in percent_difference.span['class']
Questions 2 and 3: you need to pass a list of column values to writerow(), not a string. And, since you want only one record per city, call writerow() outside of the loop (over the trs).
Other issues:
open csv file for writing before the loop
use with context managers while working with files
try to follow PEP8 style guide
Here's the code with modifications:
import requests
import csv
from bs4 import BeautifulSoup

BASE_URL = 'http://www.expatistan.com/cost-of-living/comparison/{home_city}/{city}'
home_city = 'Phoenix'

with open('City.txt') as input_file:
    with open("Expatistan.csv", "w") as output_file:
        writer = csv.writer(output_file)
        writer.writerow(["City", "Food", "Housing", "Clothes", "Transportation", "Personal Care", "Entertainment"])

        for line in input_file:
            city = line.strip()
            url = BASE_URL.format(home_city=home_city, city=city)
            soup = BeautifulSoup(requests.get(url).text)

            table = soup.find("table", class_="comparison")
            differences = []
            for title in table.find_all("tr", class_="expandable"):
                percent_difference = title.find("th", class_="percent")
                if "expensiver" in percent_difference.span['class']:
                    differences.append('+' + percent_difference.span.string)
                else:
                    differences.append('-' + percent_difference.span.string)

            writer.writerow([city] + differences)
For the City.txt containing just one new-york-city line, it produces Expatistan.csv with the following content:
City,Food,Housing,Clothes,Transportation,Personal Care,Entertainment
new-york-city,+48%,+129%,+63%,+43%,+42%,+42%
Make sure you understand what changes I have made. Let me know if you need further help.
csv.writer.writerow() takes a sequence and makes each element a column; normally you'd give it a list with columns, but you are passing in strings instead; that'll add individual characters as columns instead.
Just build a list, then write it to the CSV file.
First, open the CSV file once, not for every separate city; you are clearing out the file every time you open it.
import requests
import csv
from bs4 import BeautifulSoup

HomeCity = 'Phoenix'

with open("City.txt") as cities, open("Expatistan.csv", "wb") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["City", "Food", "Housing", "Clothes",
                     "Transportation", "Personal Care", "Entertainment"])

    for line in cities:
        city = line.strip()
        url = "http://www.expatistan.com/cost-of-living/comparison/{}/{}".format(
            HomeCity, city)
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, from_encoding=resp.encoding)
        titles = soup.select("table.comparison tr.expandable")

        row = [city]
        for title in titles:
            percent_difference = title.find("th", class_="percent")
            changeclass = percent_difference.span['class']
            change = percent_difference.span.string
            if "expensiver" in changeclass:
                change = '+' + change
            else:
                change = '-' + change
            row.append(change)
        writer.writerow(row)
So, first of all, one passes the writerow method an iterable, and each object in that iterable gets written with commas separating them. So if you give it a string, then each character gets separated:
WriteResultsFile.writerow('hello there')
writes
h,e,l,l,o, ,t,h,e,r,e
But
WriteResultsFile.writerow(['hello', 'there'])
writes
hello,there
That's why you are getting results like
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,8,%
The rest of your problems are errors in your webscraping. First of all, when I scrape the site, searching for tables with CSS class "comparison" gives me None. So I had to use
expatistan_table = soup_expatistan.find("table","comparison")
Now, the reason your "if statement is broken" is because
percent_difference.span['class']
returns a list. If we modify that to
percent_difference.span['class'][0]
things will work the way you expect.
Now, your real issue is that inside the innermost loop you are finding the % change in price for the individual items. You want these as items in your row of price differences, not as individual rows. So, I declare an empty list items, append percent_difference.span.string to it, and then write the row outside the innermost loop, like so:

items = []
for expatistan_title in expatistan_titles:
    percent_difference = expatistan_title.find("th", "percent")
    percent_difference_title = percent_difference.span["class"][0]
    print(percent_difference_title)
    if percent_difference_title == "expensiver":
        items.append('+' + percent_difference.span.string)
    else:
        items.append('-' + percent_difference.span.string)

row = [Textfilelistsplit[i]]
row.extend(items)
WriteResultsFile.writerow(row)
The final error is that in the while loop you re-open the csv file and overwrite everything, so you only have the final city in the end. Accounting for all these errors (many of which you should have been able to find without help) leaves us with:
# Prepare CSV writer (opened once, header written once).
WriteResultsFile = csv.writer(open("Expatistan.csv", "w"))
WriteResultsFile.writerow(["City", "Food", "Housing", "Clothes", "Transportation", "Personal Care", "Entertainment"])

i = 0
while i < len(Textfilelistsplit):
    url = "http://www.expatistan.com/cost-of-living/comparison/" + HomeCity + "/" + Textfilelistsplit[i]
    page = requests.get(url).text
    print(url)
    soup_expatistan = BeautifulSoup(page)

    expatistan_table = soup_expatistan.find("table", "comparison")
    expatistan_titles = expatistan_table.find_all("tr", "expandable")

    items = []
    for expatistan_title in expatistan_titles:
        percent_difference = expatistan_title.find("th", "percent")
        percent_difference_title = percent_difference.span["class"][0]
        print(percent_difference_title)
        if percent_difference_title == "expensiver":
            items.append('+' + percent_difference.span.string)
        else:
            items.append('-' + percent_difference.span.string)

    row = [Textfilelistsplit[i]]
    row.extend(items)
    WriteResultsFile.writerow(row)
    i += 1
YAA - Yet Another Answer.
Unlike the other answers, this treats the data as a series of key-value pairs, i.e. a list of dictionaries, which are then written to CSV. A list of wanted fields is provided to the csv writer (DictWriter), which discards additional information (beyond the specified fields) and blanks missing information. Also, should the order of the information on the original page change, this solution is unaffected.
I also assume you are going to open the CSV file in something like Excel. Additional parameters need to be given to the csv writer for this to happen nicely (see the dialect parameter). Given that we are not sanitising the returned data, we should explicitly delimit it with unconditional quoting (see the quoting parameter).
import csv
import requests
from bs4 import BeautifulSoup

# Read text file
with open("City.txt") as cities_h:
    cities = cities_h.readlines()

home_city = "Phoenix"
city_data = []
for city in cities:
    city = city.strip()  # drop the newline readlines() leaves on each entry
    url = "http://www.expatistan.com/cost-of-living/comparison/%s/%s" % (home_city, city)
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, from_encoding=resp.encoding)
    titles = soup.select("table.comparison tr.expandable")
    if titles:
        data = {}
        for title in titles:
            name = title.find("th", class_="clickable")
            diff = title.find("th", class_="percent")
            exp = bool(diff.find("span", class_="expensiver"))
            data[name.text] = ("+" if exp else "-") + diff.span.text
        data["City"] = soup.find("strong", class_="city-2").text
        city_data.append(data)

with open("Expatistan.csv", "w") as csv_h:
    fields = [
        "City",
        "Food",
        "Housing",
        "Clothes",
        "Transportation",
        "Personal Care",
        "Entertainment",
    ]
    # Prepare CSV writer.
    writer = csv.DictWriter(
        csv_h,
        fields,
        quoting=csv.QUOTE_ALL,
        extrasaction="ignore",
        dialect="excel",
        lineterminator="\n",
    )
    writer.writeheader()
    writer.writerows(city_data)
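For a City.txt containing just new-york-city, the output would look along these lines (values illustrative, reusing the percentages from the earlier example; every field is quoted because of QUOTE_ALL):

"City","Food","Housing","Clothes","Transportation","Personal Care","Entertainment"
"New York City","+48%","+129%","+63%","+43%","+42%","+42%"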