Pandas dataframe check if string is not in column to append - python

I have a pandas DataFrame with a column called artist. I would like to append a new row only if the new artist name is not already in this column.
I tried the following, but with no success:
if (all_data != name.all(axis = 0)):
    all_data = all_data.append({'artist':str(name), 'netWorth':str(worth.strip())}, ignore_index = True)
This is all the code I have:
def get_webpage(i, url):
    URL = url + str(i)
    response = requests.get(URL)
    return bs4.BeautifulSoup(response.text, 'html.parser')

COLUMNS = ['artist', 'netWorth']
all_data = pd.DataFrame(columns=COLUMNS)

def scrape(soup):
    artists = soup.find_all('article', class_='thumb-wrap')
    for ar in artists:
        name = ar.h3.a.text
        worth = ar.div.find('div', class_='bc-networth').text
        global all_data
        if (all_data['artist'] != name).any():
            all_data = all_data.append({'artist':str(name), 'netWorth':str(worth.strip())}, ignore_index = True)

i = 1
url = 'http://www.therichest.com/celebnetworth-category/celeb/singer/page/'
while i <= 14:
    soup = get_webpage(i, url)
    i = i + 1
    data = scrape(soup)

i = 1
url = 'http://www.therichest.com/celebnetworth-category/celeb/musician/page/'
while i <= 7:
    soup = get_webpage(i, url)
    i = i + 1
    data = scrape(soup)

I believe you need to check only the column artist:
if (all_data['artist'] != str(name)).all():
Sample:
all_data = pd.DataFrame({'netWorth':[5,3],
                         'artist':list('ab')})
print (all_data)
   netWorth artist
0         5      a
1         3      b
name = 'a'
b = 10

if (all_data['artist'] != str(name)).all():
    all_data = all_data.append({'artist':str(name), 'netWorth':b }, ignore_index = True)

print (all_data)
   netWorth artist
0         5      a
1         3      b
name = 'd'
b = 10

if (all_data['artist'] != name).all():
    all_data = all_data.append({'artist':str(name), 'netWorth':b }, ignore_index = True)

print (all_data)
   netWorth artist
0         5      a
1         3      b
2        10      d
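As a side note, DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so on newer pandas the same "append only if the artist is new" check can be written with pd.concat. A minimal sketch, reusing the variables from the sample above:

import pandas as pd

all_data = pd.DataFrame({'netWorth': [5, 3], 'artist': list('ab')})
name = 'd'
b = 10

# append the row only when the artist is not present yet
if (all_data['artist'] != name).all():
    new_row = pd.DataFrame([{'artist': str(name), 'netWorth': b}])
    all_data = pd.concat([all_data, new_row], ignore_index=True)

print(all_data)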

Related

Create merged df based on the url list [pandas]

I was able to extract the data from the url_query URL, but additionally I would like to get the data from the list of URLs created from the query['ids'] column of the dataframe. Please see the current logic below:
url = 'https://instancename.some-platform.com/api/now/table/data?display_value=true&'
team = 'query=group_name=123456789'
url_query = url+team
dataframe query:
                                ids
0  aaabbb1cccdddeee4ffggghhhhh5iijj
1  aa1bbb2cccdddeee5ffggghhhhh6iijj
issue_list = []
for issue in query['ids']:
    issue_list.append(f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}')

response = requests.get(url_query, headers=headers, auth=auth, proxies=proxies)
data = response.json()

def api_response(k):
    dct = dict(
        event_id = k['number'],
        created_time = k['created'],
        status = k['status'],
        created_by = k['raised_by'],
        short_desc = k['short_description'],
        group = k['team']
    )
    return dct

raw_data = []
for p in data['result']:
    rec = api_response(p)
    raw_data.append(rec)

df = pd.DataFrame.from_records(raw_data)
The url_query response extracts what I need, but the key point is that I would like to add to the existing df the data from the issue_list URLs. I don't know how to pass the issue_list to the request. I've tried response = requests.get(issue_list, headers=headers, auth=auth, proxies=proxies), but I got an invalid schema error.
You can create a list of DataFrames, using the per-issue query q instead of url_query, and at the end join them together with concat:
dfs = []
for issue in query['ids']:
    q = f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}'
    response = requests.get(q, headers=headers, auth=auth, proxies=proxies)
    data = response.json()

    raw_data = [api_response(k) for k in data['result']]
    df = pd.DataFrame.from_records(raw_data)
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)

KeyError for column that exists in dataframe

This works if I remove the schedule, but if I leave it in I receive a KeyError for 'Symbol':
def tweet_and_archive(sl):
    ticker_l = []
    name_l = []
    price_l = []
    price_out_l = []
    date_time = []
    for index, row in sl.iterrows():
        Stock = row['Symbol']
        Price = row['Price']
        Price_out = row['Price Out']
        name_ = row['Name']
        Date_ = row['DateTime']
        if ...

schedule.every().monday.at('12:31').do(lambda: tweet_and_archive(short_list))

while True:
    schedule.run_pending()
    time.sleep(1)
This is the short_list dataframe:
  Symbol                             Name  Price  % Change  Price Out    DateTime
0  ANGPY  Anglo American Platinum Limited  25.82      7.14        NaN  28/02/2022

Python function returning None when trying to display dataframe

I am trying to fetch a URL for a BBC recipe, extract the information, and put it into a dataframe. When I try to run the function I made, the result I get is None, and I am unsure why, because it worked before I tried to organise the code into a function.
columns_name = ['title', 'total_time', 'image', 'ingredients', 'rating_val',
                'rating_count',
                'category', 'cuisine', 'diet', 'vegan', 'vegetarian', 'url']
url = 'https://www.bbc.co.uk/food/recipes/avocado_pasta_with_peas_31700'
def print_dataframe(df):
    return df

def insert_df(name,totalTime,image,rating_count,rating_value,Category,Ingredients,diet,vegan,vegetarian,url,df):
    new_row = {'name':name,'totalTime':totalTime,'image':image,'rating_count':rating_count,'rating_value':rating_value,'Category':Category,'Ingredients':Ingredients,'diet':diet,'vegan':vegan,'vegetarian':vegetarian,'url':url}
    df = df.append(new_row, ignore_index=True)

def collect_page_data(url,columns_name):
    df = pd.DataFrame(columns = columns_name)
    page = requests.get(url)
    page_soup = BeautifulSoup(page.text,'html.parser')
    res = page_soup.find("script", {"type": "application/ld+json"})
    data = json.loads(res.text)
    name = data['author']['name']
    image = data['image']
    rating_count = data['aggregateRating']['ratingCount']
    rating_value = data['aggregateRating']['ratingValue']
    Category = data['recipeCategory']
    Ingredients = data['recipeIngredient']
    diet = data['suitableForDiet'][1]
    vegan = data['suitableForDiet'][2]
    vegetarian = data['suitableForDiet'][3]
    prepTime = data['prepTime']
    cookTime = data['cookTime']
    l = ['P','T','M']
    for i in l:
        prepTime = prepTime.replace(i,"")
        cookTime = cookTime.replace(i,"")
    totalTime = int(prepTime) + int(cookTime)
    insert_df(name,totalTime,image,rating_count,rating_value,Category,Ingredients,diet,vegan,vegetarian,url,df)
    print_dataframe(df)

print(collect_page_data(url,columns_name))
You have a problem with two returns.
First:
In insert_df() you use
df = df.append(...)
which creates a local df inside insert_df() - it doesn't change the external df.
You should rather use return:
return df.append(...)
and execute the function as
df = insert_df(...)
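A minimal, self-contained sketch of why the reassignment inside the function is lost and how return fixes it (using pd.concat so it also runs on pandas >= 2.0, where DataFrame.append was removed):

import pandas as pd

df = pd.DataFrame({'a': [1]})

def add_row_no_return(df):
    # rebinds only the local name 'df'; the caller's DataFrame is untouched
    df = pd.concat([df, pd.DataFrame({'a': [2]})], ignore_index=True)

def add_row_with_return(df):
    # returns the new DataFrame so the caller can rebind its own variable
    return pd.concat([df, pd.DataFrame({'a': [2]})], ignore_index=True)

add_row_no_return(df)
print(len(df))    # still 1

df = add_row_with_return(df)
print(len(df))    # now 2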
Second:
At the end of collect_page_data() you run
print_dataframe(df)
which takes df and only returns it back - the value is never returned from collect_page_data() itself.
At the end of collect_page_data() you should run
return df
And this is the full code:
In my version of BeautifulSoup I had to use res.string instead of res.text to get the text.
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json

# --- functions ---

def insert_df(name, totalTime, image, rating_count, rating_value, Category, Ingredients, diet, vegan, vegetarian, url, df):
    new_row = {'name':name,'totalTime':totalTime,'image':image,'rating_count':rating_count,'rating_value':rating_value,'Category':Category,'Ingredients':Ingredients,'diet':diet,'vegan':vegan,'vegetarian':vegetarian,'url':url}
    return df.append(new_row, ignore_index=True)

def collect_page_data(url, columns_name):
    df = pd.DataFrame(columns=columns_name)
    page = requests.get(url)
    page_soup = BeautifulSoup(page.text, 'html.parser')
    res = page_soup.find("script", {"type": "application/ld+json"})
    #data = json.loads(res.text)
    data = json.loads(res.string)
    name = data['author']['name']
    image = data['image']
    rating_count = data['aggregateRating']['ratingCount']
    rating_value = data['aggregateRating']['ratingValue']
    Category = data['recipeCategory']
    Ingredients = data['recipeIngredient']
    diet = data['suitableForDiet'][1]
    vegan = data['suitableForDiet'][2]
    vegetarian = data['suitableForDiet'][3]
    prepTime = data['prepTime']
    cookTime = data['cookTime']
    l = ['P','T','M']
    for i in l:
        prepTime = prepTime.replace(i, "")
        cookTime = cookTime.replace(i, "")
    totalTime = int(prepTime) + int(cookTime)
    df = insert_df(name, totalTime, image, rating_count, rating_value, Category, Ingredients, diet, vegan, vegetarian, url, df)
    return df

# --- main ---

columns_name = [
    'title', 'total_time', 'image', 'ingredients', 'rating_val',
    'rating_count', 'category', 'cuisine', 'diet', 'vegan', 'vegetarian', 'url'
]

url = 'https://www.bbc.co.uk/food/recipes/avocado_pasta_with_peas_31700'

df = collect_page_data(url, columns_name)
print(df.iloc[0])
Result:
(I get only the first row, so I have a Series which displays the data as a column)
title                                                         NaN
total_time                                                    NaN
image           [https://food-images.files.bbci.co.uk/food/rec...
ingredients                                                   NaN
rating_val                                                    NaN
rating_count                                                   22
category                                                      NaN
cuisine                                                       NaN
diet                             http://schema.org/LowCalorieDiet
vegan                                 http://schema.org/VeganDiet
vegetarian                       http://schema.org/VegetarianDiet
url             https://www.bbc.co.uk/food/recipes/avocado_pas...
Category                                              Main course
Ingredients     [375g/13oz pasta, such as penne or fusilli, 1 ...
name                                               Nadiya Hussain
rating_value                                             4.363636
totalTime                                                    40.0
Name: 0, dtype: object
EDIT:
In my opinion, insert_df() is totally unnecessary and you could run its code directly in collect_page_data(). That would make the code more readable.
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json

# --- functions ---

def collect_page_data(url, columns_name):
    # --- scraping ---
    page = requests.get(url)
    page_soup = BeautifulSoup(page.text, 'html.parser')
    res = page_soup.find("script", {"type": "application/ld+json"})
    #data = json.loads(res.text)
    data = json.loads(res.string)

    prep_time = data['prepTime']
    cook_time = data['cookTime']
    for char in ['P', 'T', 'M']:
        prep_time = prep_time.replace(char, "")
        cook_time = cook_time.replace(char, "")
    total_time = int(prep_time) + int(cook_time)

    # --- dataframe ---
    df = pd.DataFrame(columns=columns_name)
    df = df.append({
        'name': data['author']['name'],
        'total_time': total_time,
        'image': data['image'],
        'rating_count': data['aggregateRating']['ratingCount'],
        'rating_value': data['aggregateRating']['ratingValue'],
        'category': data['recipeCategory'],
        'ingredients': data['recipeIngredient'],
        'diet': data['suitableForDiet'][1],
        'vegan': data['suitableForDiet'][2],
        'vegetarian': data['suitableForDiet'][3],
        'url': url
    }, ignore_index=True)

    return df

# --- main ---

columns_name = [
    'title', 'name', 'total_time', 'image',
    'ingredients', 'rating_value', 'rating_count',
    'category', 'cuisine', 'diet', 'vegan', 'vegetarian', 'url'
]

url = 'https://www.bbc.co.uk/food/recipes/avocado_pasta_with_peas_31700'

df = collect_page_data(url, columns_name)
print(df.iloc[0])

How to convert text table to dataframe

I am trying to scrape the "PRINCIPAL STOCKHOLDERS" table from the linked text file and convert it to a CSV file. Right now I am only half successful: I can locate the table and parse it, but somehow I cannot convert the text table to a standard one. My code is attached. Can someone help me with it?
url = r'https://www.sec.gov/Archives/edgar/data/1034239/0000950124-97-003372.txt'

# Different approach, the first approach does not work
filing_url = requests.get(url)
content = filing_url.text
splited_data = content.split('\n')

table_title = 'PRINCIPAL STOCKHOLDERS'
END_TABLE_LINE = '- ------------------------'

def find_no_line_start_table(table_title, splited_data):
    found_no_lines = []
    for index, line in enumerate(splited_data):
        if table_title in line:
            found_no_lines.append(index)
    return found_no_lines

table_start = find_no_line_start_table(table_title, splited_data)
# I need help with locating the table. If I locate the table with the above function,
# it returns two locations and I have to manually choose the correct one.
table_start = table_start[1]

def get_start_data_table(table_start, splited_data):
    for index, row in enumerate(splited_data[table_start:]):
        if '<C>' in row:
            return table_start + index

def get_end_table(start_table_data, splited_data):
    for index, row in enumerate(splited_data[start_table_data:]):
        if END_TABLE_LINE in row:
            return start_table_data + index

def row(l):
    l = l.split()
    number_columns = 8
    if len(l) >= number_columns:
        data_row = [''] * number_columns
        first_column_done = False
        index = 0
        for w in l:
            if not first_column_done:
                data_row[0] = ' '.join([data_row[0], w])
                if ':' in w:
                    first_column_done = True
            else:
                index += 1
                data_row[index] = w
        return data_row

start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
table = splited_data[start_line : end_line]

# I also need help with converting the text table to a CSV file;
# somehow the following function does not recognize my columns.
def take_table(table):
    owner = []
    Num_share = []
    middle = []
    middle_1 = []
    middle_2 = []
    middle_3 = []
    prior_offering = []
    after_offering = []
    for r in table:
        data_row = row(r)
        if data_row:
            col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8 = data_row
            owner.append(col_1)
            Num_share.append(col_2)
            middle.append(col_3)
            middle_1.append(col_4)
            middle_2.append(col_5)
            middle_3.append(col_6)
            prior_offering.append(col_7)
            after_offering.append(col_8)
    table_data = {'owner': owner, 'Num_share': Num_share, 'middle': middle, 'middle_1': middle_1,
                  'middle_2': middle_2, 'middle_3': middle_3, 'prior_offering': prior_offering,
                  'after_offering': after_offering}
    return table_data

#print (table)
dict_table = take_table(table)
a = pd.DataFrame(dict_table)
a.to_csv('trail.csv')
I think what you need to do is
pd.DataFrame.from_dict(dict_table)
instead of
pd.DataFrame(dict_table)
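For illustration, a minimal sketch with a made-up dict (not the parsed SEC data) showing how from_dict builds the DataFrame from the column lists before writing the CSV:

import pandas as pd

# hypothetical stand-in for dict_table; the real one comes from take_table(table)
dict_table = {
    'owner': ['Alice Smith:', 'Bob Jones:'],
    'Num_share': ['1,000', '2,500'],
    'prior_offering': ['5%', '10%'],
    'after_offering': ['4%', '8%'],
}

a = pd.DataFrame.from_dict(dict_table)
a.to_csv('trail.csv', index=False)
print(a)

Note that every list in the dict has to have the same length, otherwise the constructor raises an error.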

Web scraping data from a JSON source, why do I get only 1 row?

I'm trying to get some information from a website (a webshop) with Python.
I tried this one:
def proba():
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    results = []
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except:
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    df = pd.DataFrame(results, columns = ['Name', 'Price', 'Url'])
    # print(df) ## print df
    df.to_csv(r'/usr/src/Python-2.7.13/test.csv', sep=',', encoding='utf-8-sig', index = False)

while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    while mytime < "23:59:59":
        print mytime
        proba()
        mytime = datetime.now().strftime("%H:%M:%S")
In this webshop there are 9 items, but I see only 1 row in the CSV file.
Not entirely sure what you intend as the end result. Are you wanting to update an existing file, or get the data and write it all out in one go? An example of the latter is shown below, where I add each new dataframe to an overall dataframe and use a return statement so the function call provides each new dataframe; a sketch of the former option follows after the code.
import requests
from datetime import datetime
import pandas as pd

def proba():
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    results = []
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except:
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    df = pd.DataFrame(results, columns = ['Name', 'Price', 'Url'])
    return df

headers = ['Name', 'Price', 'Url']
df = pd.DataFrame(columns = headers)

while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    while mytime < "23:59:59":
        print(mytime)
        dfCurrent = proba()
        mytime = datetime.now().strftime("%H:%M:%S")
        df = pd.concat([df, dfCurrent])
    df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')
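For the other option (updating an existing file as you go), a minimal sketch would be appending each new dataframe straight to the CSV; the helper name, file path, and call pattern below are illustrative only:

import os
import pandas as pd

def append_to_csv(df, path):
    # write the header only when the file does not exist yet,
    # then append rows on every later call
    df.to_csv(path, mode='a', header=not os.path.exists(path),
              index=False, encoding='utf-8')

# illustrative usage with the proba() function above:
# append_to_csv(proba(), r"C:\Users\User\Desktop\test.csv")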
