Append Rows issue while scraping URLs via Python loop - python

I'm looking to visit each URL and return every player image found within the href tags, meaning: visit the URL, follow each player link, and store the profile image link.
I had the right result printing with the code below, but it was pushing the data to Google Sheets one row at a time and ultimately hit a 429 gspread quota error.
My full code is here:
import requests
from bs4 import BeautifulSoup
import gspread

gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1TD4YmhfAsnSL_Fwo1lckEbnUVBQB6VyKC05ieJ7PKCw')
worksheet = sh.get_worksheet(0)

# def get_links(url):
#     data = []
#     req_url = requests.get(url)
#     soup = BeautifulSoup(req_url.content, "html.parser")
#     for td in soup.select('td:has(>a[href^="/player"])'):
#         a_tag = td.a
#         name = a_tag.text
#         player_url = a_tag['href']
#         print(f"Getting {name}")
#         req_player_url = requests.get(
#             f"https://basketball.realgm.com{player_url}")
#         soup_player = BeautifulSoup(req_player_url.content, "html.parser")
#         print(f"soup_player for {name}: {soup_player}")
#         div_profile_box = soup_player.find('div', {'class': 'profile-box'})
#         img_tags = div_profile_box.find_all('img')
#         for i, img_tag in enumerate(img_tags):
#             image_url = img_tag['src']
#             row = {"Name": name, "URL": player_url,
#                    f"Image URL {i}": image_url}
#             data.append(row)
#     return data

def get_links2(url):
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")
    for td in soup.select('td.nowrap'):
        a_tag = td.a
        if a_tag:
            name = a_tag.text
            player_url = a_tag['href']
            pos = td.find_next_sibling('td').text
            print(f"Getting {name}")
            req_player_url = requests.get(
                f"https://basketball.realgm.com{player_url}")
            soup_player = BeautifulSoup(req_player_url.content, "html.parser")
            div_profile_box = soup_player.find("div", class_="profile-box")
            row = {"Name": name, "URL": player_url, "pos_option1": pos}
            row['pos_option2'] = div_profile_box.h2.span.text if div_profile_box.h2.span else None
            for p in div_profile_box.find_all("p"):
                try:
                    key, value = p.get_text(strip=True).split(':', 1)
                    row[key.strip()] = value.strip()
                except:  # not all entries have values
                    pass
            # Add img tags to row dictionary
            img_tags = div_profile_box.find_all('img')
            for i, img in enumerate(img_tags):
                row[f'img_{i+1}'] = img['src']
            data.append(row)
    return data
urls = ["https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/player/All/desc",
        "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/2",
        "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/3",
        "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/4"]
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/5",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/6",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/7",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/8",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/9",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/10",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/11",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/12",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/13",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/14",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/15",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/16",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/17",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/18",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/19",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/20",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/21",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/22",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/23",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/24",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/25",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/26",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/27",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/28",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/29",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/30",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/31",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/32",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/33",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/34",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/35",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/36",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/37",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/38",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/39",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/40",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/41",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/42",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/43",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/44",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/45",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/46",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/47",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/48",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/49",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/50",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/51",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/52",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/53",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/54",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/55",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/56",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/57",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/58",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/59",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/60",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/61",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/62",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/63",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/64",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/65",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/66",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/67",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/68",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/69",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/70",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/71",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/72",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/73",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/74",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/75",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/76",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/77",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/78",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/79",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/80",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/81",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/82",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/83",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/84",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/85",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/86",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/87",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/88",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/89",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/90",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/91",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/92",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/93",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/94",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/95",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/96"]
for url in urls:
    data = get_links2(url)
    for row in data:
        worksheet.insert_row(list(row.values()))
I tried to switch to "append_rows" instead of "insert_row" in my final statement. This created a very confusing error:
Traceback (most recent call last):
  File "c:\Users\AMadle\GLeague Tracking\(A) INTLimgScrape.py", line 175, in <module>
    worksheet.append_rows(list(row.values()))
  File "C:\Python\python3.10.5\lib\site-packages\gspread\worksheet.py", line 1338, in append_rows
    return self.spreadsheet.values_append(range_label, params, body)
  File "C:\Python\python3.10.5\lib\site-packages\gspread\spreadsheet.py", line 149, in values_append
    r = self.client.request("post", url, params=params, json=body)
  File "C:\Python\python3.10.5\lib\site-packages\gspread\client.py", line 86, in request
    raise APIError(response)
gspread.exceptions.APIError: {'code': 400, 'message': 'Invalid value at \'data.values[0]\' (type.googleapis.com/google.protobuf.ListValue), "Jaroslaw Zyskowski"\nInvalid value at \'data.values[1]\' (type.googleapis.com/google.protobuf.ListValue), "/player/Jaroslaw-Zyskowski/Summary/32427"\nInvalid value at \'data.values[2]\' (type.googleapis.com/google.protobuf.ListValue), "TRE"\nInvalid value at \'data.values[3]\' (type.googleapis.com/google.protobuf.ListValue), "SF"\nInvalid value at \'data.values[4]\' (type.googleapis.com/google.protobuf.ListValue), "Trefl Sopot"\nInvalid value at \'data.values[5]\' (type.googleapis.com/google.protobuf.ListValue), "Jul 16, 1992(30 years old)"\nInvalid value at \'data.values[6]\' (type.googleapis.com/google.protobuf.ListValue), "Wroclaw, Poland"\nInvalid value at \'data.values[7]\' (type.googleapis.com/google.protobuf.ListValue), "Poland"\nInvalid value at \'data.values[8]\' (type.googleapis.com/google.protobuf.ListValue), "6-7 (201cm)Weight:220 (100kg)"\nInvalid value at \'data.values[9]\' (type.googleapis.com/google.protobuf.ListValue), "Unrestricted Free Agent"\nInvalid value at \'data.values[10]\' (type.googleapis.com/google.protobuf.ListValue), "Manuel Capicchioni"\nInvalid value at \'data.values[11]\' (type.googleapis.com/google.protobuf.ListValue), "2014 NBA Draft"\nInvalid value at \'data.values[12]\' (type.googleapis.com/google.protobuf.ListValue), "Undrafted"\nInvalid value at \'data.values[13]\' (type.googleapis.com/google.protobuf.ListValue), "Kotwica Kolobrzeg (Poland)"\nInvalid value at \'data.values[14]\' (type.googleapis.com/google.protobuf.ListValue), "/images/nba/4.2/profiles/photos/2006/player_photo.jpg"\nInvalid value at \'data.values[15]\' (type.googleapis.com/google.protobuf.ListValue), "/images/basketball/5.0/team_logos/international/polish/trefl.png"', 'status': 'INVALID_ARGUMENT', 'details': [{'#type': 'type.googleapis.com/google.rpc.BadRequest', 'fieldViolations': [{'field': 'data.values[0]', 'description': 'Invalid value at \'data.values[0]\' (type.googleapis.com/google.protobuf.ListValue), "Jaroslaw Zyskowski"'}, {'field': 'data.values[1]', 'description': 'Invalid value at \'data.values[1]\' (type.googleapis.com/google.protobuf.ListValue), "/player/Jaroslaw-Zyskowski/Summary/32427"'}, {'field': 'data.values[2]', 'description': 'Invalid value at \'data.values[2]\' (type.googleapis.com/google.protobuf.ListValue), "TRE"'}, {'field': 'data.values[3]', 'description': 'Invalid value at \'data.values[3]\' (type.googleapis.com/google.protobuf.ListValue), "SF"'}, {'field': 'data.values[4]', 'description': 'Invalid value at \'data.values[4]\' (type.googleapis.com/google.protobuf.ListValue), "Trefl Sopot"'}, {'field': 'data.values[5]', 'description': 'Invalid value at \'data.values[5]\' (type.googleapis.com/google.protobuf.ListValue), "Jul 16, 1992(30 years old)"'}, {'field': 'data.values[6]', 'description': 'Invalid value at \'data.values[6]\' (type.googleapis.com/google.protobuf.ListValue), "Wroclaw, Poland"'}, {'field': 'data.values[7]', 'description': 'Invalid value at \'data.values[7]\' (type.googleapis.com/google.protobuf.ListValue), "Poland"'}, {'field': 'data.values[8]', 'description': 'Invalid value at \'data.values[8]\' (type.googleapis.com/google.protobuf.ListValue), "6-7 (201cm)Weight:220 (100kg)"'}, {'field': 'data.values[9]', 'description': 'Invalid value at \'data.values[9]\' (type.googleapis.com/google.protobuf.ListValue), "Unrestricted Free Agent"'}, {'field': 'data.values[10]', 'description': 'Invalid value at 
\'data.values[10]\' (type.googleapis.com/google.protobuf.ListValue), "Manuel Capicchioni"'}, {'field': 'data.values[11]', 'description': 'Invalid value at \'data.values[11]\' (type.googleapis.com/google.protobuf.ListValue), "2014 NBA Draft"'}, {'field': 'data.values[12]', 'description': 'Invalid value at \'data.values[12]\' (type.googleapis.com/google.protobuf.ListValue), "Undrafted"'}, {'field': 'data.values[13]', 'description': 'Invalid value at \'data.values[13]\' (type.googleapis.com/google.protobuf.ListValue), "Kotwica Kolobrzeg (Poland)"'}, {'field': 'data.values[14]', 'description': 'Invalid value at \'data.values[14]\' (type.googleapis.com/google.protobuf.ListValue), "/images/nba/4.2/profiles/photos/2006/player_photo.jpg"'}, {'field': 'data.values[15]', 'description': 'Invalid value at \'data.values[15]\'
(type.googleapis.com/google.protobuf.ListValue), "/images/basketball/5.0/team_logos/international/polish/trefl.png"'}]}]}
Any thoughts as to how I could push this output to my Google Sheet in one move, rather than inserting rows each time?

In your script, worksheet.insert_row(list(row.values())) is used inside a loop, so every row is written with a separate Sheets API call; I think this is the reason for your quota issue. (The 400 error from append_rows occurs because append_rows expects a list of rows, i.e. a 2-dimensional list, while list(row.values()) is a flat list of strings.) In this case, how about the following modification?
From:
for url in urls:
    data = get_links2(url)
    for row in data:
        worksheet.insert_row(list(row.values()))
To:
values = []
for url in urls:
    values = [*values, *get_links2(url)]
if values != []:
    header = list(values[0].keys())
    values = [header, *[[e[k] if e.get(k) else "" for k in header] for e in values]]
    worksheet.append_rows(values, value_input_option="USER_ENTERED")
With this modification, all values are collected first in the for url in urls: loop and are then put into the Spreadsheet with a single API call, which should remove your current issue.
If you don't want to include the header row, change [header, *[[e[k] if e.get(k) else "" for k in header] for e in values]] to [[e[k] if e.get(k) else "" for k in header] for e in values].
If you want to write the values after every URL instead, how about the following modification? In this case, however, one Sheets API call is used per URL.
From
for url in urls:
    data = get_links2(url)
    for row in data:
        worksheet.insert_row(list(row.values()))
To
header = None
for url in urls:
    values = get_links2(url)
    if values != []:
        if not header:
            header = list(values[0].keys())
        values = [[e[k] if e.get(k) else "" for k in header] for e in values]
        worksheet.append_rows(values, value_input_option="USER_ENTERED")
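As a side note on the 400 error above: insert_row takes a single flat list (one row), while append_rows takes a list of rows, i.e. a 2-dimensional list. A minimal sketch of the difference, using a hypothetical row dict like the ones built by get_links2:

row = {"Name": "Some Player", "URL": "/player/Some-Player/Summary/1", "pos_option1": "SF"}

# insert_row writes one row per API call (this is what exhausts the quota in a loop)
worksheet.insert_row(list(row.values()))

# append_rows expects a list of rows, so even a single row must be wrapped in another list
worksheet.append_rows([list(row.values())], value_input_option="USER_ENTERED")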

Related

How to extract information from atom feed based on condition?

I have the output of an API request given below.
From each atom:entry I need to extract
<c:series href="http://company.com/series/product/123"/>
<c:series-order>2020-09-17T00:00:00Z</c:series-order>
<f:assessment-low precision="0">980</f:assessment-low>
I tried to extract them into separate lists with BeautifulSoup, but that wasn't successful because some entries have dates but no price (I've shown an example below). How can I extract them conditionally, or at least put N/A for entries where the price is omitted?
soup = BeautifulSoup(request.text, "html.parser")
date = soup.find_all('c:series-order')
value = soup.find_all('f:assessment-low')
quot = soup.find_all('c:series')
p_day = []
p_val = []
q_val = []
for i in date:
    p_day.append(i.text)
for j in value:
    p_val.append(j.text)
for j in quot:
    q_val.append(j.get('href'))
d2 = {'date': p_day,
      'price': p_val,
      'quote': q_val
      }
and
<atom:feed xmlns:atom="http://www.w3.org/2005/Atom" xmlns:a="http://company.com/ns/assets" xmlns:c="http://company.com/ns/core" xmlns:f="http://company.com/ns/fields" xmlns:s="http://company.com/ns/search">
<atom:id>http://company.com/search</atom:id>
<atom:title> COMPANYSearch Results</atom:title>
<atom:updated>2022-11-24T19:36:19.104414Z</atom:updated>
<atom:author>COMPANY</atom:author>
<atom:generator> COMPANY/search Endpoint</atom:generator>
<atom:link href="/search" rel="self" type="application/atom"/>
<s:first-result>1</s:first-result>
<s:max-results>15500</s:max-results>
<s:selected-count>212</s:selected-count>
<s:returned-count>212</s:returned-count>
<s:query-time>PT0.036179S</s:query-time>
<s:request version="1.0">
<s:scope>
<s:series>http://company.com/series/product/123</s:series>
</s:scope>
<s:constraints>
<s:compare field="c:series-order" op="ge" value="2018-10-01"/>
<s:compare field="c:series-order" op="le" value="2022-11-18"/>
</s:constraints>
<s:options>
<s:first-result>1</s:first-result>
<s:max-results>15500</s:max-results>
<s:order-by key="commodity-name" direction="ascending" xml:lang="en"/>
<s:no-currency-rate-scheme>no-element</s:no-currency-rate-scheme>
<s:precision>embed</s:precision>
<s:include-last-commit-time>false</s:include-last-commit-time>
<s:include-result-types>live</s:include-result-types>
<s:relevance-score algorithm="score-logtfidf"/>
<s:lang-data-missing-scheme>show-available-language-content</s:lang-data-missing-scheme>
</s:options>
</s:request>
<s:facets/>
<atom:entry>
<atom:title>http://company.com/series-item/product/123-pricehistory-20200917000000</atom:title>
<atom:id>http://company.com/series-item/product/123-pricehistory-20200917000000</atom:id>
<atom:updated>2020-09-17T17:09:43.55243Z</atom:updated>
<atom:relevance-score>60800</atom:relevance-score>
<atom:content type="application/vnd.icis.iddn.entity+xml"><a:price-range>
<c:id>http://company.com/series-item/product/123-pricehistory-20200917000000</c:id>
<c:version>1</c:version>
<c:type>series-item</c:type>
<c:created-on>2020-09-17T17:09:43.55243Z</c:created-on>
<c:descriptor href="http://company.com/descriptor/price-range"/>
<c:domain href="http://company.com/domain/product"/>
<c:released-on>2020-09-17T21:30:00Z</c:released-on>
<c:series href="http://company.com/series/product/123"/>
<c:series-order>2020-09-17T00:00:00Z</c:series-order>
<f:assessment-low precision="0">980</f:assessment-low>
<f:assessment-high precision="0">1020</f:assessment-high>
<f:mid precision="1">1000</f:mid>
<f:assessment-low-delta>0</f:assessment-low-delta>
<f:assessment-high-delta>+20</f:assessment-high-delta>
<f:delta-type href="http://company.com/ref-data/delta-type/regular"/>
</a:price-range></atom:content>
</atom:entry>
<atom:entry>
<atom:title>http://company.com/series-item/product/123-pricehistory-20200910000000</atom:title>
<atom:id>http://company.com/series-item/product/123-pricehistory-20200910000000</atom:id>
<atom:updated>2020-09-10T18:57:55.128308Z</atom:updated>
<atom:relevance-score>60800</atom:relevance-score>
<atom:content type="application/vnd.icis.iddn.entity+xml"><a:price-range>
<c:id>http://company.com/series-item/product/123-pricehistory-20200910000000</c:id>
<c:version>1</c:version>
<c:type>series-item</c:type>
<c:created-on>2020-09-10T18:57:55.128308Z</c:created-on>
<c:descriptor href="http://company.com/descriptor/price-range"/>
<c:domain href="http://company.com/domain/product"/>
<c:released-on>2020-09-10T21:30:00Z</c:released-on>
<c:series href="http://company.com/series/product/123"/>
<c:series-order>2020-09-10T00:00:00Z</c:series-order>
for example here is no price
<f:delta-type href="http://company.com/ref-data/delta-type/regular"/>
</a:price-range></atom:content>
</atom:entry>
You may try to iterate per entry, use the xml parser to get a proper result, and check whether the element exists:
soup = BeautifulSoup(request.text, 'xml')
data = []
for i in soup.select('entry'):
    data.append({
        'date': i.find('series-order').text,
        'value': i.find('assessment-low').text if i.find('assessment-low') else None,
        'quot': i.find('series').get('href')
    })
data
or with html.parser:
soup = BeautifulSoup(xml, 'html.parser')
data = []
for i in soup.find_all('atom:entry'):
    data.append({
        'date': i.find('c:series-order').text,
        'value': i.find('f:assessment-low').text if i.find('f:assessment-low') else None,
        'quot': i.find('c:series').get('href')
    })
data
Output:
[{'date': '2020-09-17T00:00:00Z',
  'value': '980',
  'quot': 'http://company.com/series/product/123'},
 {'date': '2020-09-10T00:00:00Z',
  'value': None,
  'quot': 'http://company.com/series/product/123'}]
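If the goal is a table, the list of dicts above converts directly to a DataFrame. A small sketch, assuming pandas is installed and data comes from the snippet above:

import pandas as pd

df = pd.DataFrame(data)
# put 'N/A' where the price was omitted, as requested in the question
df['value'] = df['value'].fillna('N/A')
print(df)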
You can try this: split your request.text by <atom:entry>, deal with each section separately, and use enumerate to identify which section each value came from.
entries = request.text.split("<atom:entry>")

p_day = []
p_val = []
q_val = []

for i, entry in enumerate(entries):
    soup = BeautifulSoup(entry, "html.parser")
    date = soup.find_all('c:series-order')
    value = soup.find_all('f:assessment-low')
    quot = soup.find_all('c:series')
    for d in date:
        p_day.append([i, d.text])
    for v in value:
        p_val.append([i, v.text])
    for q in quot:
        q_val.append([i, q.get('href')])

d2 = {'date': p_day,
      'price': p_val,
      'quote': q_val
      }

print(d2)
OUTPUT:
{'date': [[1, '2020-09-17T00:00:00Z'], [2, '2020-09-10T00:00:00Z']],
 'price': [[1, '980']],
 'quote': [[1, 'http://company.com/series/product/123'],
           [2, 'http://company.com/series/product/123']]}

How to extract text and save as excel file using python or JavaScript

How do I extract text from this PDF file, where some data is in the form of a table while other data is key-value based?
eg:
https://drive.internxt.com/s/file/78f2d73478b832b2ab55/3edb275967deeca6ad33e7d53f2337c50d5dfb50e0aa525bb7f10d49dff1e2b4
This is what I have tried :
import PyPDF2
import openpyxl
from openpyxl import Workbook
pdfFileObj = open('sample.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pdfReader.numPages
pageObj = pdfReader.getPage(0)
mytext = pageObj.extractText()
wb = Workbook()
sheet = wb.active
sheet.title = 'MyPDF'
sheet['A1'] = mytext
wb.save('sample.xlsx')
print('Save')
However I'd like the data to be stored in the following format.
This pdf does not have well-defined tables, so we cannot use a tool to extract the entire data in one table format. What we can do instead is read the entire pdf as text and process each data field line by line, using regex to extract the data.
Before you move ahead, please install the pdfplumber package for python
pip install pdfplumber
Assumptions
Here are some assumptions that I made for your pdf and accordingly I have written the code.
First line will always contain the title Account History Report.
Second line will contain the names IMAGE All Notes
Third line will contain only the data Date Created in the form of key:value.
Fourth line will contain only the data Number of Pages in the form of key:value.
Fifth line will only contain the data Client Code, Client Name
Starting at line 6, a pdf can have multiple data entities; in this pdf there are 2, but there can be any number.
Each data entity will contain the following fields:
The first line in a data entity will contain only the data Our Ref, Name, Ref 1, Ref 2
The second line will only contain data in the form present in the pdf: Amount, Total Paid, Balance, Date of A/C, Date Received
The third line in a data entity will contain the data Last Paid, Amt Last Paid, Status, Collector.
The fourth line will contain the column names Date Notes
The subsequent lines will contain data in the form of a table until the next data entity starts.
I also assume that each data entity will start with the key Our Ref :.
I assume that the data entities are separated at the first line of each entity, which follows the key-value pattern Our Ref :Value Name: Value Ref 1 :Value Ref 2:value
pattern = r'Our Ref.*?Name.*?Ref 1.*?Ref 2.*?'
Please note that the rectangles I have drawn (thick black) in the image above are what I am calling data entities.
The final data will be stored in a dictionary (json), where each data entity has a key data_entity1, data_entity2, data_entity3, ... depending on the number of entities in your pdf.
The header details are stored in the json as key:value pairs, and I assume that each key appears in the header only once.
CODE
Here is the code, which gives you the information from the pdf as json. In the output, the first few fields contain information from the header part; the subsequent data entities can be found as data_entity1 and data_entity2.
In the below code all you need to change is pdf_path.
import pdfplumber
import re

# regex pattern for keys in line1 of data entity
my_regex_dict_line1 = {
    'Our Ref' : r'Our Ref :(.*?)Name',
    'Name' : r'Name:(.*?)Ref 1',
    'Ref 1' : r'Ref 1 :(.*?)Ref 2',
    'Ref 2' : r'Ref 2:(.*?)$'
}

# regex pattern for keys in line2 of data entity
my_regex_dict_line2 = {
    'Amount' : r'Amount:(.*?)Total Paid',
    'Total Paid' : r'Total Paid:(.*?)Balance',
    'Balance' : r'Balance:(.*?)Date of A/C',
    'Date of A/C' : r'Date of A/C:(.*?)Date Received',
    'Date Received' : r'Date Received:(.*?)$'
}

# regex pattern for keys in line3 of data entity
my_regex_dict_line3 = {
    'Last Paid' : r'Last Paid:(.*?)Amt Last Paid',
    'Amt Last Paid' : r'Amt Last Paid:(.*?)A/C\s+Status',
    'A/C Status': r'A/C\s+Status:(.*?)Collector',
    'Collector' : r'Collector :(.*?)$'
}

def preprocess_data(data):
    return [el.strip() for el in data.splitlines() if el.strip()]

def get_header_data(text, json_data = {}):
    header_data_list = preprocess_data(text)
    # third line in text of header contains Date Created field
    json_data['Date Created'] = re.search(r'Date Created:(.*?)$', header_data_list[2]).group(1).strip()
    # fourth line in text contains Number of Pages, Client Code, Client Name
    json_data['Number of Pages'] = re.search(r'Number of Pages:(.*?)$', header_data_list[3]).group(1).strip()
    # fifth line in text contains Client Code and ClientName
    json_data['Client Code'] = re.search(r'Client Code - (.*?)Client Name', header_data_list[4]).group(1).strip()
    json_data['ClientName'] = re.search(r'Client Name - (.*?)$', header_data_list[4]).group(1).strip()

def iterate_through_regex_and_populate_dictionaries(data_dict, regex_dict, text):
    ''' For the given pattern of regex_dict, this function iterates through each regex pattern and adds the key value to regex_dict dictionary '''
    for key, regex in regex_dict.items():
        matched_value = re.search(regex, text)
        if matched_value is not None:
            data_dict[key] = matched_value.group(1).strip()

def populate_date_notes(data_dict, text):
    ''' This function populates date and Notes in the data chunk in the form of list to data_dict dictionary '''
    data_dict['Date'] = []
    data_dict['Notes'] = []
    iter = 4
    while(iter < len(text)):
        date_match = re.search(r'(\d{2}/\d{2}/\d{4})', text[iter])
        data_dict['Date'].append(date_match.group(1).strip())
        notes_match = re.search(r'\d{2}/\d{2}/\d{4}\s*(.*?)$', text[iter])
        data_dict['Notes'].append(notes_match.group(1).strip())
        iter += 1

data_index = 1
json_data = {}
pdf_path = r'C:\Users\hpoddar\Desktop\Temp\sample3.pdf' # ENTER YOUR PDF PATH HERE
pdf_text = ''
data_entity_sep_pattern = r'(?=Our Ref.*?Name.*?Ref 1.*?Ref 2)'

if(__name__ == '__main__'):
    with pdfplumber.open(pdf_path) as pdf:
        index = 0
        while(index < len(pdf.pages)):
            page = pdf.pages[index]
            pdf_text += '\n' + page.extract_text()
            index += 1

    split_on_data_entity = re.split(data_entity_sep_pattern, pdf_text.strip())
    # first data in the split_on_data_entity list will contain the header information
    get_header_data(split_on_data_entity[0], json_data)
    while(data_index < len(split_on_data_entity)):
        data_entity = {}
        data_processed = preprocess_data(split_on_data_entity[data_index])
        iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line1, data_processed[0])
        iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line2, data_processed[1])
        iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line3, data_processed[2])
        if(len(data_processed) > 3 and data_processed[3] != None and 'Date' in data_processed[3] and 'Notes' in data_processed[3]):
            populate_date_notes(data_entity, data_processed)
        json_data['data_entity' + str(data_index)] = data_entity
        data_index += 1

    print(json_data)
Output :
Result string :
{'Date Created': '18/04/2022', 'Number of Pages': '4', 'Client Code': '110203', 'ClientName': 'AWS PTE. LTD.', 'data_entity1': {'Our Ref': '2118881115', 'Name': 'Sky Blue', 'Ref 1': '12-34-56789-2021/2', 'Ref 2': 'F2021004444', 'Amount': '$100.11', 'Total Paid': '$0.00', 'Balance': '$100.11', 'Date of A/C': '01/08/2021', 'Date Received': '10/12/2021', 'Last Paid': '', 'Amt Last Paid': '', 'A/C Status': 'CLOSED', 'Collector': 'Sunny Jane', 'Date': ['04/03/2022'], 'Notes': ['Letter Dated 04 Mar 2022.']}, 'data_entity2': {'Our Ref': '2112221119', 'Name': 'Green Field', 'Ref 1': '98-76-54321-2021/1', 'Ref 2': 'F2021001111', 'Amount': '$233.88', 'Total Paid': '$0.00', 'Balance': '$233.88', 'Date of A/C': '01/08/2021', 'Date Received': '10/12/2021', 'Last Paid': '', 'Amt Last Paid': '', 'A/C Status': 'CURRENT', 'Collector': 'Sam Jason', 'Date': ['11/03/2022', '11/03/2022', '08/03/2022', '08/03/2022', '21/02/2022', '18/02/2022', '18/02/2022'], 'Notes': ['Email for payment', 'Case Status', 'to send a Letter', '845***Ringing, No reply', 'Letter printed - LET: LETTER 2', 'Letter sent - LET: LETTER 2', '845***Line busy']}}
Now that you have the data in json format, you can load it into a csv file, a data frame, or whatever format you need.
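For example, a minimal sketch (assuming pandas is installed and json_data comes from the code above) that flattens each data entity into one row and writes a CSV:

import pandas as pd

rows = []
for key, entity in json_data.items():
    if key.startswith('data_entity'):
        # keep the entity name so each row can be traced back to its section in the pdf
        rows.append({'entity': key, **entity})

df = pd.DataFrame(rows)
df.to_csv('account_history.csv', index=False)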
Save as xlsx
To save the same data in an xlsx file in the format shown in the image in the question above, we can use xlsxwriter.
Please install the package using pip
pip install xlsxwriter
From the previous code we have the entire data in the variable json_data; we iterate through all the data entities and write each value to the appropriate cell, specified by row and col in the code.
import xlsxwriter

workbook = xlsxwriter.Workbook('Sample.xlsx')
worksheet = workbook.add_worksheet("Sheet 1")

row = 0
col = 0

# write columns
columns = ['Account History Report', 'All Notes'] + [ key for key in json_data.keys() if 'data_entity' not in key ] + list(json_data['data_entity1'].keys())
worksheet.write_row(row, col, tuple(columns))
row += 1

column_index_map = {}
for index, col in enumerate(columns):
    column_index_map[col] = index

# write the header
worksheet.write(row, column_index_map['Date Created'], json_data['Date Created'])
worksheet.write(row, column_index_map['Number of Pages'], json_data['Number of Pages'])
worksheet.write(row, column_index_map['Client Code'], json_data['Client Code'])
worksheet.write(row, column_index_map['ClientName'], json_data['ClientName'])

data_entity_index = 1

# iterate through each data entity and for each key insert the values in the sheet
while True:
    data_entity_key = 'data_entity' + str(data_entity_index)
    row_size = 1
    if(json_data.get(data_entity_key) != None):
        for key, value in json_data.get(data_entity_key).items():
            if(type(value) == list):
                worksheet.write_column(row, column_index_map[key], tuple(value))
                row_size = len(value)
            else:
                worksheet.write(row, column_index_map[key], value)
    else:
        break
    data_entity_index += 1
    row += row_size

workbook.close()
Result :
The above code creates a file sample.xlsx in the working directory.

New Pandas Series longer than original dataset?

So I have a data set with user, date, and post columns. I'm trying to generate a column of the calories that the foods in the post column contain, for each user. The dataset has a length of 21, and the code below finds the food words, gets their calorie values, appends them to that user's calorie list, and appends that list to the new column. The newly generated column, however, somehow has a length of 25:
Current data: 21
New column: 25
Does anybody know why this occurs? Here is the code below and samples of what the original dataset and the new column look like:
while len(col) < len(data['post']):
    for post, api_id, api_key in zip(data['post'], ids_keys.keys(), ids_keys.values()):  # cycles through text data & api keys
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'x-app-id': api_id,
            'x-app-key': api_key,
            'x-remote-user-id': '0'
        }
        calories = []
        print('Current data:', len(data['post']), '\n New column: ', len(col))  # prints length of post vs new cal column
        for word in eval(post):
            if word not in food:
                continue
            else:
                print('Detected Word: ', word)
                query = {'query': '{}'.format(word)}
                try:
                    response = requests.request("POST", url, headers=headers, data=query)
                except KeyError as ke:
                    print(ke, 'Out of calls, next key...')
                    ids_keys.pop(api_id)  # drop current api id & key from dict if out of calls
                    print('API keys left:', len(ids_keys))
                finally:
                    stats = response.json()
                    print('Food Stats: \n', stats)
                    print('Calories in food: ', stats['foods'][0]['nf_calories'])
                    calories.append(stats['foods'][0]['nf_calories'])
                    print('Current Key', api_id, ':', api_key)
        col.append(calories)
        if len(col) == len(data['post']):
            break
I attempted to use the while loop to only append up to the length of the dataset, but to no avail.
Original Data Set:
pd.DataFrame({'user': ['avskk', 'janejellyn', 'firlena227', '...'],
              'date': ['October 22', 'October 22', 'October 22', '...'],
              'post': [['autumn', 'fully', 'arrived', 'cooking', 'breakfast', 'toaster', '...'],
                       ['breakfast', 'chinese', 'sticky', 'rice', 'tempeh', 'sausage', 'cucumber', 'salad', 'lunch', 'going', 'lunch', 'coworkers', 'probably', 'black', 'bean', 'burger'],
                       ['potato', 'bean', 'inspiring', 'food', 'day', 'today', '...']]
              })
New Column:
pd.DataFrame({'Calories': [[22, 33, 45, 32, 2, 5, 7, 9, 76],
                           [43, 78, 54, 97, 32, 56, 97],
                           [23, 55, 32, 22, 7, 99, 66, 98, 54, 35, 33]]
              })

Python: KeyError when Calling Valid Key/Index in Dict

I have some JSON data that I'm pulling from a websocket:
while True:
    result = ws.recv()
    result = json.loads(result)
Here is print(result):
{'type': 'ticker', 'sequence': 4779671311, 'product_id': 'BTC-USD', 'price': '15988.29000000', 'open_24h': '14566.71000000', 'volume_24h': '18276.75612545', 'low_24h': '15988.29000000', 'high_24h': '16102.00000000', 'volume_30d': '1018642.48337033', 'best_bid': '15988.28', 'best_ask': '15988.29', 'side': 'buy', 'time': '2018-01-05T15:38:21.568000Z', 'trade_id': 32155934, 'last_size': '0.02420000'}
Now I want to access the 'price' value.
print (result['price'])
This results in a KeyError:
  File "C:/Users/Selzier/Documents/Python/temp.py", line 43, in <module>
    print (result['price'])
KeyError: 'price'
However, if I loop over the result data, I can successfully print both i and result[i]:
for i in result:
    if i == "price":
        print (i)
        print (result[i])
Which will print the following data:
price
16091.00000000
Why do I get a KeyError when calling
result['price']
and
result[0]
when I'm not inside the 'for i in result' loop?
Create a guard in the while True loop, like in the for loop:
while True:
    result = ws.recv()
    result = json.loads(result)
    if result and 'price' in result:
        print(result['price'])
    ...
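Feeds like this often deliver messages that are not ticker updates at all (for example a subscription confirmation right after connecting), and those have no 'price' key, which would explain the intermittent KeyError. A small sketch of the same guard written as a skip, assuming ws and json are set up as in the question:

while True:
    result = json.loads(ws.recv())
    # ignore any message that has no 'price' field instead of assuming
    # every message is a ticker update
    if not isinstance(result, dict) or 'price' not in result:
        continue
    print(result['price'])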

Missing one element of dictionary because of if statement

I use this code to store the items of dictionaries in the doc variable.
The code works fine, but I miss the first Time element because of the if statement.
def convert(old):
    time_key = 'Time '
    # Save the time
    time_item = (time_key, old[time_key])
    # And remove it
    del old[time_key]
    # Copy remaining items to new dicts and save them in a list
    return [dict([time_item, item]) for item in old.items()]
row = {
    'Time ': '2017-12-01T13:54:04',
    'Energy [kWh]': '0.01',
    'Voltage [V]': '221.64',
    'Current [A]': '0.08',
}

new_data = convert(row)
#print(new_data)
Zeitvalue = ""
Device = ""
Value = ""

for d in new_data:
    #print(d)
    for key, value in d.items():
        if key == 'Time ':
            Zeitvalue = value
            #print(value)
            continue
        else:
            Device = key
            Value = value
        doc = {'Time ': Zeitvalue, 'Device': Device, 'Measure': Value}
        print("This is doc variable:", doc)  # doc variable with missed time element
So when I print doc I get this
Output:
doc: {'Device': 'Voltage [V]', 'Measure': '221.64', 'Time ': ''}  # ISSUE: the Time value is missing here. How can I fix it?
doc: {'Device': 'Current [A]', 'Measure': '0.08', 'Time ': '2017-12-01T13:54:04'}
doc: {'Device': 'Energy [kWh]', 'Measure': '0.01', 'Time ': '2017-12-01T13:54:04'}
See the changes in the code below. Remove the continue statement, and assign doc after the inner loop over the dictionary has finished, since you need all three values.
for d in new_data:
    #print(d)
    for key, value in d.items():
        if key == 'Time ':
            Zeitvalue = value
            #print(value)
        else:
            Device = key
            Value = value
    doc = {'Time ': Zeitvalue, 'Device': Device, 'Measure': Value}
    print(doc)
If you are just setting values, place the doc assignment outside the inner for loop:
for d in new_data:
    for key, value in d.items():
        if key == 'Time ':
            Zeitvalue = value
            continue
        else:
            Device = key
            Value = value
    doc = {'Time ': Zeitvalue, 'Device': Device, 'Measure': Value}
You have a problem with this line:
doc = {'Time ':Zeitvalue,'Device':Device, 'Measure':Value}
When you use it inside the inner for loop, each iteration overrides the previous assignment. It also causes unexpected behaviour that depends on the order of the keys: if the 'Time ' key is encountered first it works fine, but if it is not, Zeitvalue is still "" (the value you initialised it to) when doc is built, and it is never updated afterwards.
Move the doc = {'Time ':Zeitvalue,'Device':Device, 'Measure':Value} assignment to the outer loop, not the one going over each key and value, and you will be fine.
