Sorting API response with python to excel or csv - python

I'm trying to sort out the UK Police free API response into a readable format (CSV or Excel).
I'm using the Requests library. My initial code gets the response as JSON:
import requests

r = requests.get('https://data.police.uk/api/crimes-street/all-crime?poly=51.169,-0.633:51.186,-0.5436:51.226,-0.6224&date=2019-12')
r_json = r.json()
for i in r_json:  # was `for i in j`, but `j` is undefined
    for key, value in i.items():
        print(key, ":", value)
The code above produces output like the following for each record:
category : anti-social-behaviour
location_type : Force
location : {'latitude': '51.196818', 'street': {'id': 1147343, 'name': 'On or near Parking Area'}, 'longitude': '-0.605146'}
context :
outcome_status : None
persistent_id :
id : 79955592
location_subtype :
month : 2019-12
How can I create a table with correct headers from the response I get? The headers would be 'category', 'latitude', 'street', 'name', 'longitude', 'month'.

You need to dig deeper into the dictionary tree to get some data, like latitude. The results are collected into a list of lists, then loaded into a data frame and saved as a CSV file.
import requests
import pandas as pd

r = requests.get('https://data.police.uk/api/crimes-street/all-crime?poly=51.169,-0.633:51.186,-0.5436:51.226,-0.6224&date=2019-12')
r_json = r.json()

# collect data into list of lists
collected_data = []
for data in r_json:
    category = data.get('category')
    month = data.get('month')
    latitude = ''
    longitude = ''
    street = ''
    for key, value in data.items():
        if key == 'location':
            latitude = value.get('latitude')
            longitude = value.get('longitude')
            street = value.get('street').get('name')
    collected_data.append([category, latitude, longitude, street, month])

# load data into data frame
df = pd.DataFrame(collected_data, columns=['Category', 'Latitude', 'Longitude', 'Street', 'Month'])

# save data frame into csv
df.to_csv('data.csv')
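
As an aside, pandas can flatten the nested location dict for you; a minimal sketch using pd.json_normalize (the dotted column names come from the API response shown above):

import pandas as pd

# json_normalize expands nested dicts into dotted column names,
# e.g. 'location.latitude' and 'location.street.name'
df = pd.json_normalize(r_json)
df = df[['category', 'location.latitude', 'location.longitude',
         'location.street.name', 'month']]
df.columns = ['Category', 'Latitude', 'Longitude', 'Street', 'Month']
df.to_csv('data.csv', index=False)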


Matching thousands of data takes too much time with Pandas

Every day I receive a report with some values, and I have to match postal codes from countries all over the world to get the right region. Then I upload the result to my Django app.
Here's a look at my report:
Order Number   Date         City     Postal code
930276         27/09/2022   Madrid   cp: 28033
929670         27/09/2022   Lisboa   cp: 1600-812
I have thousands of rows like this. The objective is to retrieve the region in ISO 3166-2 format. To help me, I accessed the following page Geonames and downloaded all the countries' information (example: "FR.txt", "ES.txt"...)
Because this is a huge txt file, I chose to store it on an S3 server.
Here is what I tried:
import io
import re

import boto3
import pandas as pd
from django.conf import settings  # the Scaleway credentials live in the Django settings

def access_scaleway(region_name, endpoint_url, access_key, secret_key):
    """ Accessing Scaleway Bucket """
    scaleway = boto3.client('s3', region_name=region_name, endpoint_url=endpoint_url,
                            aws_access_key_id=access_key,
                            aws_secret_access_key=secret_key)
    return scaleway

def get_region_code_accessing_scaleway(countries, regions):
    ''' Retrieves the region code from the region name. '''
    list_countries = countries
    list_regions = regions
    list_regions_codes = []
    scaleway_session = access_scaleway(region_name=settings.SCALEWAY_S3_REGION_NAME,
                                       endpoint_url=settings.SCALEWAY_S3_ENDPOINT_URL,
                                       access_key=settings.SCALEWAY_ACCESS_KEY_ID,
                                       secret_key=settings.SCALEWAY_SECRET_ACCESS_KEY)
    for country, region in zip(list_countries, list_regions):
        try:
            obj = scaleway_session.get_object(Bucket=settings.SCALEWAY_STORAGE_BUCKET_NAME, Key=f'countries/{country}.txt')
            df = pd.read_csv(io.BytesIO(obj['Body'].read()), sep='\t', header=None)
            df.columns = ['country code', 'postal code', 'place name', 'admin name1', 'admin code1', 'admin name2', 'admin code2', 'admin name3', 'admin code3', 'latitude', 'longitude', 'accuracy']
            df['postal code'] = df['postal code'].astype(str)
            df['postal code'] = df['postal code'].str.zfill(5)
            # Removing all spaces and special characters
            postal_code = re.sub("[^0-9^-]", '', region).strip()
            region_code = country + "-" + df[df['postal code'] == postal_code]['admin code1'].values[0]
            list_regions_codes.append(region_code)
        except AttributeError:
            list_regions_codes.append(None)
        except ValueError:
            list_regions_codes.append(None)
    return list_regions_codes
But it is way too long. For a simple report of 1000 rows, it takes like 30 min.
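
One plausible reason for the slowness: the country file is downloaded from S3 and re-parsed for every single row. A hedged sketch (my own naming, reusing the question's settings and scaleway_session) that caches each country's file so it is fetched and parsed only once per report:

import io
import pandas as pd

_country_cache = {}

def postal_to_region(scaleway_session, country, postal_code):
    # download and parse each Geonames country file once, not once per row
    if country not in _country_cache:
        obj = scaleway_session.get_object(
            Bucket=settings.SCALEWAY_STORAGE_BUCKET_NAME,
            Key=f'countries/{country}.txt')
        df = pd.read_csv(io.BytesIO(obj['Body'].read()), sep='\t', header=None, dtype=str)
        # in Geonames dumps, column 1 is the postal code and column 4 is admin code1
        _country_cache[country] = dict(zip(df[1].str.zfill(5), df[4]))
    admin_code = _country_cache[country].get(postal_code)  # postal_code must be zero-padded too
    return f'{country}-{admin_code}' if admin_code else None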
My second try was to go with the OpenDataSoft public API. Here is what I tried:
def fetch_data(url, params, headers=None):
    response = requests.get(url=url, params=params, headers=headers)
    return response

def get_region_code_accessing_scaleway(countries, regions):
    ''' Retrieves the region code from the region name. '''
    list_countries = countries
    list_regions = regions
    list_regions_codes = []
    for country, region in zip(list_countries, list_regions):
        try:
            # Get response from API
            postal_code = re.sub("[^0-9^-]", '', region).strip()
            response = fetch_data(
                url="https://data.opendatasoft.com/api/v2/catalog/datasets/geonames-postal-code%40public/records?",
                params="select=country_code%2C%20postal_code%2C%20admin_code1&where=country_code%3D%22" + country + "%22%20and%20postal_code%3D%22" + postal_code + "%22")
            if response.status_code == 200:
                data = response.json()
                if len(data['records']) > 0:
                    list_regions_codes.append(country + "-" + data['records'][0]['record']['fields']['admin_code1'])
                else:
                    list_regions_codes.append(None)
            else:
                print('Error:', response.status_code)
                list_regions_codes.append(None)
        except KeyError:
            # the except clause and return were cut off in the snippet; assumed to mirror the first version
            list_regions_codes.append(None)
    return list_regions_codes
But once again, it takes like forever to get matching values.
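
If you stay with the API route, one small hedged tweak: reuse a single requests.Session so every lookup shares a pooled connection instead of paying a new TLS handshake per row (a sketch, not a guaranteed fix):

import requests

session = requests.Session()  # one pooled connection for all lookups

def fetch_data(url, params, headers=None):
    return session.get(url=url, params=params, headers=headers)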
The last thing I tried was to go with pgeocode but it was also too long.
I don't understand why it is so long because the desired output is this one:
Order Number   Date         City     Postal code    Region code
930276         27/09/2022   Madrid   cp: 28033      ES-MD
929670         27/09/2022   Lisboa   cp: 1600-812   PT-08
Do you have any idea to speed up the process?

Getting the sum of a csv column without pandas in python

I have a csv file passed into a function as a string:
csv_input = """
quiz_date,location,size
2022-01-01,london_uk,134
2022-01-02,edingburgh_uk,65
2022-01-01,madrid_es,124
2022-01-02,london_uk,125
2022-01-01,edinburgh_uk,89
2022-01-02,madric_es,143
2022-01-02,london_uk,352
2022-01-01,edinburgh_uk,125
2022-01-01,madrid_es,431
2022-01-02,london_uk,151"""
I want to print the sum of how many people were surveyed in each city by date, so something like:
Date. City. Pop-Surveyed
2022-01-01. London. 134
2022-01-01. Edinburgh. 214
2022-01-01. Madrid. 555
2022-01-02. London. 628
2022-01-02. Edinburgh. 65
2022-01-02. Madrid. 143
As I can't import pandas on my machine (I can't install it without internet access), I thought I could use a defaultdict to store the value for each city by date:
from collections import defaultdict

survery_data = csv_input.split()[1:]
survery_data = [survey.split(',') for survey in survery_data]

survey_sum = defaultdict(dict)
for survey in survery_data:
    date = survey[0]
    city = survey[1].split("_")[0]
    quantity = survey[-1]
    survey_sum[date][city] += quantity

print(survey_sum)
But doing this returns a KeyError:
KeyError: 'london'
When I was hoping to have a defaultdict of
{'2022-01-01': {'london': 134}, {'edinburgh': 214}, {'madrid': 555}},
{'2022-01-02': {'london': 628}, {'edinburgh': 65}, {'madrid': 143}}
Is there a way to create a default dict that gives a structure so I could then iterate over to print out each column like above?
Try:
csv_input = """\
quiz_date,location,size
2022-01-01,london_uk,134
2022-01-02,edingburgh_uk,65
2022-01-01,madrid_es,124
2022-01-02,london_uk,125
2022-01-01,edinburgh_uk,89
2022-01-02,madric_es,143
2022-01-02,london_uk,352
2022-01-01,edinburgh_uk,125
2022-01-01,madrid_es,431
2022-01-02,london_uk,151"""

header, *rows = (
    tuple(map(str.strip, line.split(",")))
    for line in map(str.strip, csv_input.splitlines())
)

tmp = {}
for date, city, size in rows:
    key = (date, city.split("_")[0])
    tmp[key] = tmp.get(key, 0) + int(size)

out = {}
for (date, city), size in tmp.items():
    out.setdefault(date, []).append({city: size})

print(out)
Prints:
{
    "2022-01-01": [{"london": 134}, {"madrid": 555}, {"edinburgh": 214}],
    "2022-01-02": [{"edingburgh": 65}, {"london": 628}, {"madric": 143}],
}
Changing

survey_sum = defaultdict(dict)

to

survey_sum = defaultdict(lambda: defaultdict(int))

(and converting the quantity with survey_sum[date][city] += int(quantity), since the csv values are strings) allows the return of

defaultdict(<function survey_sum.<locals>.<lambda> at 0x100edd8b0>, {'2022-01-01': defaultdict(<class 'int'>, {'london': 134, 'madrid': 555, 'edinburgh': 214}), '2022-01-02': defaultdict(<class 'int'>, {'edingburgh': 65, 'london': 628, 'madrid': 143})})

which you can then iterate over to build the output table.
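
For instance, a minimal sketch of printing the desired table from that nested defaultdict (formatting mirrors the question's example):

print('Date. City. Pop-Surveyed')
for date in sorted(survey_sum):
    for city, total in survey_sum[date].items():
        print(f'{date}. {city.title()}. {total}')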

How to extract text and save as excel file using python or JavaScript

How do I extract text from this PDF file, where some of the data is in the form of tables while some is key-value based?
e.g.:
https://drive.internxt.com/s/file/78f2d73478b832b2ab55/3edb275967deeca6ad33e7d53f2337c50d5dfb50e0aa525bb7f10d49dff1e2b4
This is what I have tried:
import PyPDF2
import openpyxl
from openpyxl import Workbook
pdfFileObj = open('sample.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pdfReader.numPages
pageObj = pdfReader.getPage(0)
mytext = pageObj.extractText()
wb = Workbook()
sheet = wb.active
sheet.title = 'MyPDF'
sheet['A1'] = mytext
wb.save('sample.xlsx')
print('Save')
However I'd like the data to be stored in the following format.
This pdf does not have well-defined tables, so we cannot use any tool to extract the entire data in one table format. What we can do is read the entire pdf as text and process each data field line by line, using regex to extract the data.
Before you move ahead, please install the pdfplumber package for python
pip install pdfplumber
Assumptions
Here are some assumptions that I made about your pdf, and the code is written accordingly.
First line will always contain the title Account History Report.
Second line will contain the names IMAGE All Notes
Third line will contain only the data Date Created in the form of key:value.
Fourth line will contain only the data Number of Pages in the form of key:value.
Fifth line will only contain the data Client Code, Client Name
Starting line 6, a pdf can have multiple data entity, these data entity for eg in this pdf is 2 but can be any number of entity.
Each data entity will contain the following fields:
First line in data entity will contain only the data Our Ref, Name, Ref 1, Ref 2
The second line will only contain data in the form present in the pdf: Amount, Total Paid, Balance, Date of A/C, Date Received
Third line in data entity will contain the data Last Paid, Amt Last Paid, Status, Collector.
Fourth line will contain the column name Date Notes
The subsequent lines will contain data in the form of table until the next data entity is started.
I also assume that each data entity will contain the first data with key Our Ref :.
I assume that the data entity will be separated on the first line of each entity in the pattern of key values as Our Ref :Value Name: Value Ref 1 :Value Ref 2:value
pattern = r'Our Ref.*?Name.*?Ref 1.*?Ref 2.*?'
Please note that the rectangles I have drawn (thick black) in the above image are what I am calling data entities.
The final data will be stored in a dictionary(json) where the data entity will have key as dataentity1, dataentity2, dataentity3 based on the number of entities you have in your pdf.
The header details are stored in the json as key:value and I assume that each key will be present in header only once.
CODE
Here is the simple, elegant code that gives you the information from the pdf in the form of json. In the output, the first few fields contain information from the header part; subsequent data entities can be found as data_entity 1 and 2.
In the below code all you need to change is pdf_path.
import pdfplumber
import re

# regex pattern for keys in line1 of data entity
my_regex_dict_line1 = {
    'Our Ref' : r'Our Ref :(.*?)Name',
    'Name' : r'Name:(.*?)Ref 1',
    'Ref 1' : r'Ref 1 :(.*?)Ref 2',
    'Ref 2' : r'Ref 2:(.*?)$'
}

# regex pattern for keys in line2 of data entity
my_regex_dict_line2 = {
    'Amount' : r'Amount:(.*?)Total Paid',
    'Total Paid' : r'Total Paid:(.*?)Balance',
    'Balance' : r'Balance:(.*?)Date of A/C',
    'Date of A/C' : r'Date of A/C:(.*?)Date Received',
    'Date Received' : r'Date Received:(.*?)$'
}

# regex pattern for keys in line3 of data entity
my_regex_dict_line3 = {
    'Last Paid' : r'Last Paid:(.*?)Amt Last Paid',
    'Amt Last Paid' : r'Amt Last Paid:(.*?)A/C\s+Status',
    'A/C Status': r'A/C\s+Status:(.*?)Collector',
    'Collector' : r'Collector :(.*?)$'
}

def preprocess_data(data):
    return [el.strip() for el in data.splitlines() if el.strip()]

def get_header_data(text, json_data = {}):
    header_data_list = preprocess_data(text)
    # third line in text of header contains Date Created field
    json_data['Date Created'] = re.search(r'Date Created:(.*?)$', header_data_list[2]).group(1).strip()
    # fourth line in text contains Number of Pages
    json_data['Number of Pages'] = re.search(r'Number of Pages:(.*?)$', header_data_list[3]).group(1).strip()
    # fifth line in text contains Client Code and ClientName
    json_data['Client Code'] = re.search(r'Client Code - (.*?)Client Name', header_data_list[4]).group(1).strip()
    json_data['ClientName'] = re.search(r'Client Name - (.*?)$', header_data_list[4]).group(1).strip()

def iterate_through_regex_and_populate_dictionaries(data_dict, regex_dict, text):
    ''' For the given regex_dict, this function iterates through each regex pattern and adds the key and matched value to the data_dict dictionary '''
    for key, regex in regex_dict.items():
        matched_value = re.search(regex, text)
        if matched_value is not None:
            data_dict[key] = matched_value.group(1).strip()

def populate_date_notes(data_dict, text):
    ''' This function populates Date and Notes from the data chunk, as lists, into the data_dict dictionary '''
    data_dict['Date'] = []
    data_dict['Notes'] = []
    iter = 4
    while(iter < len(text)):
        date_match = re.search(r'(\d{2}/\d{2}/\d{4})', text[iter])
        data_dict['Date'].append(date_match.group(1).strip())
        notes_match = re.search(r'\d{2}/\d{2}/\d{4}\s*(.*?)$', text[iter])
        data_dict['Notes'].append(notes_match.group(1).strip())
        iter += 1

data_index = 1
json_data = {}
pdf_path = r'C:\Users\hpoddar\Desktop\Temp\sample3.pdf' # ENTER YOUR PDF PATH HERE
pdf_text = ''
data_entity_sep_pattern = r'(?=Our Ref.*?Name.*?Ref 1.*?Ref 2)'

if(__name__ == '__main__'):
    with pdfplumber.open(pdf_path) as pdf:
        index = 0
        while(index < len(pdf.pages)):
            page = pdf.pages[index]
            pdf_text += '\n' + page.extract_text()
            index += 1
    split_on_data_entity = re.split(data_entity_sep_pattern, pdf_text.strip())
    # first data in the split_on_data_entity list will contain the header information
    get_header_data(split_on_data_entity[0], json_data)
    while(data_index < len(split_on_data_entity)):
        data_entity = {}
        data_processed = preprocess_data(split_on_data_entity[data_index])
        iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line1, data_processed[0])
        iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line2, data_processed[1])
        iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line3, data_processed[2])
        if(len(data_processed) > 3 and data_processed[3] != None and 'Date' in data_processed[3] and 'Notes' in data_processed[3]):
            populate_date_notes(data_entity, data_processed)
        json_data['data_entity' + str(data_index)] = data_entity
        data_index += 1
    print(json_data)
Output :
Result string :
{'Date Created': '18/04/2022', 'Number of Pages': '4', 'Client Code': '110203', 'ClientName': 'AWS PTE. LTD.', 'data_entity1': {'Our Ref': '2118881115', 'Name': 'Sky Blue', 'Ref 1': '12-34-56789-2021/2', 'Ref 2': 'F2021004444', 'Amount': '$100.11', 'Total Paid': '$0.00', 'Balance': '$100.11', 'Date of A/C': '01/08/2021', 'Date Received': '10/12/2021', 'Last Paid': '', 'Amt Last Paid': '', 'A/C Status': 'CLOSED', 'Collector': 'Sunny Jane', 'Date': ['04/03/2022'], 'Notes': ['Letter Dated 04 Mar 2022.']}, 'data_entity2': {'Our Ref': '2112221119', 'Name': 'Green Field', 'Ref 1': '98-76-54321-2021/1', 'Ref 2': 'F2021001111', 'Amount': '$233.88', 'Total Paid': '$0.00', 'Balance': '$233.88', 'Date of A/C': '01/08/2021', 'Date Received': '10/12/2021', 'Last Paid': '', 'Amt Last Paid': '', 'A/C Status': 'CURRENT', 'Collector': 'Sam Jason', 'Date': ['11/03/2022', '11/03/2022', '08/03/2022', '08/03/2022', '21/02/2022', '18/02/2022', '18/02/2022'], 'Notes': ['Email for payment', 'Case Status', 'to send a Letter', '845***Ringing, No reply', 'Letter printed - LET: LETTER 2', 'Letter sent - LET: LETTER 2', '845***Line busy']}}
Now once you got the data in the json format, you can load it in a csv file, as a data frame or whatever format you need the data to be in.
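
For instance, a minimal sketch of the data-frame route (the one-row-per-entity layout is my assumption; Date and Notes stay as lists):

import pandas as pd

# one row per data entity; scalar fields become columns
entities = {k: v for k, v in json_data.items() if k.startswith('data_entity')}
df = pd.DataFrame.from_dict(entities, orient='index')
df.to_csv('entities.csv', index_label='entity')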
Save as xlsx
To save the same data in an xlsx file in the format shown in the image in the question above, we can use XlsxWriter.
Please install the package using pip
pip install xlsxwriter
From the previous code, we have our entire data in the variable json_data. We iterate through all the data entities and write the data to the appropriate cell, specified by row and col in the code.
import xlsxwriter

workbook = xlsxwriter.Workbook('Sample.xlsx')
worksheet = workbook.add_worksheet("Sheet 1")
row = 0
col = 0

# write columns
columns = ['Account History Report', 'All Notes'] + [key for key in json_data.keys() if 'data_entity' not in key] + list(json_data['data_entity1'].keys())
worksheet.write_row(row, col, tuple(columns))
row += 1

column_index_map = {}
for index, col in enumerate(columns):
    column_index_map[col] = index

# write the header
worksheet.write(row, column_index_map['Date Created'], json_data['Date Created'])
worksheet.write(row, column_index_map['Number of Pages'], json_data['Number of Pages'])
worksheet.write(row, column_index_map['Client Code'], json_data['Client Code'])
worksheet.write(row, column_index_map['ClientName'], json_data['ClientName'])

data_entity_index = 1
# iterate through each data entity and for each key insert the values in the sheet
while True:
    data_entity_key = 'data_entity' + str(data_entity_index)
    row_size = 1
    if(json_data.get(data_entity_key) != None):
        for key, value in json_data.get(data_entity_key).items():
            if(type(value) == list):
                worksheet.write_column(row, column_index_map[key], tuple(value))
                row_size = len(value)
            else:
                worksheet.write(row, column_index_map[key], value)
    else:
        break
    data_entity_index += 1
    row += row_size

workbook.close()
Result:
The above code creates a file Sample.xlsx in the working directory.

How to get street name from osm.pbf file in OpenStreetMap

You can download any dataset from here https://download.geofabrik.de/australia-oceania.html
Here's my code
import osmium as osm
import pandas as pd

class OSMHandler(osm.SimpleHandler):
    def __init__(self):
        osm.SimpleHandler.__init__(self)
        self.osm_data = []

    def tag_inventory(self, elem, elem_type):
        for tag in elem.tags:
            self.osm_data.append([elem_type,
                                  elem.id,
                                  elem.version,
                                  elem.visible,
                                  pd.Timestamp(elem.timestamp),
                                  elem.uid,
                                  elem.user,
                                  elem.changeset,
                                  len(elem.tags),
                                  tag.k,
                                  tag.v])

    def node(self, n):
        self.tag_inventory(n, "node")

    def way(self, w):
        self.tag_inventory(w, "way")

    def relation(self, r):
        self.tag_inventory(r, "relation")

osmhandler = OSMHandler()
# scan the input file and fill the handler list accordingly
osmhandler.apply_file("/DATA/user/nabih/pitcairn-islands-latest.osm.pbf")

# transform the list into a pandas DataFrame
data_colnames = ['type', 'id', 'version', 'visible', 'ts', 'uid',
                 'user', 'chgset', 'ntags', 'tagkey', 'tagvalue']
df_osm = pd.DataFrame(osmhandler.osm_data, columns=data_colnames)
Here's the df_osm
Street names are values of the name key of highway elements (see https://wiki.openstreetmap.org/wiki/Map_features#Highway for all possible highway types; you may want to filter them further in the query). You can then self-join all highway rows with their name rows on id:
df_osm.loc[df_osm.tagkey=='highway', ['id', 'tagvalue']].merge(
    df_osm.loc[df_osm.tagkey=='name', ['id', 'tagvalue']],
    on='id', suffixes=['_kind', '_name'])
Result for pitcairn-islands-latest.osm.pbf:
   id          tagvalue_kind  tagvalue_name
0  1034153953  residential    Main Road
1  1034161481  residential    Hill of Difficulty Road
If you want to also include national names you can replace df_osm.tagkey=='name' with df_osm.tagkey.str.startswith('name'). See https://wiki.openstreetmap.org/wiki/Key:name for details and other possible names.
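
A hedged sketch of that broader join (same df_osm as above; keeping tagkey shows which name variant matched):

# join highway rows with any name* tag (name, name:en, ...) on the element id
names = df_osm.loc[df_osm.tagkey.str.startswith('name'), ['id', 'tagkey', 'tagvalue']]
df_osm.loc[df_osm.tagkey == 'highway', ['id', 'tagvalue']].merge(
    names, on='id', suffixes=['_kind', '_name'])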

Check whether a built-up name exists in a list of names - python

I'm trying to build a csv file that has all the Active Directory fields I need, from two external files.
The first file has a list of users that need to be created, plus some other info relevant to an AD object. The second is a report of SamAccountNames and emails dumped from AD. What I want to do is create a unique SamAccountName: I form a test SamAccountName from the first name and last name in the first report, and I want to compare it against the second report. I'm currently storing all the data I get from the second report in a list, and I want to check whether my generated SamAccountName exists in that list.
So far I'm not able to do so; I only get a csv with the SamAccountNames I made up (it does not do the check).
Note: I can't use any plugin to check directly against Active Directory.
import csv

def getSamA(fname, lname):
    Sams = []
    sama = lname[0:5] + fname[0:2]
    with open('test-input.txt', 'r') as AD:
        rows = csv.DictReader(AD)
        for ad in rows:
            Sams.append(ad['SamAccountName'])
    # check if built sama is in list
    if sama in Sams:
        # if sama in list, add one more character to sama
        sama = lname[0:5] + fname[0:3]
        return sama.lower()
    else:
        return sama.lower()
with open('users.csv') as csv_file:
    rows = csv.DictReader(csv_file)
    with open('users-COI2-Multi.csv', 'w', newline='') as output:
        header = ['FirstName','Initials','LastName','DisplayName','Description','Office','TelePhone','UserLogonName','SamAccountName','JobTitle','Department','Manager','Mobile','faxNumber','Notes','Assistant','employeeID','ex1','ex2','ex3','ex15','Office365License','ExpiresInDays','EmailToUSer','AddToGroup']
        output_file = csv.DictWriter(output, fieldnames=header, delimiter=';')
        output_file.writeheader()
        for data in rows:
            employeeId = data['Associate ID']
            fName = data['First Name']
            lName = data['Last Name']
            Location = data['Location']
            Department = data['Department']
            Manager = data['Manager Name']
            JobTitle = data['Title']
            context = {
                'FirstName' : fName,
                'Initials' : getInitials(fName, lName),
                'LastName' : lName,
                'DisplayName' : getDisplayName(fName, lName),
                'Description' : 'Account for: ' + getDisplayName(fName, lName),
                'Office': getOffice(Location).strip(),
                'TelePhone' : '+1 XXX XXX XXXX',
                'UserLogonName' : getMail(fName, lName),
                'SamAccountName' : getSamA(fName, lName),
                'JobTitle' : JobTitle,
                'Department' : Department,
                'Manager' : Manager,
                'Mobile' : '',
                'faxNumber' : '',
                'Notes' : '',
                'Assistant' : '',
                'employeeID' : employeeId,
                'ex1' : 'End User',
                'ex2' : 'NoMailbox',
                'ex3' : getSiteCode(Location),
                'ex15' : getSKID(Location),
                'Office365License' : '',
                'ExpiresInDays' : '',
                'EmailToUSer' : 'user@test.com',
                'AddToGroup' : '',
            }
            output_file.writerow(context)
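
As a hedged sketch of the missing uniqueness check (file and field names follow the question's code): read the AD dump into a set once, then grow the candidate until it is unused.

import csv

# load existing SamAccountNames once, lower-cased for comparison
with open('test-input.txt', 'r') as ad_file:
    existing = {row['SamAccountName'].lower() for row in csv.DictReader(ad_file)}

def get_unique_sama(fname, lname):
    length = 2
    sama = (lname[0:5] + fname[0:length]).lower()
    # add one more character from the first name until the candidate is unused
    while sama in existing and length < len(fname):
        length += 1
        sama = (lname[0:5] + fname[0:length]).lower()
    existing.add(sama)  # reserve it so later rows in this run don't collide
    return sama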
