Convert a dictionary into an Excel file using the Python openpyxl library

Below is my dictionary, and I want to write its key-value pairs into an Excel sheet in two columns named Key and Hourly.
one_year_reserved = {
'Australia Central 2': 0.0097,
'East US 2': 0.00605,
'North Central US': 0.00605,
'South Africa West': 0.01016,
'UK West': 0.00685,
'France South': 0.01119,
'Korea': 0.00639,
'Canada East': 0.00685,
'US Gov Virginia': 0.00879,
'East Asia': 0.0097,
'South India': 0.01005,
'South Central US': 0.00731,
'West US': 0.00719,
'Australia East': 0.00776,
'Canada Central': 0.00674,
'Australia Southeast': 0.00776,
'Southeast Asia': 0.00776,
'Central US': 0.00731,
'West India': 0.00833,
'East US': 0.00605,
'Australia Central': 0.0097,
'UK South': 0.00685,
'Japan East': 0.00799,
'Japan West': 0.00879,
'West Europe': 0.00696,
'Brazil South': 0.00982,
'Korea Central': 0.00799,
'US Gov Texas': 0.00879,
'US Gov Arizona': 0.00879,
'Central India': 0.00833,
'North Europe': 0.00822,
'West Central US': 0.00731,
'France Central': 0.00856,
'South Africa North': 0.00811,
'West US 2': 0.00605
}
How can I convert this dictionary into an Excel file using the openpyxl library? The output should look like this:
**Key**               **Hourly**
Australia Central 2   0.0097
East US 2             0.00605
North Central US      0.00605

Here is a solution that works with Python 3.6+ because it uses f-strings. enumerate is used so that we don't have to track the row number in a separate variable.
from openpyxl import Workbook
data = {
'australia-central': 0.0097,
'usgov-virginia': 0.00879,
}
workbook = Workbook()
sheet = workbook.active
# header row
sheet["A1"] = "Key"
sheet["B1"] = "Hourly"
# start=2 so the data rows begin just below the header row
for row, (key, hourly) in enumerate(data.items(), start=2):
    sheet[f"A{row}"] = key
    sheet[f"B{row}"] = hourly
workbook.save("output.xlsx")
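If you prefer not to track row indices at all, openpyxl's Worksheet.append writes one row per call; here is a shorter equivalent sketch using the same data dict (the output file name is just an example):
from openpyxl import Workbook
data = {
    'australia-central': 0.0097,
    'usgov-virginia': 0.00879,
}
workbook = Workbook()
sheet = workbook.active
sheet.append(["Key", "Hourly"])  # header row
for key, hourly in data.items():
    sheet.append([key, hourly])  # each append fills the next row
workbook.save("output_append.xlsx")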

If a plain CSV file is acceptable instead of a real Excel workbook, the built-in csv module works too:
import csv
one_year_reserved = {
'australia-central': 0.0097,
'usgov-virginia': 0.00879,
'us-south-central': 0.00731,
'france-south': 0.01119,
'us-west': 0.00719,
'europe-north': 0.00822,
'asia-pacific-east': 0.0097,
'japan-east': 0.00799,
'west-india': 0.00833,
'united-kingdom-west': 0.00685,
'usgov-arizona': 0.00879,
'brazil-south': 0.00982,
'australia-east': 0.00776,
'us-west-2': 0.00605,
'asia-pacific-southeast': 0.00776,
'south-india': 0.01005,
'us-central': 0.00731,
'us-east-2': 0.00605,
'south-africa-west': 0.01016,
'canada-central': 0.00674,
'south-africa-north': 0.00811,
'canada-east': 0.00685,
'us-east': 0.00605,
'korea-south': 0.00639,
'united-kingdom-south': 0.00685,
'europe-west': 0.00696,
'japan-west': 0.00879,
'australia-southeast': 0.00776,
'us-west-central': 0.00731,
'us-north-central': 0.00605,
'central-india': 0.00833,
'korea-central': 0.00799,
'usgov-texas': 0.00879,
'france-central': 0.00856,
'australia-central-2': 0.0097
}
# Python 3 needs text mode with newline=''; 'wb' only worked on Python 2
with open('output2.csv', 'w', newline='') as output:
    writer = csv.writer(output)
    writer.writerow(['Key', 'Hourly'])  # header row
    for key, value in one_year_reserved.items():
        writer.writerow([key, value])
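Another option, if pandas happens to be available, is to build a DataFrame and let to_excel produce the workbook (pandas uses openpyxl as its .xlsx engine; the file name is just an example):
import pandas as pd
one_year_reserved = {
    'australia-central': 0.0097,
    'usgov-virginia': 0.00879,
}
# list(...) of items() yields (key, value) tuples, one per output row
df = pd.DataFrame(list(one_year_reserved.items()), columns=['Key', 'Hourly'])
df.to_excel('output_pandas.xlsx', index=False)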

Try this code. It works fine!
# Writing to an Excel sheet using Python
from xlwt import Workbook
# Workbook is created
wb = Workbook()
#dictionary
sample_data= {
'Australia Central 2': 0.0097,
'East US 2': 0.00605,
'North Central US': 0.00605,
'South Africa West': 0.01016,
'UK West': 0.00685,
'France South': 0.01119,
'Korea': 0.00639,
'Canada East': 0.00685,
'US Gov Virginia': 0.00879,
'East Asia': 0.0097,
'South India': 0.01005,
'South Central US': 0.00731,
'West US': 0.00719,
'Australia East': 0.00776,
'Canada Central': 0.00674,
'Australia Southeast': 0.00776,
'Southeast Asia': 0.00776,
'Central US': 0.00731,
'West India': 0.00833,
'East US': 0.00605,
'Australia Central': 0.0097,
'UK South': 0.00685,
'Japan East': 0.00799,
'Japan West': 0.00879,
'West Europe': 0.00696,
'Brazil South': 0.00982,
'Korea Central': 0.00799,
'US Gov Texas': 0.00879,
'US Gov Arizona': 0.00879,
'Central India': 0.00833,
'North Europe': 0.00822,
'West Central US': 0.00731,
'France Central': 0.00856,
'South Africa North': 0.00811,
'West US 2': 0.00605
}
# add_sheet is used to create a sheet
sheet1 = wb.add_sheet('Sheet 1')
# general syntax: sheet1.write(row, column, value)
sheet1.write(0, 0, 'Key')
sheet1.write(0, 1, 'Hourly')
row = 1
# iterate over each key-value pair of the dictionary and insert it into the sheet
for k, v in sample_data.items():
    sheet1.write(row, 0, k)
    sheet1.write(row, 1, v)
    row = row + 1
wb.save('xlwt example.xls')
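To sanity-check the layout, the .xls file can be read back with xlrd, assuming it is installed (xlrd supports the legacy .xls format):
import xlrd
wb_check = xlrd.open_workbook('xlwt example.xls')
sheet_check = wb_check.sheet_by_index(0)
print(sheet_check.cell_value(0, 0), sheet_check.cell_value(0, 1))  # Key Hourly
print(sheet_check.cell_value(1, 0), sheet_check.cell_value(1, 1))  # first data row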

Related

Unable to use pycountry search_fuzzy to look up the alpha_3 based on given country names, any better ideas?

I have the following 122 country names for which I couldn't look up the corresponding alpha_3 code. I tried search_fuzzy but nothing is found.
By looking at some of the names, I can "manually" assign the alpha_3 based on common knowledge (such as creating a dict for renaming). However, I wonder if there is a better way to look up the alpha_3 in an automated way, for example by using another function from pycountry or even re?
Any suggestions and advice are greatly appreciated.
import pandas as pd
import numpy as np
import regex as re
import pycountry
missing = ['Americas', 'Asia', 'Australia and New Zealand', 'Bolivia (Plurinational State of)', 'Caribbean', 'Central America', 'Central and Southern Asia', 'Central Asia', 'China, Hong Kong Special Administrative Region', 'China, Macao Special Administrative Region', 'Democratic Republic of the Congo', 'Eastern Africa', 'Eastern and South-Eastern Asia', 'Eastern Asia', 'Eastern Europe', 'Europe', 'Europe and Northern America', 'Iran (Islamic Republic of)', 'Landlocked developing countries (LLDCs)', 'Latin America and the Caribbean', 'Least Developed Countries (LDCs)', 'Melanesia', 'Micronesia (Federated States of)', 'Middle Africa', 'Northern Africa', 'Northern Africa and Western Asia', 'Northern America', 'Northern Europe', 'Oceania', 'Oceania (exc. Australia and New Zealand)', 'Small island developing States (SIDS)', 'South America', 'South-Eastern Asia', 'Southern Africa', 'Southern Asia', 'Southern Europe', 'Sub-Saharan Africa', 'Türkiye', 'Venezuela (Bolivarian Republic of)', 'Western Africa', 'Western Asia', 'Western Europe', 'World', 'European Union (27)', 'Chinese Taipei', 'UAE', 'Belgium-Luxembourg', 'Channel Islands', 'China, Hong Kong SAR', 'China, Macao SAR', 'China, mainland', 'China, Taiwan Province of', 'Czechoslovakia', 'Ethiopia PDR', 'French Guyana', 'Netherlands Antilles (former)', 'Pacific Islands Trust Territory', 'Serbia and Montenegro', 'Sudan (former)', 'Svalbard and Jan Mayen Islands', 'United States Virgin Islands', 'USSR', 'Wallis and Futuna Islands', 'Yugoslav SFR', 'Global average', 'Cocos Islands', 'Macquarie Island', 'Northern Mariana Islands and Guam', 'Comoro Islands', 'Glorioso Islands', 'Juan de Nova Island', 'Bassas da India', 'Ile Europa', 'Ile Tromelin', 'Azores', 'Cape Verde', 'Canary Islands', 'Prince Edward Islands', 'Crozet Islands', 'Amsterdam Island and Saint Paul Island', 'Kerguelen Islands', 'Heard and McDonald Islands', 'Republique du Congo', 'Clipperton Island', 'Puerto Rico and Virgin Islands of the United States', 'Guadeloupe and Martinique', 'Faeroe Islands', 'Line Islands (Kiribati)', 'Phoenix Islands (Kiribati)', 'Howland Island and Baker Island', 'Guinea Bissau', 'Ivory Coast', 'Gilbert Islands (Kiribati)', 'Northern Saint-Martin', 'East Timor', 'Oecussi Ambeno', 'Laos', 'Republic of Congo', 'Dem. Rep. Congo', 'ASEAN', 'BRIICS', 'DRC', 'EA19', 'EECCA', 'EU27_2020', 'European Union', 'G20', 'G7M', 'Lao PDR', 'OECD', 'OECDAM', 'OECDAO', 'OECDE', 'Grenade', 'Korea, Rep.', 'Egypt, Arab Rep.', 'Iran, Islamic Rep.', 'Korea (Rep.)', 'Hong Kong, China', 'Iran (Islamic Republic)', 'Cote dIvoire', 'Congo (Democratic Republic)']
not_found = []
for country in missing:
    try:
        print(pycountry.countries.search_fuzzy(country))
        print(country)
    except LookupError:
        print('not found')
        not_found.append(country)
print(len(missing))    # 122
print(len(not_found))  # 122
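One possible automated approach, sketched under the assumption that most failures come from parenthetical qualifiers and comma suffixes (lookup_alpha3 is a hypothetical helper name; pure regions such as 'Eastern Africa' have no alpha_3 and will still come back empty):
import re
import pycountry
def lookup_alpha3(name):
    """Try search_fuzzy on progressively cleaned variants of the name."""
    candidates = [
        name,
        re.sub(r"\s*\(.*\)", "", name),  # drop parentheticals: 'Iran (Islamic Republic of)' -> 'Iran'
        name.split(",")[0],              # drop comma suffixes: 'Korea, Rep.' -> 'Korea'
    ]
    for candidate in candidates:
        try:
            return pycountry.countries.search_fuzzy(candidate)[0].alpha_3
        except LookupError:
            continue
    return None  # aggregates and regions have no alpha_3 at all
print(lookup_alpha3('Iran (Islamic Republic of)'))  # expected 'IRN'
print(lookup_alpha3('Eastern Africa'))              # expected None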

Iterating through multiple pages when web scraping pagination with python

I'm trying to extract specific data from a webpage that typically contains multiple pages. Although I was able to print all of the information I needed on the first page, I couldn't do the same for other pages. I searched the internet for solutions and discovered that the majority of them looped through each page by concatenating a link page with a number.
However, I'm working on a website where the link page does not change when you navigate to different pages. Therefore, it's difficult for me to figure out which attribute causes the URL to redirect to the second page as there are no clickable links displayed.
When I inspect what appears to be the next-page button, I get the following:
<div class="pagination__PageNavItem-s1515b5x-2 clogRN"><span class="pagination__PageNavigation-s1515b5x-3 cKpakR">→</span></div>
I was able to get the information I needed for the first page here:
import requests
from bs4 import BeautifulSoup
url = 'https://www.flightstats.com/v2/flight-tracker/arrivals/LHR/?year=2021&month=7&date=3&hour=12?page=12323213'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.content, 'html.parser')
airline_text = soup.find_all('div', {"class": "table__Cell-s1x7nv9w-13 iZEpOT"})
for n, i in enumerate(airline_text, start=1):
    print(n, '->', i.get_text())
Is there a way to iterate through the remaining pages?
There's a script tag which contains the desired information. I've parsed it with a regex; there's a property called name that contains the airline name.
import requests
import re
from pprint import pp
def main(url):
    params = {
        "year": "2021",
        "month": "7",
        "date": "3",
        "hour": "12"
    }
    r = requests.get(url, params=params)
    match = re.findall(r'"name":"(.*?)"', r.text)
    pp(match)
main('https://www.flightstats.com/v2/flight-tracker/arrivals/LHR/')
Output:
['London Heathrow Airport',
'Qatar Airways',
'British Airways',
'American Airlines',
'Aer Lingus',
'Qatar Airways',
'British Airways',
'American Airlines',
'JAL',
'British Airways',
'British Airways',
'American Airlines',
'Emirates',
'Qantas',
'British Airways',
'Iberia',
'British Airways',
'American Airlines',
'Iberia',
'Qatar Airways',
'Royal Jordanian',
'Finnair',
'Qatar Airways',
'British Airways',
'Qatar Airways',
'Iberia',
'American Airlines',
'British Airways',
'SWISS',
'Air Canada',
'United Airlines',
'British Airways',
'ANA',
'Aegean Airlines',
'United Airlines',
'American Airlines',
'Finnair',
'Iberia',
'Qatar Airways',
'United Airlines',
'British Airways',
'Lufthansa',
'Aer Lingus',
'Air Canada',
'British Airways',
'Etihad Airways',
'British Airways',
'Qatar Airways',
'American Airlines',
'Iberia',
'Qatar Airways',
'Gulf Air',
'Fiji Airways',
'British Airways',
'Finnair',
'Alaska Airlines',
'Royal Jordanian',
'EL AL',
'Royal Jordanian',
'British Airways',
'American Airlines',
'Iberia',
'Qatar Airways',
'American Airlines',
'Xiamen Airlines',
'Iberia',
'British Airways',
'Qatar Airways',
'British Airways',
'American Airlines',
'Iberia',
'JAL',
'JAL',
'American Airlines',
'British Airways',
'British Airways',
'United Airlines',
'ANA',
'Iberia',
'Malaysia Airlines',
'Qatar Airways',
'Royal Jordanian',
'American Airlines',
'Finnair',
'SWISS',
'British Airways',
'American Airlines',
'Finnair',
'Aer Lingus',
'Iberia',
'Kuwait Airways',
'Xiamen Airlines',
'Garuda Indonesia',
'American Airlines',
'British Airways',
'Malaysia Airlines',
'China Airlines',
'KLM',
'Gol',
'Virgin Atlantic',
'Delta Air Lines',
'American Airlines',
'Cathay Pacific',
'British Airways',
'British Airways',
'JAL',
'Qatar Airways',
'Finnair',
'Pakistan International Airlines',
'United Airlines',
'Air Canada',
'EgyptAir',
'TAP Air Portugal',
'British Airways',
'TAROM',
'British Airways',
'American Airlines',
'Qatar Airways',
'Delta Air Lines',
'Iberia',
'Air France',
'British Airways',
'Aeromexico',
'KLM',
'Virgin Atlantic',
'Singapore Airlines',
'British Airways',
'JAL',
'American Airlines',
'Aer Lingus',
'British Airways',
'British Airways',
'British Airways',
'British Airways',
'British Airways',
'American Airlines',
'British Airways',
'Lufthansa',
'American Airlines',
'United Airlines',
'Croatia Airlines',
'Malaysia Airlines',
'JAL',
'Iberia',
'Finnair',
'Aegean Airlines',
'Cathay Pacific',
'British Airways',
'British Airways',
'American Airlines',
'Finnair',
'British Airways',
'Malaysia Airlines',
'American Airlines',
'Cathay Pacific',
'Emirates',
'Saudia',
'American Airlines',
'Cathay Pacific',
'LATAM Airlines',
'British Airways',
'British Airways',
'Qatar Airways',
'Cathay Pacific',
'Iberia',
'Gulf Air',
'British Airways',
'Finnair',
'Qatar Airways',
'Royal Jordanian',
'Royal Jordanian',
'American Airlines',
'British Airways',
'American Airlines',
'Malaysia Airlines',
'British Airways',
'Iberia',
'American Airlines',
'Singapore Airlines',
'American Airlines',
'British Airways',
'TAP Air Portugal',
'Aegean Airlines',
'British Airways',
'Iberia',
'Azores Airlines',
'TAP Air Portugal',
'TAP Air Portugal',
'Singapore Airlines',
'Air New Zealand',
'Air Canada',
'Virgin Atlantic',
'SAS',
'British Airways',
'British Airways',
'JAL',
'Croatia Airlines',
'Royal Air Maroc',
'Finnair',
'British Airways',
'LATAM Airlines',
'Malaysia Airlines',
'British Airways',
'American Airlines',
'Finnair',
'Aer Lingus',
'Iberia',
'Iberia',
'Qatar Airways',
'Aer Lingus',
'Air Canada',
'British Airways',
'United Airlines',
'Aeroflot',
'AZAL Azerbaijan Airlines',
'Etihad Airways',
'Iberia',
'Turkish Airlines',
'British Airways',
'American Airlines',
'Qantas',
'JAL',
'American Airlines',
'British Airways',
'Delta Air Lines',
'Alitalia',
'British Airways',
'LATAM Airlines',
'KLM',
'Garuda Indonesia',
'Virgin Atlantic',
'Qatar Airways',
'Qantas',
'Malaysia Airlines',
'Gol',
'JAL',
'Iberia',
'Aer Lingus',
'China Southern Airlines',
'Xiamen Airlines',
'British Airways',
'Delta Air Lines',
'Alitalia',
'Kenya Airways',
'Delta Air Lines',
'Virgin Atlantic',
'American Airlines',
'British Airways',
'Biman Bangladesh Airlines',
'ANA',
'Kenya Airways',
'Air France',
'Aeromexico',
'Gol',
'Virgin Atlantic',
'British Airways',
'Qatar Airways',
'British Airways',
'Iberia',
'British Airways',
'Royal Air Maroc',
'British Airways',
'Iberia',
'Qatar Airways',
'American Airlines',
'SriLankan Airlines',
'JAL',
'British Airways',
'American Airlines',
'Finnair',
'Iberia',
'British Airways',
'JAL',
'LATAM Airlines',
'British Airways',
'American Airlines',
'Qatar Airways',
'British Airways',
'British Airways',
'JAL',
'British Airways',
'JAL',
'American Airlines',
'SWISS',
'Etihad Airways',
'British Airways',
'British Airways',
'British Airways',
'Aer Lingus',
'Saudia',
'Ethiopian Airlines',
'TAP Air Portugal',
'Singapore Airlines',
'United Airlines',
'Azores Airlines',
'ANA',
'EgyptAir',
'EL AL',
'Etihad Airways',
'Korean Air',
'Royal Air Maroc',
'London Heathrow Airport']
The data is stored inside the page in a <script> tag. You can use the next example to extract it:
import re
import json
import requests
url = "https://www.flightstats.com/v2/flight-tracker/arrivals/LHR/?year=2021&month=7&date=3&hour=12?page=12323213"
html_page = requests.get(url).text
data = re.search(r"__NEXT_DATA__ = (.*)", html_page).group(1)
data = json.loads(data)
# uncomment this to print all data:
# print(json.dumps(data, indent=4))
for f in data["props"]["initialState"]["flightTracker"]["route"]["flights"]:
    print(
        "{:<8} {:<8} {:<3} {:<5}".format(
            f["departureTime"]["time24"],
            f["arrivalTime"]["time24"],
            f["carrier"]["fs"],
            f["carrier"]["flightNumber"],
        )
    )
Prints:
07:10 12:10 QR 8866
10:45 12:15 BA 827
10:45 12:15 AA 6472
10:45 12:15 EI 8327
10:45 12:15 QR 5952
11:00 12:20 BA 579
11:00 12:20 AA 6838
11:00 12:20 JL 7156
...

How to retrieve a location address using geopy?

I have the following data frame and used code from here:
import pandas as pd
from geopy.geocoders import Nominatim
data = {'lat1': [116.51172,116.51135,116.51135,116.51627,116.47186],
'lon1': [39.92123,39.93883,39.93883,39.91034,39.91248]}
# Create DataFrame
df_test = pd.DataFrame(data)
geolocator = Nominatim(user_agent="geoapiExercises")
location = geolocator.reverse(df_test['lat']+","+ df_test['lon'], language='en')
address = location.raw['address']
df_test['suburb']= address.get('suburb', '')
df_test['postcode']= address.get('postcode', '')
df_test['road']= address.get('road', '')
I want to get three features from the location; however, I got an error:
ufunc 'add' did not contain a loop with signature matching types (dtype('<U32'), dtype('<U32')) -> dtype('<U32')
Could you help to get the necessary information?
Use pd.apply to apply the geolocator.reverse function to each row of your dataframe.
geopy raised an error saying that latitude coordinates must be in the range [-90, 90], so I added latitude and longitude normalization.
import pandas as pd
from geopy.geocoders import Nominatim
from math import sin, asin, fmod, pi
geolocator = Nominatim(user_agent="geoapiExercises")
data = {'lat1': [116.51172,116.51135,116.51135,116.51627,116.47186],
'lon1': [39.92123,39.93883,39.93883,39.91034,39.91248]}
# Create DataFrame
df = pd.DataFrame(data)
# latitude and longitude normalization according to formulas found at
# https://stackoverflow.com/a/31119445/50065
df['lat'] = df['lat1'].apply(lambda lat: asin(sin((lat/180.0)*pi)) * (180.0/pi))
df['lon'] = df['lon1'].apply(lambda lon: fmod(lon - 180.0, 360.0) + 180.0)
df['lat_lon'] = df['lat'].astype(str) + ',' + df['lon'].astype(str)
df['location'] = df['lat_lon'].apply(lambda lat_lon: geolocator.reverse(lat_lon, language='en'))
df['address'] = df['location'].apply(lambda loc: loc.raw['address'])
df['postcode'] = df['address'].apply(lambda addr: addr.get('postcode', 'no postcode'))
Output:
| | lat1 | lon1 | lat | lon | lat_lon | location | address | postcode |
|---|---|---|---|---|---|---|---|---|
| 0 | 116.512 | 39.9212 | 63.4883 | 39.9212 | 63.488280000000024,39.92123000000001 | Обозерское городское поселение, Plesetsky District, Arkhangelsk Oblast, Northwestern Federal District, 164254, Russia | {'municipality': 'Обозерское городское поселение', 'county': 'Plesetsky District', 'state': 'Arkhangelsk Oblast', 'region': 'Northwestern Federal District', 'postcode': '164254', 'country': 'Russia', 'country_code': 'ru'} | 164254 |
| 1 | 116.511 | 39.9388 | 63.4887 | 39.9388 | 63.488650000000014,39.938829999999996 | Обозерское городское поселение, Plesetsky District, Arkhangelsk Oblast, Northwestern Federal District, 164254, Russia | {'municipality': 'Обозерское городское поселение', 'county': 'Plesetsky District', 'state': 'Arkhangelsk Oblast', 'region': 'Northwestern Federal District', 'postcode': '164254', 'country': 'Russia', 'country_code': 'ru'} | 164254 |
| 2 | 116.511 | 39.9388 | 63.4887 | 39.9388 | 63.488650000000014,39.938829999999996 | Обозерское городское поселение, Plesetsky District, Arkhangelsk Oblast, Northwestern Federal District, 164254, Russia | {'municipality': 'Обозерское городское поселение', 'county': 'Plesetsky District', 'state': 'Arkhangelsk Oblast', 'region': 'Northwestern Federal District', 'postcode': '164254', 'country': 'Russia', 'country_code': 'ru'} | 164254 |
| 3 | 116.516 | 39.9103 | 63.4837 | 39.9103 | 63.48373,39.91033999999999 | Обозерское городское поселение, Plesetsky District, Arkhangelsk Oblast, Northwestern Federal District, 164254, Russia | {'municipality': 'Обозерское городское поселение', 'county': 'Plesetsky District', 'state': 'Arkhangelsk Oblast', 'region': 'Northwestern Federal District', 'postcode': '164254', 'country': 'Russia', 'country_code': 'ru'} | 164254 |
| 4 | 116.472 | 39.9125 | 63.5281 | 39.9125 | 63.52813999999999,39.912480000000016 | Обозерское городское поселение, Plesetsky District, Arkhangelsk Oblast, Northwestern Federal District, 164254, Russia | {'municipality': 'Обозерское городское поселение', 'county': 'Plesetsky District', 'state': 'Arkhangelsk Oblast', 'region': 'Northwestern Federal District', 'postcode': '164254', 'country': 'Russia', 'country_code': 'ru'} | 164254 |
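One caution with this approach: Nominatim throttles heavy clients, so for more than a handful of rows it is worth wrapping the call in geopy's RateLimiter. A sketch reusing the geolocator and df defined above:
from geopy.extra.rate_limiter import RateLimiter
# at most one request per second, in line with Nominatim's usage policy
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)
df['location'] = df['lat_lon'].apply(lambda lat_lon: reverse(lat_lon, language='en'))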

Contains three consecutive consonants

I am trying to match capital cities that contain three consecutive consonants.
This is my code:
result = [i for i in capitals if re.match("\w*[^aeiouAEIOU\W]{3}\w*", i)]
print(*result)
result = [i for i in capitals if re.match(r"\b(?=[a-z]*[aeiou]{3})[a-z]+\b", i)]
print(*result)
This is the source:
capitals = ('Kabul', 'Tirana (Tirane)', 'Algiers', 'Andorra la Vella', 'Luanda', "Saint John's", 'Buenos Aires', 'Yerevan', 'Canberra', 'Vienna', 'Baku', 'Nassau', 'Manama', 'Dhaka', 'Bridgetown', 'Minsk', 'Brussels', 'Belmopan', 'Porto Novo', 'Thimphu', 'Sucre', 'Sarajevo', 'Gaborone', 'Brasilia', 'Bandar Seri Begawan', 'Sofia', 'Ouagadougou', 'Gitega', 'Phnom Penh', 'Yaounde', 'Ottawa', 'Praia', 'Bangui', "N'Djamena", 'Santiago', 'Beijing', 'Bogota', 'Moroni', 'Kinshasa', 'Brazzaville', 'San Jose', 'Yamoussoukro', 'Zagreb', 'Havana', 'Nicosia', 'Prague', 'Copenhagen', 'Djibouti', 'Roseau', 'Santo Domingo', 'Dili', 'Quito', 'Cairo', 'San Salvador', 'London', 'Malabo', 'Asmara', 'Tallinn', 'Mbabana', 'Addis Ababa', 'Palikir', 'Suva', 'Helsinki', 'Paris', 'Libreville', 'Banjul', 'Tbilisi', 'Berlin', 'Accra', 'Athens', "Saint George's", 'Guatemala City', 'Conakry', 'Bissau', 'Georgetown', 'Port au Prince', 'Tegucigalpa', 'Budapest', 'Reykjavik', 'New Delhi', 'Jakarta', 'Tehran', 'Baghdad', 'Dublin', 'Jerusalem', 'Rome', 'Kingston', 'Tokyo', 'Amman', 'Nur-Sultan', 'Nairobi', 'Tarawa Atoll', 'Pristina', 'Kuwait City', 'Bishkek', 'Vientiane', 'Riga', 'Beirut', 'Maseru', 'Monrovia', 'Tripoli', 'Vaduz', 'Vilnius', 'Luxembourg', 'Antananarivo', 'Lilongwe', 'Kuala Lumpur', 'Male', 'Bamako', 'Valletta', 'Majuro', 'Nouakchott', 'Port Louis', 'Mexico City', 'Chisinau', 'Monaco', 'Ulaanbaatar', 'Podgorica', 'Rabat', 'Maputo', 'Nay Pyi Taw', 'Windhoek', 'No official capital', 'Kathmandu', 'Amsterdam', 'Wellington', 'Managua', 'Niamey', 'Abuja', 'Pyongyang', 'Skopje', 'Belfast', 'Oslo', 'Muscat', 'Islamabad', 'Melekeok', 'Panama City', 'Port Moresby', 'Asuncion', 'Lima', 'Manila', 'Warsaw', 'Lisbon', 'Doha', 'Bucharest', 'Moscow', 'Kigali', 'Basseterre', 'Castries', 'Kingstown', 'Apia', 'San Marino', 'Sao Tome', 'Riyadh', 'Edinburgh', 'Dakar', 'Belgrade', 'Victoria', 'Freetown', 'Singapore', 'Bratislava', 'Ljubljana', 'Honiara', 'Mogadishu', 'Pretoria, Bloemfontein, Cape Town', 'Seoul', 'Juba', 'Madrid', 'Colombo', 'Khartoum', 'Paramaribo', 'Stockholm', 'Bern', 'Damascus', 'Taipei', 'Dushanbe', 'Dodoma', 'Bangkok', 'Lome', "Nuku'alofa", 'Port of Spain', 'Tunis', 'Ankara', 'Ashgabat', 'Funafuti', 'Kampala', 'Kiev', 'Abu Dhabi', 'London', 'Washington D.C.', 'Montevideo', 'Tashkent', 'Port Vila', 'Vatican City', 'Caracas', 'Hanoi', 'Cardiff', "Sana'a", 'Lusaka', 'Harare')
This is the output; it is missing one city, "Port Moresby":
Minsk Thimphu Phnom Penh Kinshasa Accra Conakry Reykjavik Baghdad Kingston Bishkek Lilongwe Nouakchott Windhoek Kathmandu Amsterdam Wellington Pyongyang Castries Kingstown Edinburgh Belgrade Ljubljana Stockholm Bangkok Ashgabat Washington D.C. Tashkent
This is my expected output, including "Port Moresby":
Minsk Thimphu Phnom Penh Kinshasa Accra Conakry Reykjavik Baghdad Kingston Bishkek Lilongwe Nouakchott Windhoek Kathmandu Amsterdam Wellington Pyongyang Port Moresby Castries Kingstown Edinburgh Belgrade Ljubljana Stockholm Bangkok Ashgabat Washington D.C. Tashkent
Give this a whirl. re.match only matches from the start of the string, and \w* cannot cross the space in "Port Moresby", so the consonant run "sby" in the second word is never reached; re.search scans the whole string instead:
result = [i for i in capitals if re.search(r"[^aeiou\W]{3}", i.lower())]
print(*result)
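A quick check of the failing case shows why the original pattern missed it:
import re
# original attempt: re.match anchors at the start, and \w* cannot cross the space
print(re.match(r"\w*[^aeiouAEIOU\W]{3}\w*", "Port Moresby"))  # None
# fixed version: re.search scans the whole string and finds 'sby'
print(re.search(r"[^aeiou\W]{3}", "port moresby"))            # matches 'sby'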

Python dictionary (with values being lists of various lengths) to pandas data frame

I have a dictionary like this; the values are lists of different lengths.
d = {'New England': ['connecticut',
'maine',
'massachusetts',
'new hampshire',
'rhode island',
'vermont'],
'Mideast': ['delaware',
'district of columbia',
'maryland',
'new jersey',
'new york',
'pennsylvania'],
'Great Lakes': ['illinois', 'indiana', 'michigan', 'ohio', 'wisconsin'],
'Plains': ['iowa',
'kansas',
'minnesota',
'missouri',
'nebraska',
'north dakota',
'south dakota'],
'Southeast': ['alabama',
'arkansas',
'florida',
'georgia',
'kentucky',
'louisiana',
'mississippi',
'north carolina',
'south carolina',
'tennessee',
'virginia',
'west virginia'],
'Southwest': ['arizona', 'new mexico', 'oklahoma', 'texas'],
'Rocky Mountain': ['colorado', 'idaho', 'montana', 'utah', 'wyoming'],
'Far West': ['alaska',
'california',
'hawaii',
'nevada',
'oregon',
'washington']}
I want to make it into a pandas dataframe with two columns, like the one below:
+-------------+---------------+
| region      | state         |
+-------------+---------------+
| New England | connecticut   |
+-------------+---------------+
| New England | maine         |
+-------------+---------------+
| New England | massachusetts |
+-------------+---------------+
| New England | new hampshire |
+-------------+---------------+
| Mideast     | new york      |
+-------------+---------------+
| Mideast     | new jersey    |
+-------------+---------------+
How can I achieve it?
I have a column of US states that I want to classify into regions, so I need a data frame to merge with.
BIG THANKS!
Use a list comprehension to flatten the lists in the values into (region, state) tuples, and pass that list to the DataFrame constructor:
df = pd.DataFrame([(k, x) for k, v in d.items() for x in v],
                  columns=['region', 'state'])
print(df.head(10))
        region                 state
0  New England           connecticut
1  New England                 maine
2  New England         massachusetts
3  New England         new hampshire
4  New England          rhode island
5  New England               vermont
6      Mideast              delaware
7      Mideast  district of columbia
8      Mideast              maryland
9      Mideast            new jersey
EDIT: loop solution:
out = []
for k, v in d.items():
    for x in v:
        out.append((k, x))
df = pd.DataFrame(out, columns=['region', 'state'])
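For pandas 0.25 or newer, Series.explode gives a compact alternative; a sketch assuming the same dict d:
import pandas as pd
# each dict value is a list; explode() turns every list element into its own row
df = (pd.Series(d, name='state')
        .explode()
        .rename_axis('region')
        .reset_index())
print(df.head())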
