Python groupby output

I'm trying to take a spreadsheet input and display only the URLs that are CMS related (wordpress|Wordpress|WordPress|Drupal|drupal|Joomla|joomla). I want the output grouped by "technologies", with each technology's associated "url" values listed under it.
Code is:
import pandas as pd
import numpy as np
dataset="ASD-example.xlsx"
term_cms = 'wordpress|Wordpress|WordPress|Drupal|drupal|Joomla|joomla'
df = pd.read_excel((dataset), sheet_name="HTTPX")
df['technology_count'] = df.groupby('technologies')['url'].transform('count')
df.drop(['timestamp', 'request', 'response-header', 'scheme', 'port', 'body-sha256','header-sha256', 'a', 'cnames', 'input', 'location', 'error', 'response-body', 'content-type','method', 'host', 'content-length', 'chain-status-codes', 'status-code', 'tls-grab', 'csp', 'vhost','websocket', 'pipeline', 'http2', 'cdn', 'response-time', 'chain', 'final-url', 'failed','favicon-mmh3', 'lines', 'words','path','webserver'],inplace=True,axis=1)
df[df['technologies'].str.contains(term_cms, na=False)]
pivot1 = pd.pivot_table(df, index=['technologies', 'url'], columns=None, fill_value=0)
print(pivot1)

I cleaned up your code a bit to make it more readable and to get the output you want.
term_cms = ["wordpress", "drupal", "joomla"]
# remove square brackets and lowercase all names
df['technologies'] = df['technologies'].str.strip('[]')
df['technologies'] = df['technologies'].str.lower()
# include only needed technologies
mask = df['technologies'].isin(term_cms)
df = df[mask]
# groupby and count
df = df.groupby(['technologies', 'url']).size().reset_index(name='technology_count')
Output:
  technologies                   url  technology_count
0       joomla   https://testcom123.                 1
1    wordpress  https://test.com:443                 1
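One caveat: isin() only keeps rows whose cell is exactly one of the three names. If a technologies cell can list several entries at once, a case-insensitive str.contains keeps partial matches too; a minimal sketch of that variant, assuming the same column names as above:

term_cms = 'wordpress|drupal|joomla'
# case-insensitive regex match anywhere in the cell; NaN cells drop out
mask = df['technologies'].str.contains(term_cms, case=False, na=False)
counts = (df[mask]
          .groupby(['technologies', 'url'])
          .size()
          .reset_index(name='technology_count'))
print(counts)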

Related

Retype CSV file in Python

I got a CSV file with two headers, and I don't quite know how to describe it, so I pasted it below. I need to reorder it into a normal CSV file. There is no information in the "age" key. I just want to retrieve the name and age fields: I need to output "first_name","last_name","age" and use them as the title row.
"ID","meta_key","meta_data"
1,"nickname","dale ganger"
2,"first_name","ganger"
3,"last_name","dale"
4,"age",
5,"sex","F"
6,"nickname","dale ganger"
7,"first_name","ganger"
8,"last_name","dale"
9,"age",
10,"sex","F"
11,"nickname","dale ganger"
12,"first_name","ganger"
13,"last_name","dale"
14,"age",
15,"sex","F"
I used this code, but it doesn't merge the headers:
import pandas as pd
pd.read_csv('input.csv', header=None).T.to_csv('output.csv', header=False, index=False)
Output:
ID,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
meta_key,nickname,first_name,last_name,age,sex,nickname,first_name,last_name,age,sex,nickname,first_name,last_name,age,sex
meta_data,dale ganger,ganger,dale,,F,dale ganger,ganger,dale,,F,dale ganger,ganger,dale,,F
The final look I want
nickname,first_name,last_name,age,sex
dale ganger,ganger,dale,,F
dale ganger,ganger,dale,,F
dale ganger,ganger,dale,,F
Try:

df_csv = pd.read_csv('data.csv')
df = df_csv.drop('ID', axis=1).transpose()
df.columns = df.iloc[0]
df = df.iloc[1:, :].reset_index(drop=True)
df = df[['first_name', 'last_name', 'age']]
To replicate everything:
import pandas as pd

data = {'ID': [1, 2, 3, 4, 5],
        'meta_key': ['nickname', 'first_name', 'last_name', 'age', 'sex'],
        'meta_data': ['dale ganger', 'ganger', 'dale', '', 'F']}
df_csv = pd.DataFrame(data)
print(df_csv)  # before change

df = df_csv.drop('ID', axis=1).transpose()
df.columns = df.iloc[0]  # or use below
# df.columns = ['nickname', 'first_name', 'last_name', 'age', 'sex']
df = df.iloc[1:, :].reset_index(drop=True)
df = df[['first_name', 'last_name', 'age']]
print(df)  # after change
Output is:

  first_name last_name age
0     ganger      dale
I see you changed your question; the data now repeats every 5 rows. In that case I would do this:
df = pd.read_csv('unstructured.csv')

# create a dictionary to store the data on each iteration
data_dict = {'nickname': [], 'first_name': [], 'last_name': [], 'age': [], 'sex': []}

for i in range(0, len(df), 5):
    data_dict['nickname'].append(df['meta_data'][i])
    data_dict['first_name'].append(df['meta_data'][i+1])
    data_dict['last_name'].append(df['meta_data'][i+2])
    data_dict['age'].append(df['meta_data'][i+3])
    data_dict['sex'].append(df['meta_data'][i+4])

new_df = pd.DataFrame(data_dict)
print(new_df)
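An alternative sketch, assuming every record really does span exactly 5 rows in the fixed order shown: label each 5-row block and pivot on meta_key (the helper column name 'record' is made up for illustration):

df = pd.read_csv('unstructured.csv')
df['record'] = df.index // 5  # give every 5-row block one id
wide = df.pivot(index='record', columns='meta_key', values='meta_data')
print(wide[['nickname', 'first_name', 'last_name', 'age', 'sex']])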

Read Excel file without using pandas, add new columns, and print to an output file

I am new to Python coding and, due to some issue, I need to reconfigure my code without pandas.
I am reading an Excel file and extracting a few columns with filtered values, then passing one column's values to a function to fetch results. The results come back as a complex dictionary, so I have to create new columns from that dictionary, join the two outputs (the initial Excel data and the dictionary fields), and write that to the output file.
So my data is
Customer  Customer Name  Serial Number
1         XYZ            101011
2         XYZ            1020123
3         XYX            102344
Dictionary output
[{'cert': {'alternate_names': [],
           'created_on': '2017-09-10T16:15:25.7599734Z',
           'csr_used': False,
           'error_details': '',
           'revocation_date': None,
           'revocation_status': None,
           'serial_no': '101011',
           'status': 'Expired',
           'valid_to': '2020-09-09T23:59:59.0000000Z'},
  'meta': {'api_application_biz_unit': '',
           'api_client_nuid': '',
           'asset_name': '',
           'audience': 'External',
           'automation_utility': '',
           'delegate_owner': '',
           'environment': 'Development',
           'l2_group_email': None,
           'l3_group_email': None,
           'requestor_email': '',
           'support_email': '',
           'tech_delegate_email': None,
           'tech_owner_email': None}}]
Desired output:
Customer  Customer Name  Serial Number  Alternate_name  Audience  Environment
1         XYZ            101011         []              External  Dev
My Code:
def create_excel(filename):
    data = pd.read_excel(filename, usecols=[4, 18, 19, 20, 26, 27, 28])
    data["Customer Name"].fillna("N/A", inplace=True)
    df = data[data['Customer Name'].str.contains("XYZ", case=False)]
    output = df['Serial Number'].apply(lambda x: fetch_by_ser_no(x))
    df2 = pd.DataFrame(output)
    df2.columns = ['Output']
    df5 = pd.concat([df, df2], axis=1)
    df3 = pd.concat([pd.DataFrame(pd.json_normalize(x)) for x in df2['Output']],
                    ignore_index=False)
    df3["Serial Number"] = df3.iloc[:, 11]
    df4 = pd.merge(left=df5, right=df3, how='left',
                   left_on=df5["Serial Number"].str.lower(),
                   right_on=df3["Serial Number"].str.lower())
    df4.fillna("N/A", inplace=True)
    df4["Status"] = df4.iloc[:, 21].replace({"N/A": "Cust Not Found"}, inplace=True)
    df4["Status"] = df4.iloc[:, 21]
    df4["Serial Number"] = df4.iloc[:, 4]
    df4["Audience"] = df4.iloc[:, 30]
    df4["Environment"] = df4.iloc[:, 33]
    df4[["Customer", "Customer Name", "Serial Number", "Common Name", "Status",
         "Environment", "Audience"]].to_excel(r'Data.xlsx', index=False)
I want to remove the pandas dependency from the code. I am having a hard time figuring this out.
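Since the question is specifically about dropping pandas, here is a minimal sketch of the same pipeline on top of openpyxl. fetch_by_ser_no is your existing lookup function; the header names, column positions, and output filename are assumptions taken from the sample data above:

import openpyxl

def create_excel(filename):
    wb = openpyxl.load_workbook(filename, read_only=True)
    ws = wb.active
    rows = ws.iter_rows(values_only=True)
    header = next(rows)
    name_idx = header.index("Customer Name")
    serial_idx = header.index("Serial Number")

    out_wb = openpyxl.Workbook()
    out_ws = out_wb.active
    out_ws.append(["Customer", "Customer Name", "Serial Number",
                   "Alternate_name", "Audience", "Environment"])

    for row in rows:
        name = row[name_idx] or "N/A"
        if "xyz" not in str(name).lower():
            continue
        result = fetch_by_ser_no(row[serial_idx])  # your existing function
        cert, meta = result[0]['cert'], result[0]['meta']
        out_ws.append([row[0], name, row[serial_idx],
                       str(cert['alternate_names']),
                       meta['audience'], meta['environment']])

    out_wb.save('Data.xlsx')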

How to use pandas INPUT function to get a list of customers

I have created code to get users of my platform based on 2 things:
choiceTitle: search for a specific word contained in the title of an Ad that users of my platform have looked at. For example, the Ad is "We are offering free Gin" and I want to match the word 'Gin'
PrimaryTagPreviousChoice: the Ad has a "Food and Drink" tag
I can get those users who are interested in Gin and Food and Drink with:
(df2['choiceTitle'].str.contains("(?i)Gin")) & (df2['PrimaryTagPreviousChoice'].str.contains("(?i)Food and Drink"))
What I'd like to do is create a function with all my code inside (the SQL query, the rename operation, the sort_values operation, etc.) and then use the input() function. Then I'll just run my code and Python will ask me 2 questions:
choiceTitle? ... Gin
PrimaryTagPreviousChoice? ...Food and Drink.
I enter the 2 options and it gives me the users interested in, let's say, Gin and Food and Drink.
How can I do it?
MY CODE:
df = pd.read_sql_query(""" select etc..... """, con)
df1 = pd.read_sql_query(""" select etc..... """, con)
df1['user_id'] = df1['user_id'].apply(str)
df2 = pd.merge(df, df1, left_on='user_id', right_on='user_id', how='left')
tag = df2[
(df2['choiceTitle'].str.contains("(?i)Gin")) &
(df2['PrimaryTagPreviousChoice'].str.contains("(?i)Food and Drink"))
]
dw = tag[['user', 'title', 'user_category', 'email', 'last_login',
          'PrimaryTagPreviousChoice', 'choiceTitle']].drop_duplicates()
dw = dw.sort_values(['last_login'], ascending=[False])
dw = dw[dw.last_login > dt.datetime.now() - pd.to_timedelta("30day")]
dw = dw.rename({'user': 'user full name', 'title': 'user title'},
               axis='columns')
dw.drop_duplicates(subset="email", keep='first', inplace=True)
Adding a function in Python is simple. Just use the def keyword to declare the function and put your existing code under it (indented). Put parameters in the parentheses.
Here is the updated code:
def GetUsers(title, tag):
    df = pd.read_sql_query(""" select etc..... """, con)
    df1 = pd.read_sql_query(""" select etc..... """, con)
    df1['user_id'] = df1['user_id'].apply(str)
    df2 = pd.merge(df, df1, left_on='user_id', right_on='user_id', how='left')
    matches = df2[
        (df2['choiceTitle'].str.contains("(?i)" + title)) &
        (df2['PrimaryTagPreviousChoice'].str.contains("(?i)" + tag))]
    dw = matches[['user', 'title', 'user_category', 'email', 'last_login',
                  'PrimaryTagPreviousChoice', 'choiceTitle']].drop_duplicates()
    dw = dw.sort_values(['last_login'], ascending=[False])
    dw = dw[dw.last_login > dt.datetime.now() - pd.to_timedelta("30day")]
    dw = dw.rename({'user': 'user full name', 'title': 'user title'},
                   axis='columns')
    dw.drop_duplicates(subset="email", keep='first', inplace=True)
    return dw  # send back to print statement

# get input from user
inpTitle = input("choiceTitle? ")
inpTag = input("PrimaryTagPreviousChoice? ")

# run function
result = GetUsers(inpTitle, inpTag)
print(result)
Try this. Save your input() calls as variables and use str.format() to build your mask. Note that a single {} placeholder is needed; doubled {{}} would be written into the pattern literally instead of being substituted.

choiceTitle = input('choiceTitle?')
PrimaryTagPreviousChoice = input('PrimaryTagPreviousChoice?')

mask = df2[(df2['choiceTitle'].str.contains("(?i){}".format(choiceTitle))) &
           (df2['PrimaryTagPreviousChoice'].str.contains("(?i){}".format(PrimaryTagPreviousChoice)))]
dw = mask[['user', 'title', 'user_category', 'email', 'last_login',
           'PrimaryTagPreviousChoice', 'choiceTitle']].drop_duplicates()
....
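One caveat worth hedging: str.contains treats the pattern as a regex, so user input containing metacharacters can break the match. A minimal sketch of guarding against that with re.escape:

import re

# escape metacharacters in the raw input so e.g. "Food (and Drink)" matches literally
pattern = "(?i)" + re.escape(choiceTitle)
mask = df2['choiceTitle'].str.contains(pattern, na=False)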

Python 3.7 KeyError

I'd like to retrieve information from NewsApi and ran into an issue. Enclosed is the code:
from NewsApi import NewsApi
import pandas as pd
import os
import datetime as dt
from datetime import date

def CreateDF(JsonArray, columns):
    dfData = pd.DataFrame()
    for item in JsonArray:
        itemStruct = {}
        for cunColumn in columns:
            itemStruct[cunColumn] = item[cunColumn]
        # dfData = dfData.append(itemStruct, ignore_index=True)
        # dfData = dfData.append({'id': item['id'], 'name': item['name'], 'description': item['description']},
        #                        ignore_index=True)
        # return dfData
    return itemStruct

def main():
    # access_token_NewsAPI.txt must contain your personal access token
    with open("access_token_NewsAPI.txt", "r") as f:
        myKey = f.read()[:-1]
    # myKey = 'a847cee6cc254d8495632f83d5c77d39'
    api = NewsApi(myKey)

    # get sources of news
    # columns = ['id', 'name', 'description']
    # rst_source = api.GetSources()
    # df = CreateDF(rst_source['sources'], columns)
    # df.to_csv('source_list.csv')

    # get news for specific country
    # rst_country = api.GetHeadlines()
    # columns = ['author', 'publishedAt', 'title', 'description', 'content', 'url']
    # df = CreateDF(rst_country['articles'], columns)
    # df.to_csv('Headlines_country.csv')

    # get news for specific symbol
    symbol = "coronavirus"
    sources = 'bbc.co.uk'
    columns = ['author', 'publishedAt', 'title', 'description', 'content', 'source']
    limit = 500  # maximum requests per day
    i = 1
    startDate = dt.datetime(2020, 3, 1, 8)
    # startDate = dt.datetime(2020, 3, 1)
    df = pd.DataFrame({'author': [], 'publishedAt': [], 'title': [], 'description': [], 'content': [], 'source': []})

    while i < limit:
        endDate = startDate + dt.timedelta(hours=2)
        rst_symbol = api.GetEverything(symbol, 'en', startDate, endDate, sources)
        rst = CreateDF(rst_symbol['articles'], columns)
        df = df.append(rst, ignore_index=True)
        # DF.join(df.set_index('publishedAt'), on='publishedAt')
        startDate = endDate
        i += 1

    df.to_csv('Headlines_symbol.csv')

main()
I got the following error:
rst = CreateDF(rst_symbol['articles'], columns)
KeyError: 'articles'
In this line:
rst = CreateDF(rst_symbol['articles'], columns)
I think there is some problem regarding the key not being found or defined. Does anyone have an idea how to fix that? I'm thankful for every hint!
MAiniak
EDIT:
I found the solution after trying a few of your hints. Apparently, the error occurred whenever the NewsAPI key ran into a request limit. This happened every time until I changed limit = 500 to limit = 20. For some reason, there is no error with a new API key and the reduced limit.
Thanks for your help guys!
Probably 'articles' is not one of the keys in your rst_symbol object.
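A minimal sketch of guarding for that before building the frame; the printed error payload shape is what NewsAPI documents for failed requests (e.g. when you are rate-limited):

rst_symbol = api.GetEverything(symbol, 'en', startDate, endDate, sources)
articles = rst_symbol.get('articles')
if articles is None:
    # failed requests come back as {'status': 'error', 'code': ..., 'message': ...}
    print(rst_symbol)
else:
    rst = CreateDF(articles, columns)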
The Python documentation [2] [3] doesn't mention any method named NewsApi() or GetEverything(), but rather NewsApiClient() and get_everything(), i.e.:
from newsapi import NewsApiClient

# Init
newsapi = NewsApiClient(api_key='xxx')

# /v2/top-headlines
top_headlines = newsapi.get_top_headlines(q='bitcoin',
                                          sources='bbc-news,the-verge',
                                          category='business',
                                          language='en',
                                          country='us')

# /v2/everything
all_articles = newsapi.get_everything(q='bitcoin',
                                      sources='bbc-news,the-verge',
                                      domains='bbc.co.uk,techcrunch.com',
                                      from_param='2017-12-01',
                                      to='2017-12-12',
                                      language='en',
                                      sort_by='relevancy',
                                      page=2)

# /v2/sources
sources = newsapi.get_sources()
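To tie that back to the question, a hedged sketch of building the CSV straight from the returned 'articles' list (the column names follow the asker's list; pd.json_normalize requires pandas >= 1.0):

import pandas as pd

articles = all_articles.get('articles', [])
df = pd.json_normalize(articles)  # flattens nested fields like source.name
df[['author', 'publishedAt', 'title', 'description', 'content']].to_csv(
    'Headlines_symbol.csv', index=False)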

Webscraping data from a JSON source, why do I get only 1 row?

I'm trying to get some information from a website (a webshop) with Python.
I tried this:
def proba():
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    results = []
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except:
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    df = pd.DataFrame(results, columns=['Name', 'Price', 'Url'])
    # print(df)  ## print df
    df.to_csv(r'/usr/src/Python-2.7.13/test.csv', sep=',', encoding='utf-8-sig', index=False)

while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    while mytime < "23:59:59":
        print mytime
        proba()
        mytime = datetime.now().strftime("%H:%M:%S")
In this webshop there are 9 items, but I see only 1 row in the CSV file.
Not entirely sure what you intend as the end result. Are you wanting to update an existing file, or get the data and write it all out in one go? An example of the latter is shown below, where I add each new dataframe to an overall dataframe and use a return statement in the function to provide each new dataframe.
import requests
from datetime import datetime
import pandas as pd

def proba():
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    results = []
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except:
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    df = pd.DataFrame(results, columns=['Name', 'Price', 'Url'])
    return df

headers = ['Name', 'Price', 'Url']
df = pd.DataFrame(columns=headers)

while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    while mytime < "23:59:59":
        print(mytime)
        dfCurrent = proba()
        mytime = datetime.now().strftime("%H:%M:%S")
        df = pd.concat([df, dfCurrent])
        df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')
