# useful packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
rcParams.update({'figure.autolayout': True})
import warnings
warnings.filterwarnings("ignore")
Data Preparation
Before building an optimization or recommendation model, we need to make sure our data is in a “ready-to-go” state. Here, I summarize some ways to clean data for future reference.
- Descriptive Stat
- Query and Merge
- Group and Plot Data
- Fill NA, Replace and Assign Values
- Data Transformation on Column (Log, Datetime, Str)
- Check Logic
- Import and Export Data (Json, URL, API)
1. Descriptive Stat
data.describe(include = "all")
This provides a basic distribution of each variable. We can use it to spot extreme or unreasonable values and handle them before further investigation.

# some useful commands to get a general overview of a given dataset
# first 5 rows of a dataset
data.head(n=5)
# dimensions of a dataset
data.shape
# return the first observation of "col_name"
data["col_name"][0]
# change a column name by position
data.columns.values[index] = "new_col_0_name"
# change a column name by name
data.rename(columns={'old_name': 'new_name'}, inplace=True)
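Once describe() surfaces a suspicious maximum or minimum, it helps to pull the offending rows before deciding what to do with them. A minimal sketch, assuming a hypothetical numeric column "price":

# rows whose "price" exceeds the 99th percentile (hypothetical column name)
upper = data["price"].quantile(0.99)
extreme_rows = data[data["price"] > upper]
print(extreme_rows.shape[0], "rows above the 99th percentile")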

2. Query and Merge
data[data["column"] > criteria_value] data.loc[data.column > criteria_value] data.query("column == 'criteria_value'") data.query("column != \"unwanted_value\"").drop(["unwanted_column"], axis = 1)

# merge data1 with data2
data = pd.merge(left=data1, right=data2, how='left',
                left_on="column 1 in data 1", right_on="column 2 in data 2")
# check that the left join did not duplicate rows
print(len(data1) == len(data))
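The length check works because a left join duplicates a left-hand row whenever its key matches several rows on the right. A minimal sketch of checking for duplicate keys up front (the key column name is hypothetical):

# if this prints True, the left join above can return more rows than data1
print(data2["key_column"].duplicated().any())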
3. Group and Plot Data
# get ... (algorithm) of xyz for each abc
data.groupby("abc column")["xyz column"].algorithm
# plot a histogram with .plot.bar(); plot a trendline with .plot()
# get the count of each level of one categorical column
data.groupby("categorical_column").size().plot.bar()
# get the mean of the outcome for each level of one categorical column
data.groupby("categorical_column")["outcome"].mean().plot.bar()
# get the mean of the outcome for one continuous column
data.groupby("continuous_column")["outcome"].mean().plot()
# group data by id and date; C is a continuous variable, D is a str column
# note: some entries below are scalar aggregates, others return a per-row Series
data = data.groupby(["id", "Date"]).apply(lambda x: pd.Series({
    "max": x["C"].max(),
    "sum": x["C"].sum(),
    "mean": x["C"].mean(),
    "count": x["C"].count(),
    "cumsum": x["C"].cumsum(),
    "to_numeric": pd.to_numeric(x["C"]),
    "pct_C_is_1": np.average(x["C"] == 1),
    "avg_log": np.average(np.log(x["C"])),
    "previous_value": x["C"].shift(1),
    "pct_of_changed": x["C"] / x["C"].shift(1) - 1,
    "num_of_items": x["D"].str.split(',').str.len(),
}))



# check the largest level counts
data.groupby("categorical_column").size().nlargest(2).plot.bar()
# check the count for each level
# size() and value_counts() differ mainly in that the latter returns counts in descending order
data.groupby("categorical_column").size()
data["categorical_column"].value_counts()
# create a new column with the count of each level of one categorical column
# this might be useful for feature engineering
data.groupby("categorical_column")["categorical_column"].transform("count").head()
# use agg to compute several statistics at once
data.groupby(["categorical_col1", "categorical_col2"])["outcome"].agg(['mean', 'count']).reset_index()
Note: in a groupby, agg applies a function to each column within each group and returns one value per column, while apply passes each group as a whole DataFrame to the function, which is slower but more flexible.
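A minimal sketch of the difference, assuming hypothetical columns "g", "outcome", and "weight":

# agg: one scalar per column per group
data.groupby("g")["outcome"].agg(['mean', 'count'])
# apply: the function receives each group as a DataFrame and can combine columns freely,
# e.g. a weighted mean of "outcome" per group
data.groupby("g").apply(lambda grp: (grp["outcome"] * grp["weight"]).sum() / grp["weight"].sum())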

4. Fill NA, Replace and Assign Values
# count NAs per column
data.isnull().sum().sort_values()
# fill NAs in data1 using data2
# when the row/col indices do not fully overlap
data1.combine_first(data2)
# when the row/col indices are exactly the same
data1.fillna(data2)
If a few columns have the same number of NAs, they are likely linked. Check whether this makes sense logically; if so, replace the NAs with another value.
# Columns A and B have the same number of NAs; theoretically the NAs are linked to whether column C == 1
(data.A.isnull() == (data.C == 1)).value_counts()
(data.B.isnull() == (data.C == 1)).value_counts()
# if both results are all True with the same count, the theory is correct
data.A.fillna(-1, inplace=True)
data.B.fillna(-1, inplace=True)
# if column C is binary, we can delete it, since it no longer carries information
# otherwise, we keep it
del data['C']
# binary indicator of whether each value in one column is NA
data.iloc[:, indice].isna()
# fill each column with a specific value depending on whether the value is NA or not
for i in range(4, 8):
    data.iloc[:, i] = np.where(data.iloc[:, i].isna(), 0, 1)
# check that there are no more NAs
print(data.isnull().sum().max())

Just as we investigated NAs, we should also pay attention to zeros in a column. For example, if is_employed == 1, yearly_salary is not likely to be 0. Check how many such cases exist and dig deeper to figure out why.
print(data.query('is_employed == 1 & yearly_salary == 0').shape[0])
# replace values using a dictionary
# here we are replacing a categorical column with continuous values
# useful for feature engineering
codes = {"level1": 1, "level2": 2, "level3": 3, "level4": 4}
# replace vs map: map is quicker, but any value missing from the dictionary becomes NaN,
# while replace leaves unmatched values unchanged;
# for map, pass na_action='ignore' to leave existing NaN values untouched
data["replace_column"] = data["replace_column"].map(codes)
data["replace_column"] = data["replace_column"].replace(codes)
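A quick illustration of that difference on a value missing from the dictionary (the series below is made up for illustration):

s = pd.Series(["level1", "level5"])
print(s.map(codes))      # "level5" is not in the dictionary, so it becomes NaN
print(s.replace(codes))  # "level5" is left unchanged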
# assign a value in column2 based on the value in column1
data.loc[data.column1 == criteria_value, "column2"] = "new_value"
# data_cp[data_cp.age == 25]["source"] = "Ads" does not work (chained assignment)
# assign a value based on the row index
data.loc[row_num, "column"] = "new_value"
# assign a value to an entire column
data["new_column"] = new_value

5. Data Transformation on Column (Log, Datetime, Str)
a. Log Transformation
Note: if the original data contains zero or negative values, a log transformation is not directly applicable, so it might not be ideal.
After the log transform, the same absolute change matters less at larger values: going from 6 to 5 changes the log less than going from 2 to 1.
data["column"] = np.log(data["column"])
b. Datetime Transformation
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')
# extract the hour
data['hour'] = data['timestamp'].apply(lambda x: x.hour)
sns.countplot(x='hour', data=data)
plt.show()
# extract the day of the week and week of the year from the date
data['weekday'] = pd.to_datetime(data['date']).dt.strftime('%A')
data['weekyear'] = pd.to_datetime(data['date']).dt.strftime('%U').astype(int)
# or
data['weekyear'] = pd.to_datetime(data.date).dt.week  # in newer pandas, use .dt.isocalendar().week
c. Str Transformation
# if we want to combine columns A and B, both in str format
new_col = data.A + " and " + data.B
# count items separated by commas
data['count'] = data['column'].apply(lambda x: len(x.split(',')))
6. Check Logic
- check if id is unique
- check for negative values
- for funnel analysis, check direction of funnel
- for records that should be generated on consecutive dates, check whether they actually are
# 1. check if id is unique
print(data['id_column'].nunique() == len(data))  # if True, then it is unique
# 2. check for negative values in column1
print("There are", len(data.query('column1 < 0')), "events with negative values")
# 3. for funnel analysis, always check the direction of the funnel
# order down the funnel: layer1, layer2, layer3
print((data['layer1'] >= data['layer2']).unique())
print((data['layer2'] >= data['layer3']).unique())
# or
print("There are", len(data.query('layer1 < layer2 | layer2 < layer3')),
      "events where the funnel doesn't make any sense")
# 4. check if the observations collected for each id are from consecutive days
consecutive_days = data.groupby('id')['date'].transform(lambda x: pd.to_datetime(x).diff())
consecutive_days.dropna().unique().astype('timedelta64[D]')
# if consecutive, this returns: array([1], dtype='timedelta64[D]')
7. Import and Export Data (Json, URL, API)
- Import Data (Json, URL)
For Json, check the structure first, then build a data frame row by row based on each entry.
# Import Json:
import json
with open('code/city_searches.json', 'r') as f:
    data_json = json.load(f)
# extract all the values from the json
data_js = pd.DataFrame(columns=["col1", "col2", "col3"])
for i in range(len(data_json)):
    data_js.loc[i, "col1"] = data_json[i]["section_name"][indices]
    data_js.loc[i, "col2"] = data_json[i]["section_name"][indices]
    data_js.loc[i, "col3"] = data_json[i]["section_name"][indices]
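When every entry shares a reasonably flat structure, pandas can also build the frame in one call; a minimal sketch under that assumption, using pd.json_normalize:

# flatten a list of dicts into a DataFrame; nested keys become "a.b"-style columns
data_js = pd.json_normalize(data_json)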

For URLs, use the following standard code to convert them to a data frame.
from urllib import parse
data = pd.read_csv("location_of_url_file", delimiter="", header=None, names=['url'])
data_wide = pd.DataFrame()
for i in range(len(data)):
    params_extracted = pd.DataFrame([parse.parse_qs(parse.urlparse(data.url[i]).query)])
    data_wide = data_wide.append(params_extracted, sort=True)  # in pandas >= 2.0, collect frames and use pd.concat
data_wide = data_wide.reset_index()
- Create and Export Data
# create a dataframe
data = pd.DataFrame({'column1': list_1, 'column2': list_2})
# export to csv; remember to add '.csv' at the end of the path
data.to_csv("location", index=False, header=True)
- API
# API useful packages
import requests
import requests_cache
requests_cache.install_cache('github_cache', backend='sqlite', expire_after=180)
# different APIs have different formats; use a for loop to collect values
# input_list: the list of items to query
new_list = []
for i in range(len(input_list)):
    API_value = .......(...key = '****'...).....  # call the API for input_list[i]
    new_list.append(API_value)
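As a concrete sketch of that loop pattern (the endpoint, query parameter, and JSON field below are hypothetical, not a specific API):

# hypothetical endpoint and response field, for illustration only
new_list = []
for user_id in input_list:
    resp = requests.get("https://api.example.com/users/" + str(user_id),
                        params={"key": "****"})
    new_list.append(resp.json()["value"])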
More Groupby: