Author: Jeon Bomin (13th cohort)
Reference kernel: https://www.kaggle.com/mostafaalaa123/simple-house-prediction/notebook#Outliers-!
# Data Analysis
import numpy as np
import pandas as pd
import random
# Statistics
from scipy.stats import norm
from scipy import stats
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
# ML
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import xgboost as xg
# Another
import warnings
warnings.filterwarnings('ignore')
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
print(train_data.head())
print('-'*20)
print(train_data.info())
We will organize this large set of variables in the following two ways.
-> Split the variables by their number of missing values
full = pd.DataFrame()
medium = pd.DataFrame()
remove_me = pd.DataFrame()
features = train_data.columns.values
number_of_houses = 1460 # or train_data.shape[0]
for feature in features:
    if train_data[feature].count() == number_of_houses:
        full[feature] = train_data[feature]
    elif train_data[feature].count() > number_of_houses*0.5:  # i.e. the feature is more than 50% non-null
        medium[feature] = train_data[feature]
    else:
        remove_me[feature] = train_data[feature]
-> Split the variables into numerical and categorical
Tip) select_dtypes lets you select the columns of a desired data type
Tip) describe(include=['O']) shows summary statistics for the object (categorical) variables
Numerical
print('Number of numerical features: ', end='')
print(len(train_data.select_dtypes(include=['number']).columns.values))
train_data.describe(exclude=['O'])
Categorical
print('Number of categorical features: ', end='')
print(len(train_data.select_dtypes(include=['O']).columns.values))
train_data.describe(include=['O'])
Now we drop the following variables: (1) the Id column, (2) the remove_me columns, which have too many missing values, and (3) numerical columns dominated by zeros.
Tip) df.loc[condition, column] extracts the rows that satisfy the condition
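A minimal illustration of the df.loc[condition, column] pattern (the 200,000 threshold here is only for illustration):
# living area of the houses that sold for more than 200,000
expensive_living_area = train_data.loc[train_data['SalePrice'] > 200000, 'GrLivArea']
print(expensive_living_area.head())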
#1
train_data = train_data.drop(['Id'], axis=1)
#2
train_data = train_data.drop(remove_me.columns.values, axis=1)
#3
#First let's create the important data we will use
numerical_data = train_data.select_dtypes(include=['number'])
categorical_data = train_data.select_dtypes(include=['object'])
# for each feature, compute the ratio of zero values out of the 1460 houses;
# features where this ratio is too high (above 0.3) are dropped below
feature_zero_ratio = {feature:numerical_data.loc[numerical_data[feature]==0, feature].count() / 1460 for feature in numerical_data.columns.values}
feature_zero_ratio
Remove the variables whose zero ratio exceeds 0.3
for feature in numerical_data:
    if feature_zero_ratio[feature] > 0.3:
        numerical_data = numerical_data.drop([feature], axis=1)
        train_data = train_data.drop([feature], axis=1)
        if feature in medium:
            medium = medium.drop([feature], axis=1)
Check the correlations between the numerical variables and the target with seaborn's heatmap
corrmat = numerical_data.corr()
fig, ax = plt.subplots(figsize=(12,12))
sns.set(font_scale=1.25)
sns.heatmap(corrmat, vmax=.8, annot=True, square=True, annot_kws={'size':8}, fmt='.2f')
plt.show()
Redraw the heatmap with only the 10 variables most strongly correlated with the target variable 'SalePrice'.
Tip) pandas.DataFrame.nlargest(n, columns) : Return the first n rows ordered by columns in descending order
It does the same thing as df.sort_values(columns, ascending=False).head(n)!
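A quick check of that equivalence (purely illustrative):
# both expressions pick the same 10 features
by_nlargest = corrmat.nlargest(10, 'SalePrice')['SalePrice'].index
by_sort = corrmat.sort_values('SalePrice', ascending=False).head(10).index
print((by_nlargest == by_sort).all())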
n = 10
most_largest_features = corrmat.nlargest(n, 'SalePrice')['SalePrice'].index
zoomed_corrmat = np.corrcoef(numerical_data[most_largest_features].values.T)
fig, ax = plt.subplots(figsize=(6,6))
sns.set(font_scale=1)
sns.heatmap(zoomed_corrmat, annot=True, square=True, fmt='.2f', annot_kws={'size':10}, yticklabels=most_largest_features.values, xticklabels=most_largest_features.values)
print(most_largest_features.values)
Look at the relationships among the 7 variables most correlated with the target using scatter plots.
sns.set()
most_largest_features = corrmat.nlargest(7, 'SalePrice')['SalePrice'].index
sns.pairplot(numerical_data[most_largest_features.values], height=1.5)
plt.show()
Where two predictors are strongly linearly related to each other, keep the one more correlated with the target and drop the other.
e.g. GrLivArea and 1stFlrSF --> drop 1stFlrSF
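That choice can be read directly off the correlation matrix computed above; for the GrLivArea / 1stFlrSF pair, for example:
# correlation of the pair with each other, and of each member with the target
print(corrmat.loc['GrLivArea', '1stFlrSF'])
print(corrmat.loc[['GrLivArea', '1stFlrSF'], 'SalePrice'])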
numerical_data = numerical_data.drop(['1stFlrSF', 'TotalBsmtSF', 'GarageArea', 'GarageYrBlt'],axis=1)
train_data = train_data.drop(['1stFlrSF', 'TotalBsmtSF', 'GarageArea', 'GarageYrBlt'],axis=1)
Remove the 'neutral' variables whose correlation with the target lies in [-0.1, 0.2]
corr_with_price = numerical_data.corr()
corr_with_price = corr_with_price.sort_values(by='SalePrice', ascending=False)
corr_with_price['SalePrice']
numerical_data = numerical_data.drop(['MSSubClass', 'OverallCond', 'YrSold', 'MoSold', 'BedroomAbvGr'],axis=1)
train_data = train_data.drop(['MSSubClass', 'OverallCond', 'YrSold', 'MoSold', 'BedroomAbvGr'],axis=1)
Next, handle the missing values in two groups:
-> numerical variables
-> categorical variables
Check the missing values in the numerical variables
# select the numerical columns that still contain missing values
numerical_have_missing = train_data.select_dtypes(include=['number'])
numerical_have_missing = numerical_have_missing[numerical_have_missing.columns[numerical_have_missing.isnull().any()]]
print(numerical_have_missing.columns.values)
print('-'*30)
print(numerical_have_missing.info())
sns.histplot(numerical_have_missing['LotFrontage'])
plt.title('LotFrontage')
plt.show()
Replace the missing values with random values between 60 and 80.
Tip) A list comprehension turns a loop into a single line of code
old_LotFrontage = list(numerical_have_missing['LotFrontage'].values)
missing_indices = list(numerical_have_missing.loc[numerical_have_missing['LotFrontage'].isnull(), 'LotFrontage'].index)
random_values = [random.randint(60,80) for _ in range(1460 - numerical_have_missing['LotFrontage'].count())]
random_values_idx = 0
for missing_idx in missing_indices:
    old_LotFrontage[missing_idx] = random_values[random_values_idx]
    random_values_idx += 1
numerical_have_missing['LotFrontage'] = pd.Series(old_LotFrontage)
train_data['LotFrontage'] = pd.Series(old_LotFrontage)
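Following the tip above, the same imputation could also be written as a single list comprehension (an equivalent alternative; the column has already been filled by the loop above, so this is only an illustration):
# keep each value if present, otherwise draw a random value in [60, 80]
filled_LotFrontage = [v if pd.notnull(v) else random.randint(60, 80) for v in numerical_have_missing['LotFrontage']]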
Check the missing values in the categorical variables
# select the categorical columns that still contain missing values
categorical_have_missing = train_data.select_dtypes(include=['object'])
categorical_have_missing = categorical_have_missing[categorical_have_missing.columns[categorical_have_missing.isnull().any()]]
print(len(categorical_have_missing.columns.values))
print('-'*30)
print(categorical_have_missing.columns.values)
print('-'*30)
print(categorical_have_missing.count())
Drop FireplaceQu, which has a very large number of missing values.
For the remaining variables, impute the missing values with the mode using SimpleImputer.
Tip) SimpleImputer's strategy option chooses the fill value: 'mean', 'median', 'most_frequent', or 'constant'
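A tiny illustration of how the strategy option changes the filled value (toy data, not from this dataset):
toy = np.array([[1.0], [2.0], [2.0], [np.nan]])
print(SimpleImputer(strategy='mean').fit_transform(toy).ravel())           # NaN -> 1.666...
print(SimpleImputer(strategy='median').fit_transform(toy).ravel())         # NaN -> 2.0
print(SimpleImputer(strategy='most_frequent').fit_transform(toy).ravel())  # NaN -> 2.0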
train_data = train_data.drop(['FireplaceQu'], axis=1)
categorical_have_missing = categorical_have_missing.drop(['FireplaceQu'], axis=1)
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
for feature in categorical_have_missing:
    categorical_have_missing[feature] = imputer.fit_transform(categorical_have_missing[feature].values.reshape((-1,1)))
    train_data[feature] = imputer.fit_transform(train_data[feature].values.reshape((-1,1)))
plt.scatter(train_data['GrLivArea'], train_data['SalePrice'])
plt.show()
Check the indices of the outliers and then drop them from the train data
train_data[ (train_data['GrLivArea'] > 4000) & (train_data['SalePrice'] < 200000)].index
train_data['Id'] = pd.Series(train_data.index)
train_data = train_data.drop( train_data[ (train_data['Id'] == 1298) | (train_data['Id'] == 523) ].index)
# Delete Id again
train_data = train_data.drop(['Id'], axis=1)
One-hot encode the categorical variables with get_dummies
train_data = pd.get_dummies(train_data)
sns.distplot(train_data['SalePrice'], fit=norm)
fig = plt.figure()
res = stats.probplot(train_data['SalePrice'], plot=plt)
SalePrice does not appear to follow a normal distribution, so apply a log transform to address this.
train_data['SalePrice'] = np.log(train_data['SalePrice'])
sns.distplot(train_data['SalePrice'], fit=norm)
fig = plt.figure()
res = stats.probplot(train_data['SalePrice'], plot=plt)
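Note that the target is now log(SalePrice), so model predictions are also on the log scale and need np.exp to be read as prices again; a quick check on the first row:
example_log_price = train_data['SalePrice'].iloc[0]
print(example_log_price, np.exp(example_log_price))  # log-scale value and the corresponding price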
target = train_data['SalePrice']
train_data = train_data.drop(['SalePrice'], axis=1)
X, y = train_data, target
LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X, y)
pred = lin_reg.predict(X)
print(lin_reg.score(X,y))
np.sqrt(mean_squared_log_error(pred,y))
RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(X, y)
pred = forest_reg.predict(X)
print(forest_reg.score(X, y))
np.sqrt(mean_squared_log_error(pred, y))
XGBRegressor
xg_reg = xg.XGBRegressor(objective='reg:linear', n_estimators=300, seed=123)
xg_reg.fit(X, y)
pred = xg_reg.predict(X)
print(xg_reg.score(X, y))
np.sqrt(mean_squared_log_error(pred, y))
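All three scores above are computed on the same rows the models were fit on, so they are optimistic. train_test_split is imported at the top but never used; a minimal held-out evaluation sketch (the 20% split and random_state are arbitrary choices, reusing the XGBRegressor settings above):
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
held_out_model = xg.XGBRegressor(n_estimators=300, random_state=123)
held_out_model.fit(X_train, y_train)
valid_pred = held_out_model.predict(X_valid)
print(held_out_model.score(X_valid, y_valid))                # R^2 on the unseen 20%
print(np.sqrt(mean_squared_log_error(y_valid, valid_pred)))  # RMSLE on the unseen 20%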