AI Machine-Learning/Deep-Learning
1. Library import
import pandas as pd
import numpy as np
2. Library import
import matplot.pyplot as plt
import seaborn as sns
3. Data Load
df = pd.read_csv(‘filename.csv’)
df = pd.read_json(A01.json)
df = pd.DataFrame(data)
df = pd.read_excel('data.xls’)
tips = sns.load_dataset("tips")
4. Visualization
sns.pairplot(df, x_vars[‘col1’,’col2’], y_vars=‘col3’)
sns.histplot(df,bins=30,kde=True)
sns.scatterplot(data=df,x=‘col1’, y=‘col2’)
data = df.corr()
plt.figure(figsize=(8, 5))
sns.heatmap(data=df,annot=True) plt.show()
5. Delete outlier & missing values
df=df.drop(df[df['size']>100].index,axis=0)
tips = tips.drop(tips[tips['size']==6].index, axis=0)
df.drop('col',axis=1,inplace=True)
tips=tips.drop(['sex','smoker'],axis=1)
df = df.dropna(axis=0)
6. Data preparation
df.info() / head()
df.shape
df.isnull().sum() / df.isna().sum() # 동일
df.describe()
df['col'].value_counts()
df['col'].replace(' ', '0', inplace=True)
tips = tips.head(10)
7. Label Encoder
from sklearn.preprocessing import LabelEncoder
tips['sex'] = le.fit_transform(tips['sex'])
labels = ['col1', 'col2', 'col3']
for i in labels:
le = LabelEncoder()
le = le.fit(price[i])
price[i] = le.transform(price[i])
8. One-Hot Encoder
df=pd.get_dummies(df,columns=['col'],drop_first=True)
tips=pd.get_dummies(tips,columns=['day'],drop_first=True)
9. Data Split
from sklearn.model_selection import train_test_split
X = tips.drop('total_bill',axis=1)
y = tips['total_bill’] # series
X_train,X_valid,y_train, y_valid = train_test_split(X,y, random_state=58,test_size=0.2)
10. Standardization/Normalization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()
train_minmax = minmax_scaler.fit_transform(X_train)
train_minmax = pd.DataFrame(train_minmax, index=X_train.index, columns=X_train.columns)
11. Machine Learning/Random Forest
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42, n_estimators=70, max_depth=12, min_samples_split=3, min_samples_leaf=2)
rf_model.fit(X_train_scaled,y_train)
12. MSE/MAE
y_pred = rf_model.predict(X_valid)
// MSE 계산
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_valid, y_pred)
// MAE 계산
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_valid, y_pred)
#### 13. Deep Learning Model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(6,))) model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1)) // 출력 레이어
// Loss Function & Optimization Setting
model.compile(loss='mean_squared_error', optimizer='adam')
// EarlyStopping Callback
estopping = EarlyStopping(monitor='val_loss', patience=5)
// ModelCheckpoint Callback
mcheckpoint = ModelCheckpoint('AI_best_model.h5', monitor='val_loss', save_best_only=True)
// Training the model
history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=100, callbacks=[estopping, mcheckpoint])
#### 14. Model evaluation
import matplotlib.pyplot as plt
// 학습 MSE와 검증 MSE 추출
train_mse = history.history['loss']
valid_mse = history.history['val_loss']
// Graph
plt.plot(train_mse, label='mse')
plt.plot(valid_mse, label='val_mse')
// Title, etc
plt.legend()
plt.title('Model MSE')
plt.xlabel('Epochs')
plt.ylabel('MSE')
// Show the graph
plt.show()
1. 필요한 라이브러리 설치 및 임포트
#가장 많이 사용하는 라이브러리 설치 - 별칭(alias) 사용
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings(action='ignore')
#seaborn import시 에러가 나타나면 해당 코드 실행
!pip install seaborn
#AIDU 내부 연동을 위한 라이브러리 & 변수
from aicentro.session import Session
from aicentro.framework.keras import Keras as AiduFrm
aidu_session = Session(verify=False)
aidu_framework = AiduFrm(session=aidu_session)
2. 데이터 로드
df = pd.read_csv('파일경로')
df = pd.read_csv('/data/데이터파일이름.csv')
df = pd.read_csv("https://raw.githubusercontent.com/DA4BAM/dataset/master/mobile_NA2.csv")
3. 데이터 분석(구성확인, 상관분석, 시각화)
#데이터 전체 구조 보기
df
#데이터 정보 확인
df.info()
#데이터 (행,열) 크기 확인
df.shape
#데이터 전체 컬럼명 보기
df.columns
#데이터 상위/하위 5행 출력
df.head()
df.tail()
#데이터 통계 보기
df.describe()
#중앙값(숫자형)
df['INCOME'].median()
df.median()
#컬럼 내 각각의 값 분포 -> 제일 위 값이 최빈값
df['INCOME'].value_counts()
#전체
#[df[c].value_counts() for c in df]
#특정 컬럼 내 각각의 값 분포 비율
df['REPORTED_SATISFACTION'].value_counts(normalize=True)
#특정 컬럼 내 유일한 값 확인
df['REPORTED_SATISFACTION'].unique()
#데이터 결측치(null값) 확인
df.isna().sum()
#데이터 타입 확인
df.dtypes
#두 변수간 상관 관계 분석
df.corr()
# 레이블 선택
y = df['OVERAGE']
y
### 시각화 ###
import matplotlib.pyplot as plt
pip install seaborn
import seaborn as sns
%matplotlib inline
#차트 그리기
plt.title('Accuracy')
plt.plot(hist.history['acc'], label='acc')
plt.plot(hist.history['val_acc'], label='val_acc')
plt.legend()
plt.xlabel('Epochs')
plt.ylabel('Acc')
plt.show()
#산점도(Scatter plot)
plt.scatter(x, y)
#막대 그래프 그리기
plt.bar(x, y)
#히스토그램 그리기
plt.hist(values)
#히트맵(Heatmap)
sns.heatmap(df.corr(), cmap="Blues", annot=True)
sns.heatmap(df.corr())
#박스 플롯(Box plot)
plt.boxplot(df['HANDSET_PRICE'])
plt.show()
sns.boxplot(y='AVERAGE_CALL_DURATION', x='CHURN', data=df)
#pairplot
sns.pairplot(data=df, x_vars=['컬럼', '컬럼', '컬럼'], y_vars=['컬럼', '컬럼', '컬럼'])
4. 데이터 전처리 (결측치 처리, 라벨인코딩 등)
#특정 컬럼 삭제 -> axis=0 행 삭제 / axis=1 열 삭제
df1 = df.drop(['id', 'COLLEGE', 'LEFTOVER'], axis=1)
#값 변경 -> 인자(변경 전 값, 변경 후 값, inplace=True)
df1['REPORTED_USAGE_LEVEL'].replace('avg', 'middle', inplace=True)
#특정 값이 있는 행만 가져오기
df1[df1['OVERAGE'] == 10]
#특정 값의 개수 확인
(df['REPORTED_SATISFACTION'] == 'very_unsat').sum()
#전체 값의 개수 확인
df1.apply(lambda x: x.isin([0]).sum())
## 결측치 처리 ##
#데이터 결측치(null값) 확인
df.isna().sum()
#결측치 중간값으로 채우기 -> mean : 평균, mode : 최빈값
df['HOUSE'].fillna(df['HOUSE'].mean, inplace=True)
#결측치 삭제하기
df1 = df1.dropna()
## 이상치 처리 ##
#이상치 데이터 확인
sns.boxplot(x='CHURN', y='LEFTOVER', data=df)
## 라벨 인코딩 ##
#데이터 복사
df1_copy = df.copy()
#데이터 타입 변경
df1['SeniorCitizen'] = df1['SeniorCitizen'].astype(float)
#특정 데이터 타입의 컬럼 선택
c = df1.select_dtypes(include='object')
c.dtypes
#문자를 숫자로 변경(라벨 인코딩)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['REPORTED_USAGE_LEVEL'] = le.fit_transform(df['REPORTED_USAGE_LEVEL'])
df['REPORTED_USAGE_LEVEL'] = df['REPORTED_USAGE_LEVEL'].astype('float')
#문자를 숫자로 변경(라벨 인코딩)
from sklearn.preprocessing import LabelEncoder
tips['sex'] = le.fit_transform(tips['sex'])
labels = ['col1', 'col2', 'col3']
for i in labels:
le = LabelEncoder()
le = le.fit(price[i])
price[i] = le.transform(price[i])
## 원-핫 인코딩 ##
# (1) 문자를 숫자로 변경 (원-핫 인코딩)
# drop_first=True 첫번째 카테고리(인코딩 데이터) 삭제
cols = df.select_dtypes('object').columns.tolist()
df = pd.get_dummies(columns=cols, data=df, drop_first=True)
# (2)
df=pd.get_dummies(df,columns=['col'],drop_first=True)
tips=pd.get_dummies(tips,columns=['day'],drop_first=True)
5. 데이터 분리 (x, y)
# (1)
from sklearn.model_selection import train_test_split
X = tips.drop('total_bill',axis=1)
y = tips['total_bill’] # series
X_train,X_valid,y_train, y_valid = train_test_split(X,y, random_state=58,test_size=0.2)
# (2) Feature(X), Target(Y) 분리. 학습데이터(train set)와 검증데이터(test set)로 분리
target = 'CHURN'
x = df.drop(target, axis=1)
y = df[target]
from sklearn.model_selection import train_test_split
#test_size는 원래 데이터(Y)의 분포 비율
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2023)
6. 데이터 스케일링 (정규화)
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
#정규화 : 최대값 1, 최소값 0
scaler = MinMaxScaler()
#표준화 : 평균값 0, 표준편차 1
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
7. 머신러닝
### 회귀 ###
# linear회귀
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_valid, y_pred)
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_valid, y_pred)
#로지스틱 회귀
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_valid, y_pred)
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_valid, y_pred)
### 분류 ###
#의사결정트리
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
#랜덤포레스트
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
8. 딥러닝
import tensorflow as tf
from tensorflow.keras.backend import clear_session
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
#데이터의 행, 열 개수 찾기
x_train.shape
#이진 분류 모델 생성
clear_session()
model = Sequential([
Input(shape=(18,)), #input shape : 입력데이터의 shape(열의 개수) 반드시 명시
Dense(64, activation='relu'),
Dropout(0.2),
Dense(32, activation='relu'),
Dropout(0.2),
Dense(16, activation='relu'),
Dropout(0.2),
Dense(1, activation='sigmoid')
#다중 분류
#Dense('최종output 레이어 개수', activation='softmax')
])
model.summary()
#모델 컴파일 optimizer 설정 -> loss:손실함수, metrics:평가기준
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
#다중분류
#model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc']) #원핫인코딩된 경우
#model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc']) #원핫인코딩 안된 경우
#회귀
#model.compile(optimizer='adam', loss='mse', metrics=['mse','mae'])
#callback 함수 설정
es = EarlyStopping(monitor='val_loss', patience=4, mode='min', verbose=1)
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)
#학습하기
hist = model.fit(x_train, y_train,
batch_size=32,
epochs=100,
callbacks=[es, mc],
validation_data=(x_test, y_test),
verbose=1)
#Accurracy 그래프 그리기
plt.title('Accuracy')
plt.plot(hist.history['acc'], label='acc')
plt.plot(hist.history['val_acc'], label='val_acc')
plt.legend()
plt.xlabel('Epochs')
plt.ylabel('Acc')
plt.show()
#Loss 그래프 그리기
plt.title('Loss')
plt.plot(hist.history['loss'], label='loss')
plt.plot(hist.history['val_loss'], label='val_loss')
plt.legend()
plt.xlabel('Epochs')
plt.ylabel('Acc')
plt.show()
#두개 다 합친 그래프
plt.title('Accuracy')
plt.plot(hist.history['acc'], label='acc')
plt.plot(hist.history['val_acc'], label='val_acc')
plt.plot(hist.history['loss'], label='loss')
plt.plot(hist.history['val_loss'], label='val_loss')
plt.legend()
plt.xlabel('Epochs')
plt.ylabel('Acc')
plt.show()
9. 모델 성능평가
y_pred = model.predict(x_test)
#Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
#Precision(정밀도)
from sklearn.metrics import precision_score
ps = precision_score(y_test, y_pred, pos_label=1)
print('Precision(정밀도): %.4f' % ps)
#Recall(정밀도)
from sklearn.metrics import recall_score
rs = recall_score(y_test, y_pred, pos_label=1)
print('Recall(재현율): %.4f' % rs)
#F1 Score
from sklearn.metrics import f1_score
fs = f1_score(y_test, y_pred, pos_label=1)
print('F1 Score: %.4f' % fs)
#Accuracy(정확도)
from sklearn.metrics import accuracy_score
accs = accuracy_score(y_test, y_pred, pos_label=1)
print('Accuracy(정확도): %.4f' % accs)
#Classification Report(평가지표-Precision, Recall, F1 한 번에 출력)
from sklearn.metrics import classification_report
cr = classification_report(y_test, y_pred, target_names=['class 0', 'class 1'])
print(cr)
다중분류
import pandasz as pd
drugs = pd.read_csv('drug200.csv')
# lineplot 그래프 생성
sns.lineplot(x='Age', y='Na_to_K', data=drugs)
# 그래프에 라벨 추가
plt.xlabel('Age')
plt.ylabel('Na_to_K')
# 그래프 출력
plt.show()
sns.boxplot(y='Na_to_K', data=drugs)
# 그래프 출력
plt.show()
new_drugs=drugs.drop(drugs[drugs['Age']>55].index, axis=0)
new_drugs=new_drugs.drop(new_drugs[new_drugs['Na_to_K']>30].index,axis=0)
new_drugs = drugs.drop(drugs[(drugs['Age'] > 55) | (drugs['Na_to_K'] > 30)].index)
#new_drugs.isnull().sum()
new_drugs = new_drugs.fillna(0)
#new_drugs.isnull().sum()
#new_drugs.info()
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
new_drugs['BP'] = encoder.fit_transform(new_drugs['BP'])
new_drugs['Cholesterol'] = encoder.fit_transform(new_drugs['Cholesterol'])
new_drugs['Drug'] = encoder.fit_transform(new_drugs['Drug'])
label_drugs = new_drugs
from sklearn.preprocessing import LabelEncoder
label_drugs = new_drugs.copy()
le = LabelEncoder()
labels = ['BP', 'Cholesterol', 'Drug']
for i in labels:
le = le.fit(new_drugs[i])
label_drugs[i] = le.transform(new_drugs[i])
plt.figure(figsize=(8, 8))
sns.heatmap(label_drugs.corr(), cmap="Oranges", annot=True)
plt.show()
cols = label_drugs.select_dtypes('object').columns.tolist()
drugs_preset = pd.get_dummies(columns=cols, data=label_drugs, drop_first=True)
print(drugs_preset)
from sklearn.model_selection import train_test_split
X = drugs_preset.drop('Drug',axis=1)
y = drugs_preset['Drug']
X_train,X_valid,y_train, y_valid = train_test_split(X, y, random_state=42,test_size=0.2)
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=30, max_depth=9, min_samples_split=3, min_samples_leaf=2, random_state=42)
rf_model.fit(X_train, y_train)
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(max_depth=5, min_samples_split=2, min_samples_leaf=1, random_state=41)
dt_model.fit(X_train, y_train)
from sklearn.metrics import accuracy_score, f1_score
y_pred = rf_model.predict(X_valid)
rfr_acc = accuracy_score(y_valid, y_pred)
rfr_f1 = f1_score(y_valid, y_pred, average='weighted')
print('accuracy:',rfr_acc)
print('f1-score:',rfr_f1)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
num_classes=5
# 모델 설정
model = Sequential()
model.add(Dense(256, input_dim=5, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax')) # num_classes는 분류하고자 하는 클래스의 수를 나타냅니다.
# 모델 컴파일
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
# 콜백 설정
estopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=100)
mcheckpoint = ModelCheckpoint('AI_best_model.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=1)
# 모델 학습
history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=1000, batch_size=64, callbacks=[estopping, mcheckpoint])
import matplotlib.pyplot as plt
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model acc')# 제목
plt.ylabel('Score') # score y축표시
plt.xlabel('Epochs') # score x축표시
plt.legend(['Train Accuracy', 'Validation Accuracy'], loc='lower right') # 범례 표시
plt.show()
Regression Random Forest
players = pd.read_csv("top5_leagues_player.csv")
sns.pairplot(data=players, y_vars="price", x_vars = ['age', 'shirt_nr', 'foot', 'league'])
new_players = players.drop(players[players['age']>35].index)
new_players = new_players.drop(new_players[new_players['shirt_nr']>50].index)
new_players
#new_players.info()
outlier = players[(players['age']>35) | (players['shirt_nr']>50)].index
new_players = players.drop(outlier)
new_players
#new_players.info()
plt.figure(figsize=(10, 10))
sns.heatmap(new_players.corr(), annot=True, cmap='coolwarm')
plt.show()
new_players.info()
clean_players = new_players.drop(['Unnamed: 0', 'name', 'full_name'], axis=1)
clean_players.info()
#clean_players.isnull().sum()
clean_players = clean_players.dropna()
#clean_players.isnull().sum()
clean_players.isnull().sum()
clean_players = clean_players.dropna()
#clean_players.info()
clean_players.isnull().sum()
clean_players.dropna(inplace=True)
from sklearn.preprocessing import LabelEncoder
label_players = clean_players.copy()
cols= ['nationality', 'place_of_birth', 'position', 'outfitter', 'club', 'player_agent', 'foot', 'joined_club']
le = LabelEncoder()
for col in cols:
label_players[col] = le.fit_transform(label_players[col])
label_players.head()
from sklearn.preprocessing import LabelEncoder
label_players = clean_players.copy()
le = LabelEncoder()
label_players['nationality'] = le.fit_transform(label_players['nationality'])
label_players['position'] = le.fit_transform(label_players['position'])
label_players['outfitter'] = le.fit_transform(label_players['outfitter'])
label_players.head()
players_preset = pd.get_dummies(columns=['contract_expires', 'league'], data=label_players, drop_first=True)
players_preset.info()
from sklearn.model_selection import train_test_split
y = players_preset['price']
X = players_preset.drop("price", axis=1)
X_train, X_valid, y_train,y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled= scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=30, max_depth=9, random_state=42, min_samples_leaf=2, min_samples_split=3)
rf_model.fit(X_train, y_train)
from sklearn.metrics import mean_squared_error,mean_absolute_error
y_pred = rf_model.predict(X_valid)
rfr_mae = mean_absolute_error(y_valid, y_pred)
rfr_mse = mean_squared_error(y_valid, y_pred)
print(rfr_mae, rfr_mse)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dropout(rate=0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(rate=0.2))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
estopping = EarlyStopping(monitor='val_loss')
mcheckpoint = ModelCheckpoint(monitor='val_loss', filepath='AI_best_model.h5', save_best_only=True)
history = model.fit(X_train_scaled, y_train, epochs=100, validation_data = (X_valid_scaled, y_valid), callbacks=[estopping,mcheckpoint])
plt.plot(history.history['loss'], 'y', label = 'mse')
plt.plot(history.history['val_loss'], 'r', label = 'val_mse')
plt.title("Model MSE")
plt.xlabel('Epochs')
plt.ylabel('MSE')
plt.legend()
plt.show()