
Government-Funded Data Analysis Course, Round 3 _ Service 1 _ Modeling

I originally planned to build this service as a regression model, and I ran it that way first, but since score is a binary 0/1 target, I switched to classification and decided to combine the final models with voting or stacking instead.

 

Since I wasn't sure which machine learning algorithm would fit best, I just imported everything.

# I didn't know what I'd end up using, so I imported everything
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import sklearn
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from xgboost import plot_tree, plot_importance
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve, precision_score, recall_score, f1_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import Sequential, Input
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Dense, Dropout, SeparableConv2D, Activation, BatchNormalization, Flatten, GlobalAveragePooling2D, Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.preprocessing.image import ImageDataGenerator as IDG

import librosa
import librosa.display
import IPython.display as ipd

import eli5
from eli5.sklearn import PermutationImportance

import pickle

import os
import warnings
warnings.filterwarnings('ignore')

I loaded the data and converted main_instr to 0/1.

df3['main_instr'] = df3['main_instr'].apply(lambda x : 1 if x==4 else 0)

I dropped the unnecessary columns and one-hot encoded the categorical features.

Normalization was skipped for the reasons explained in the previous post.

df3 = pd.get_dummies(df3,columns=['genres','instr_re','gender'])

I computed the correlations between the features and looked at them as a heatmap (the figure isn't reproduced in this post).
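A minimal sketch of that step with seaborn, assuming df3 is the dataframe prepared above:

# Correlation heatmap over df3's numeric and one-hot features
plt.figure(figsize=(12, 10))
sns.heatmap(df3.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.show()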

I split the data and wrote a helper function to fit and evaluate each model in one call.

y = df3['score']
X = df3.loc[:, df3.columns != 'score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

def modelAss(model):
    # Fit the model, then print train/test precision, recall, F1, and accuracy
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    preds = model.predict(X_test)

    print('Train Precision:', round(precision_score(y_train, train_pred), 5))
    print('Precision:', round(precision_score(y_test, preds), 5))

    print('Train Recall:', round(recall_score(y_train, train_pred), 5))
    print('Recall:', round(recall_score(y_test, preds), 5))

    print('F1 score:', round(f1_score(y_test, preds), 5))

    print('Train Accuracy:', round(accuracy_score(y_train, train_pred), 5))
    print('Test Accuracy:', round(accuracy_score(y_test, preds), 5), '\n')
    return preds

nb = GaussianNB()
nb_pred = modelAss(nb)

sgd = SGDClassifier(max_iter=5000, random_state=0)
sgd_pred = modelAss(sgd)

knn = KNeighborsClassifier(n_neighbors=19)
knn_pred = modelAss(knn)

tree = DecisionTreeClassifier()
tree_pred = modelAss(tree)

rforest = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)
rforest_pred = modelAss(rforest)

svm = SVC(decision_function_shape="ovo")
svm_pred = modelAss(svm)

lg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
lg_pred = modelAss(lg)

nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5000, 10), random_state=1)
nn_pred = modelAss(nn)

ada = AdaBoostClassifier(n_estimators=1000)
ada_pred = modelAss(ada)

For the models that came out reasonably well, I tuned and checked their hyperparameters.
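The post doesn't show that sweep, so here's a minimal sketch with GridSearchCV; the grid values below are illustrative, not the ones actually tried:

from sklearn.model_selection import GridSearchCV

# Hypothetical grid for the random forest; the values actually swept aren't recorded here
param_grid = {'n_estimators': [100, 500, 1000], 'max_depth': [4, 6, 10]}
grid = GridSearchCV(RandomForestClassifier(random_state=0), param_grid, scoring='f1', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)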

The random forest also did well, so I checked which features carried the most weight.

perm = PermutationImportance(estimator=rforest, random_state=1)
perm.fit(X_test, y_test)

eli5.show_weights(estimator=perm, feature_names = X_test.columns.tolist())

The result was similar to the logistic regression classification done earlier.

But since the dataset is small and I couldn't settle on a single best model, I decided to finish by stacking three models.

base_models = [('rf', RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0)), 
               ('gnb', GaussianNB()), ('lr',LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial'))]
meta_model = DecisionTreeClassifier(max_depth=8)
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model)
stacking_classifier.fit(X_train, y_train)
score = stacking_classifier.score(X_test, y_test)
print(f'Stacking classifier test accuracy: {score:.2f}')
print(f'Stacking classifier train accuracy: {stacking_classifier.score(X_train, y_train):.2f}')

There may be a bit of overfitting, but overall the result came out reasonably solid.
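One way to sanity-check that train/test gap is k-fold cross-validation; a minimal sketch, not part of the original workflow:

from sklearn.model_selection import cross_val_score

# 5-fold CV accuracy of the stacking classifier over the full feature matrix
cv_scores = cross_val_score(stacking_classifier, X, y, cv=5, scoring='accuracy')
print(cv_scores.mean(), cv_scores.std())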

 

Because the dataset is small, I took the whole dataset and retrained on it.

df_all_shuffle = sklearn.utils.shuffle(df3)
y_train = df_all_shuffle['score']
X_train = df_all_shuffle.loc[:, df_all_shuffle.columns != 'score']

def modelAssTotal(model):
    # Fit on the full shuffled dataset and report training accuracy only
    model.fit(X_train, y_train)
    preds = model.predict(X_train)
    print('Train Accuracy:', round(accuracy_score(y_train, preds), 5))
    return model

Then I built each model and saved it with joblib.

First, check whether the folder exists and create it if needed,

import os

path = './230425_joblib/service_1'

os.makedirs(path, exist_ok=True)

then save the models.

from joblib import dump

dump(stacking_classifier, f'{path}/service_1_spotify_anlys_stacking_all.pkl')

# step 1, GNB
nb = GaussianNB()
nb_model = modelAssTotal(nb)
dump(nb_model, f'{path}/service_1_spotify_anlys_gnb.pkl')

# step 2, RFC
rforest = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0)
rforest_model = modelAssTotal(rforest)
dump(rforest_model, f'{path}/service_1_spotify_anlys_rfc.pkl')

# step 3, LR
lg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
lg_model = modelAssTotal(lg)
dump(lg_model, f'{path}/service_1_spotify_anlys_lg.pkl')
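For reference, the individually saved models can be loaded back the same way; a sketch, since how the final service recombines them isn't shown in this post:

from joblib import load

nb_loaded = load(f'{path}/service_1_spotify_anlys_gnb.pkl')
rforest_loaded = load(f'{path}/service_1_spotify_anlys_rfc.pkl')
lg_loaded = load(f'{path}/service_1_spotify_anlys_lg.pkl')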

I then tested the model.

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth

# Set up the Spotify API client
# client_id / client_secret and track_name / artist_name are assumed to be defined beforehand
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

song = sp.search(q=f'track:{track_name} artist:{artist_name}', type='track')
artist = sp.search(q=f'artist:{artist_name}', type='artist')

new_dict_sp = {}
popularity_track = song['tracks']['items'][0]['popularity']
popularity_artist = artist['artists']['items'][0]['popularity']
genres = artist['artists']['items'][0]['genres']

# genresC: genre-cleaning helper from an earlier post (not shown here)
genres = genresC(genres)

def changeGenres(x):
    # Map a raw genre string to one of the coarse categories used in training
    # (checked in order; first match wins)
    keyword_map = [
        ('ect', 'ect'), ('indie', 'indie'), ('world', 'world'),
        ('rock', 'rock'), ('funk', 'funk'), ('retro', 'retro'),
        ('dance', 'dance'), ('media', 'media'), ('relax', 'relax'),
        ('edm', 'edm'), ('house', 'edm'), ('syns', 'edm'),
        ('black', 'black'), ('rnb', 'black'), ('country', 'country'),
        ('newage', 'newage'), ('rap', 'rap'),
    ]
    for keyword, category in keyword_map:
        if keyword in x:
            return category
    # Any tag not covered above is simply treated as pop
    return 'pop'
        
genres = changeGenres(genres)

# Give the user categories to choose from
# _these values will eventually be selected directly by the user
gender = 1
instr_re = 2
main_instr = 1

new_dict_sp = {
    'popularity_track':popularity_track,
    'popularity_artist':popularity_artist,
    'genres':genres,
    'gender':gender,
    'instr_re':instr_re,
    'main_instr':main_instr
}

new_list_sp = []
new_list_sp.append(new_dict_sp)

new_df2 = pd.DataFrame(new_list_sp)

features = sp.audio_features(tracks=[song['tracks']['items'][0]['id']])
new_df = pd.DataFrame(features)[['danceability', 'energy', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
      'duration_ms']]

new_df2 = pd.concat([new_df,new_df2],axis=1)

new_df2['tempo'] = new_df2['tempo'].astype(int)

def changeTempo(x):
    # For rap tracks, halve the tempo to match the half_tempo_check feature used in training
    if x[0] == 'rap':
        return x[1] / 2
    else:
        return x[1]

new_df2['half_tempo_check'] = new_df2[['genres','tempo']].apply(changeTempo, axis=1)
new_df2 = new_df2.drop(columns=['tempo'])
new_df2 = pd.get_dummies(new_df2,columns=['genres','instr_re','gender'])
# Align columns with the training data (spotify_columns holds the training dataframe's column list)
new_df2 = new_df2.reindex(columns=spotify_columns, fill_value=0).drop(columns=['score'])

import joblib
model = joblib.load('./230425_joblib/service_1/service_1_spotify_anlys_stacking_all.pkl')

model.predict(new_df2)
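Since the meta-model is a decision tree, the loaded stacking classifier can also return class probabilities rather than a hard 0/1, which is handier for a service; a small usage sketch:

# Probability of each class (0 and 1) for the new track
proba = model.predict_proba(new_df2)
print(proba)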

Perhaps because Spotify keeps its data so well organized, this model showed fairly decent performance.