네이버 옷 크롤링 코드 연습

@ 프로젝트 2안을 대비해 옷 긁는 크롤링 코드로 작업

1. 설치

env 만들고 미리 설치

tensorflow numpy pandas jupyter notebook lab seaborn matplotlib tqdm 설치
conda install

주피터 노트북 내에 가상환경 인식
conda install ipykernel
python -m ipykernel install --user --name env이름 --display-name env이름

pip 추가로 더 설치

!pip install selenium
!pip install beautifulsoup4

셀레니움 창 띄우기,

직접 코딩으로 자동화하기보다는, 어차피 몇 페이지만 크롤링할 예정이고 끝을 못 찾아서 여기서 일단 멈춤

from selenium import webdriver

# Start a Chrome session driven by Selenium; `wd` is the driver object the
# crawling functions in this file call `find_elements` / `execute_script` on.
wd = webdriver.Chrome()
# Naver Shopping brand-fashion category page that the browser is pointed at.
path1 = 'https://shopping.naver.com/window/brand-fashion/category'
wd.get(path1)
# Resize the browser window (arguments are width, height).
wd.set_window_size(200,600)

필요할 것 같은 라이브러리 불러오기

from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup as bs
import requests
import time
import os
import urllib.request
import re
import pickle
from collections import defaultdict
import json

일단 기본 폴더 생성

# Create the base output folder. exist_ok=True lets this cell be re-run
# without raising FileExistsError when the folder is already there.
os.makedirs('new_project3_cloth', exist_ok=True)

폴더 생성/ 경로 지정 함수 만들기

def makedir(category, li_category):
    """Create (if needed) and return the folder for a category / sub-category.

    Args:
        category: Top-level category name; becomes a folder directly under
            ``new_project3_cloth/``.
        li_category: Sub-category name; becomes a folder under ``category``.

    Returns:
        The relative path ``new_project3_cloth/<category>/<li_category>``.
    """
    li_path = 'new_project3_cloth/' + category + '/' + li_category
    # makedirs also creates any missing parent (including the base folder),
    # and exist_ok=True makes the call a no-op when the folder already exists.
    os.makedirs(li_path, exist_ok=True)
    return li_path

색 레이블 해줄 함수 만들기

def colorReal():
    """Look up the colour-set elements on the current page.

    Uses the module-level Selenium driver ``wd``.

    Returns:
        A one-element list. The element is the list of matched elements, or
        ``0`` when the selector matched nothing.
    """
    color_set = '.basicProductCardResponsive_color_set__NyqNb'
    basic_color = '.basicProductCardResponsive_color_set__NyqNb'
    # NOTE(review): both selectors are the same class, so the compound
    # selector just repeats it — confirm whether `basic_color` was meant to
    # target a different (child) class.
    real_color = wd.find_elements(By.CSS_SELECTOR, color_set + basic_color)
    # Branch on the lookup result. The previous check tested the selector
    # string itself, which is always truthy, so the fallback never ran.
    color_list = [real_color if real_color else 0]
    print(color_list)
    return color_list

크롤링 역할을 할 함수 만들기

_img 저장, url, 이름, 몇 번째인지 정보를 json으로 만듦

def doCrawling(cnt, cnt_num, allDict, li_path, allList, color_list=None):
    """Download every product thumbnail on the page and write a JSON label for each.

    Uses the module-level Selenium driver ``wd`` to collect thumbnail
    ``src`` values, product titles and product links, then for each thumbnail
    saves the image and calls ``saveList`` to write a matching ``.json`` file
    into the same folder.

    Args:
        cnt: Pass counter; used in file names and in the ``num`` label.
        cnt_num: Running count of saved images; incremented once per image.
        allDict: Returned unchanged when no thumbnails are found; otherwise
            replaced by the label dict of the last image processed.
        li_path: Folder that receives both the images and the JSON labels.
        allList: Passed through and returned untouched.
        color_list: Optional per-image colour values; when truthy,
            ``color_list[i]`` is stored under the ``color`` key.

    Returns:
        ``(cnt + 1, allDict, cnt_num, allList)``.

    Raises:
        IndexError: If fewer titles or links than thumbnails were found.
    """
    class_name = '.basicProductCardResponsive_thumbnail__LF1RZ'
    class_korean_name = '.basicProductCardResponsive_title__gM6mt'
    class_url = '.basicProductCardResponsive_product_card_inner__05Sdr'

    imgs = wd.find_elements(By.CSS_SELECTOR, class_name + ' > img')
    korean_name = wd.find_elements(By.CSS_SELECTOR, class_korean_name)
    urls = wd.find_elements(By.CSS_SELECTOR, class_url + ' > a')

    img_all = [img.get_attribute('src') for img in imgs]
    url_all = [url.get_attribute('href') for url in urls]

    for i, image in enumerate(img_all):
        # Fresh dict per image so each JSON file holds only its own label.
        allDict = {}
        # Same sanitising rule as saveList, so image and label share a stem.
        safe_name = re.sub('[./|!]', '_', korean_name[i].text)
        urllib.request.urlretrieve(image, f"./{li_path}/파일명_{cnt}{i}_{safe_name}.jpg")

        cnt_num += 1

        allDict['num'] = 10 * cnt + i
        allDict['url'] = url_all[i]
        allDict['name'] = korean_name[i].text
        if color_list:
            allDict['color'] = color_list[i]
        # Write the label next to the image. The earlier code used an
        # undefined local `folder_path` here, which only worked when a
        # module-level variable of that name happened to exist.
        saveList(cnt, allDict, li_path, i, korean_name)
    cnt += 1
    return cnt, allDict, cnt_num, allList

dict의 이름이 주소로 계속 초기화되기 때문에 같은 이름으로 저장할 방법을 고민하다가, 그냥 하나하나 json 파일을 만들어 관리하기로 결정함

json label 기록할 함수 만들기

def saveList(cnt, allDict, folder_path, index, korean_name):
    """Write one label dict to its own JSON file inside ``folder_path``.

    The file name is built from ``cnt``, ``index`` and the ``.text`` of
    ``korean_name[index]``, with ``.``, ``/``, ``|`` and ``!`` replaced by
    ``_``.

    Args:
        cnt: Pass counter, used as the first part of the file name.
        allDict: JSON-serialisable label data to write.
        folder_path: Existing folder that receives the file.
        index: Position of the item; also selects the title to use.
        korean_name: Sequence of objects exposing a ``.text`` attribute.
    """
    title = korean_name[index].text
    safe_title = re.sub('[./|!]', '_', title)
    target = f"{folder_path}/파일명_{cnt}{index}_{safe_title}.json"
    with open(target, "w") as out:
        json.dump(allDict, out)

중간중간 크롤링 오류가 나거나, 네이버에서 실시간으로 막아 버려서 label이 날아가는 현상을 막고자 일부 사진이 모이면 저장되게끔 해주는 함수 생성

_ 앞부분에 예외 처리를 제대로 안 해줘서, 재시작 시 일부 오류가 있음. 굳이 해결할 필요를 못 느껴 일단 놔둠

def folderNum(cnt_num, category, li_category, folder_num, allList):
    """Ensure a numbered batch folder exists, rolling over to a new one at 100 images.

    The folder ``new_project3_cloth/<category>/<li_category>/<folder_num>``
    is created if missing. When ``cnt_num`` has reached 100, the function
    then searches upward from ``folder_num`` for the first number with no
    folder yet, creates that folder and starts a fresh batch.

    Args:
        cnt_num: Number of images saved so far.
        category: Top-level category folder name.
        li_category: Sub-category folder name.
        folder_num: Batch number to start from.
        allList: Passed through unchanged below the threshold.

    Returns:
        Below the threshold: ``(cnt, folder_path, allList)``.
        At or above it: ``(0, new_folder_path, [])``.
    """
    base = 'new_project3_cloth/' + category + '/' + li_category + '/'
    folder_path = base + str(folder_num)
    if not os.path.isdir(folder_path):
        os.mkdir(folder_path)

    if cnt_num < 100:
        # NOTE(review): `cnt` is neither a parameter nor a local; it is read
        # from module scope, so this path needs a global `cnt` to exist.
        return cnt, folder_path, allList

    # Batch is full: skip every number that already has a folder.
    while os.path.isdir(base + str(folder_num)):
        folder_num += 1
    folder_path = base + str(folder_num)
    os.mkdir(folder_path)
    return 0, folder_path, []

크롤링을 실행할 코드

# Driver script: repeatedly scroll the page opened in `wd`, crawl what is
# visible, and stop once scrolling no longer increases the page height.
allDict = {}
allList = []
cnt = 0
cnt_num = 0
folder_num = 0
# Category and sub-category names are typed in by the user; they become
# folder names under new_project3_cloth/.
category = input()
li_category = input()

# Create the category / sub-category folders
li_path = makedir(category, li_category)

prev_height = wd.execute_script("return document. body.scrollHeight")
while True:
    # First, scroll down to the bottom of the page
    wd.execute_script("window.scrollTo(0,document.body.scrollHeight)")

    # Wait (presumably so newly loaded items can render — TODO confirm)
    time.sleep(2.1)

    # Make sure the numbered batch folder exists (a new one once cnt_num reaches 100)
    # NOTE(review): folder_num is never reassigned in this loop, so folderNum
    # always starts its search from the initial value.
    folders = folderNum(cnt_num, category, li_category, folder_num, allList)
    # NOTE(review): the first returned item is stored in `cnt`, not `cnt_num`;
    # on rollover folderNum returns 0 there, which resets `cnt`.
    cnt = folders[0]
    folder_path = folders[1]
    allList = folders[2]
    
    # Save colours (currently disabled)
    # color_list = colorReal()
    
    # Start crawling; images and JSON labels go to folder_path
    docrawl = doCrawling(cnt, cnt_num, allDict, folder_path,allList)
    
    cnt = docrawl[0]
    allDict = docrawl[1]
    cnt_num = docrawl[2]   
    allList = docrawl[3]
    
    
    # Record the current page height
    current_height = wd.execute_script("return document.body.scrollHeight")
    # print(current_height, prev_height)
    
    # Leave the loop when the height did not change after scrolling
    if current_height == prev_height:
        break
    # Remember this height so the next iteration can detect the end
    prev_height = current_height

저작자표시 (새창열림)

'복습용 기술공부' 카테고리의 다른 글

230410 _ spotify api 사용과 tictok에서 빌보드 올라간 곡 체크 (0)	2023.04.10
빌보드 크롤링 _ 레이블 수정 과정 (0)	2023.04.06
230324 seq2seq, tfrecord, super() 등. (0)	2023.03.24
230320 RNN, LSTM, 전이학습 (0)	2023.03.20
230317 공부 내용 (1)	2023.03.17

조용한 블로그

네이버 옷 크롤링 코드 연습

'복습용 기술공부' 카테고리의 다른 글

티스토리툴바

네이버 옷 크롤링 코드 연습

'복습용 기술공부' 카테고리의 다른 글

'복습용 기술공부' Related Articles

티스토리툴바