본문 바로가기

Python/주가 크롤링 프로그램

국내주식 크롤링 코드 정리

import sys, os

import requests
import bs4
import pandas as pd

import xlrd
import openpyxl 

import time
from datetime import datetime, date, timedelta

import random

 


# Workbook that stores the crawled prices (one sheet per stock code).
EXCEL_PATH = "C:/Users/g0917/Desktop/Test5.xlsx"
# Format of the date strings found in the first data column.
DATE_FORMAT = "%Y.%m.%d"
# Dates written to a freshly created workbook so the first run always crawls.
PLACEHOLDER_DATES = ["2000.01.02", "2000.01.01"]
# Output columns: date, close, change vs. previous day, open, high, low, volume.
COLUMNS = ["날짜", "종가", "전일비", "시가", "고가", "저가", "거래량"]
# CSS selector for the data rows of the daily-price table.
ROW_SELECTOR = "table.type2 >  tr[onmouseover='mouseOver(this)']"

# Stock codes to crawl.
code_list = ['005930']


def parse_row(cells):
    """Convert the cell texts of one table row into a typed record.

    Args:
        cells: Sequence of raw cell strings, ordered like ``COLUMNS``.

    Returns:
        ``[date_string, int, int, int, int, int, int]``, or ``None`` when the
        row has fewer cells than ``COLUMNS`` or its date cell is blank.

    Raises:
        ValueError: If a numeric cell is not a plain integer once the
            thousands separators are removed.
    """
    cleaned = [text.strip().replace(",", "") for text in cells]
    if len(cleaned) < len(COLUMNS) or not cleaned[0]:
        return None
    return [cleaned[0], *(int(value) for value in cleaned[1:len(COLUMNS)])]


def load_last_date(path: str = EXCEL_PATH) -> datetime:
    """Return the date stored in the first data cell of the workbook.

    If the workbook does not exist yet it is created with placeholder dates.
    A workbook whose first sheet has no rows falls back to the first
    placeholder date instead of failing on the cell lookup.
    """
    if os.path.isfile(path):
        df = pd.read_excel(path, index_col=0)
    else:
        df = pd.DataFrame({"날짜": PLACEHOLDER_DATES})
        with pd.ExcelWriter(path, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name="sheet1")

    if df.empty:
        return datetime.strptime(PLACEHOLDER_DATES[0], DATE_FORMAT)
    return datetime.strptime(df.iloc[0, 0], DATE_FORMAT)


def fetch_daily_prices(code_number: str, last_page: int = 20) -> "pd.DataFrame":
    """Download pages 1..last_page of daily prices for one stock code.

    Args:
        code_number: Stock code inserted into the request URL.
        last_page: Number of the last result page to request (inclusive).

    Returns:
        A DataFrame with the columns in ``COLUMNS``, rows in page order.

    Raises:
        requests.HTTPError: If a page request returns an error status.
    """
    my_headers = {
        "referer": "https://finance.naver.com/item/sise_day.nhn?code={}&page=1".format(code_number),
    }

    rows = []
    for page_number in range(1, last_page + 1):
        url = "https://finance.naver.com/item/sise_day.nhn?code={0}&page={1}".format(code_number, page_number)
        res = requests.get(url=url, headers=my_headers, timeout=10)
        res.raise_for_status()

        # Name the parser explicitly so the parse tree (and therefore the
        # child selector) does not depend on which optional parsers are
        # installed.
        soup = bs4.BeautifulSoup(res.text, "html.parser")
        for tr in soup.select(ROW_SELECTOR):
            record = parse_row([td.text for td in tr.select("td")])
            if record is not None:
                rows.append(record)

        # Short random pause between page requests.
        time.sleep(random.random() / 1000)

    # Build the frame once, after every page has been collected.
    return pd.DataFrame(rows, columns=COLUMNS)


def main() -> None:
    """Crawl every code in ``code_list`` and write one sheet per code."""
    last_date_time = load_last_date()
    if datetime.now() <= last_date_time:
        # Nothing was crawled, so leave the existing workbook untouched.
        return

    frames = {code_number: fetch_daily_prices(code_number) for code_number in code_list}

    with pd.ExcelWriter(EXCEL_PATH) as writer:
        for code_number_save, frame in frames.items():
            frame.to_excel(writer, sheet_name=f'{code_number_save}')


if __name__ == "__main__":
    main()