
SSL Error


Error details

ERROR:ssl_client_socket_impl.cc(878)] handshake failed; returned -1, SSL error code 1, net_error -101

When crawling the CNN site, this error sometimes appears and the articles fail to be crawled.

 

The failure occurs at the SSL/TLS handshake stage → something goes wrong during the authentication process between the client (ChromeDriver) and the server (CNN).

 

Suspected cause: excessive requests. When too many requests are sent at once, the server may temporarily block the client; CNN appears to force the SSL handshake to fail in order to block abnormal crawling.
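If request volume really is the cause, the most direct mitigation is simply to slow the crawler down between page loads. Below is a minimal sketch of that idea; the delay value and the random jitter are assumptions of mine, not part of the project code.

import random
import time

REQUEST_DELAY_SECONDS = 2  # assumed base delay between page loads

def polite_get(driver, url):
    # Pause before every page load so requests are not fired back to back;
    # the random jitter avoids hitting the server on a fixed rhythm.
    time.sleep(REQUEST_DELAY_SECONDS + random.uniform(0, 1))
    driver.get(url)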

 

Original code

# Imports shared by the snippets below
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# CNN crawling
def scrape_cnn_news_with_selenium(category_url):
    options = webdriver.ChromeOptions()
    options.headless = True
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get(category_url)

        # Wait up to 10 seconds until the headline elements appear
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "container__headline-text"))
        )

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        articles = []

        # Only the first five headlines on the category page are processed
        for article in soup.find_all('span', class_='container__headline-text')[:5]:
            title = article.get_text().strip() 
            link_element = article.find_parent("a") 
            if link_element and "href" in link_element.attrs:
                link = link_element['href']
                if not link.startswith("http"): 
                    link = f"https://edition.cnn.com{link}"

                content = extract_article_content(driver, link)

                articles.append({"title": title, "url": link, "content": content})

        return articles

    except Exception as e:
        print(f"Error occurred while scraping {category_url}: {e}")
        return []

    finally:
        driver.quit()  

# Extract the full text of a CNN article
def extract_article_content(driver, article_url):
    try:
        driver.get(article_url)
        time.sleep(3)  # give the article page a moment to render before parsing
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        paragraphs = soup.find_all('p', class_='paragraph inline-placeholder vossi-paragraph')
        content = " ".join([p.get_text().strip() for p in paragraphs])
        return content[:200] + "..." if len(content) > 200 else content
    except Exception as e:
        print(f"Error while extracting content from {article_url}: {e}")
        return "No content available."

 

Modified code

def scrape_cnn_news_with_selenium(category_url, max_retries=3, retry_delay=5):
    options = webdriver.ChromeOptions()
    options.headless = True
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        retries = 0
        while retries < max_retries:
            try:
                driver.get(category_url)

                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, "container__headline-text"))
                )

                soup = BeautifulSoup(driver.page_source, 'html.parser')
                articles = []

                for article in soup.find_all('span', class_='container__headline-text')[:5]:
                    title = article.get_text().strip()
                    link_element = article.find_parent("a")
                    if link_element and "href" in link_element.attrs:
                        link = link_element['href']
                        if not link.startswith("http"):
                            link = f"https://edition.cnn.com{link}"

                        # Extract the full article text
                        content = extract_article_content(driver, link)

                        articles.append({"title": title, "url": link, "content": content})

                return articles

            except Exception as e:
                retries += 1
                print(f"Attempt {retries}/{max_retries} failed for {category_url}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)

        print(f"Max retries reached for {category_url}. Skipping...")
        return []

    except Exception as e:
        print(f"Error occurred while scraping {category_url}: {e}")
        return []

    finally:
        driver.quit()


# Extract the full text of a CNN article
def extract_article_content(driver, article_url, max_retries=3, retry_delay=5):
    retries = 0
    while retries < max_retries:
        try:
            driver.get(article_url)
            time.sleep(3)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            paragraphs = soup.find_all('p', class_='paragraph inline-placeholder vossi-paragraph')
            content = " ".join([p.get_text().strip() for p in paragraphs])
            return content[:200] + "..." if len(content) > 200 else content

        except Exception as e:
            retries += 1
            print(f"Attempt {retries}/{max_retries} failed for {article_url}. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    print(f"Max retries reached for {article_url}. Returning 'No content available.'")
    return "No content available."

 

Similar to handling a "too many requests" error, a try-except block is used so the crawl is retried within a maximum number of attempts.
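
The same retry pattern could also be pulled out into a small reusable helper so it does not have to be duplicated in every scraping function. The sketch below is only an illustration of that idea; the with_retries name and its defaults are my own, not from the project code.

import time
from functools import wraps

def with_retries(max_retries=3, retry_delay=5):
    # Retry the wrapped function up to max_retries times,
    # sleeping retry_delay seconds between failed attempts.
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    print(f"Attempt {attempt}/{max_retries} failed: {e}")
                    if attempt < max_retries:
                        time.sleep(retry_delay)
            return None  # the caller decides how to handle a final failure
        return wrapper
    return decorator

With such a helper, extract_article_content could, for example, be decorated with @with_retries(max_retries=3, retry_delay=5) instead of carrying its own while loop.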

 

Result

The articles with pk==129 and pk==130, which previously could not be fetched because of this error, were crawled correctly after retrying and saved to the DB.