Project
SSL Error
content0474
2025. 1. 16. 16:30
Error Details
ERROR:ssl_client_socket_impl.cc(878)] handshake failed; returned -1, SSL error code 1, net_error -101
While crawling the CNN site, this error sometimes appears and the affected articles fail to be crawled.
The failure happens at the SSL/TLS handshake stage → something goes wrong during the authentication step between the client (the Chrome driver) and the server (CNN).
Suspected cause: excessive requests. When too many requests are sent at once, the server may temporarily block the client; CNN appears to force the SSL handshake to fail in order to block abnormal crawling.
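Since the suspected cause is request volume rather than the parsing logic itself, one complementary mitigation is to space requests out so the server is less likely to block the client in the first place. The sketch below is illustrative only and is not the fix applied in this post; scrape_all_categories and the delay values are hypothetical, and it assumes the scrape_cnn_news_with_selenium function shown below.

import random
import time

def scrape_all_categories(category_urls, min_delay=2, max_delay=5):
    # Hypothetical throttling wrapper: pause a few seconds between category
    # pages so the crawler looks less like an abusive client to the server.
    results = []
    for url in category_urls:
        results.extend(scrape_cnn_news_with_selenium(url))
        # Illustrative 2-5 second random pause between requests.
        time.sleep(random.uniform(min_delay, max_delay))
    return results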
Original Code

# CNN crawling
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

def scrape_cnn_news_with_selenium(category_url):
    options = webdriver.ChromeOptions()
    options.headless = True  # run Chrome in headless mode
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(category_url)
        # wait until the headline elements are present
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "container__headline-text"))
        )
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        articles = []
        # take the first five headlines on the category page
        for article in soup.find_all('span', class_='container__headline-text')[:5]:
            title = article.get_text().strip()
            link_element = article.find_parent("a")
            if link_element and "href" in link_element.attrs:
                link = link_element['href']
                if not link.startswith("http"):
                    link = f"https://edition.cnn.com{link}"
                content = extract_article_content(driver, link)
                articles.append({"title": title, "url": link, "content": content})
        return articles
    except Exception as e:
        print(f"Error occurred while scraping {category_url}: {e}")
        return []
    finally:
        driver.quit()

# Extract the full CNN article text
def extract_article_content(driver, article_url):
    try:
        driver.get(article_url)
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        paragraphs = soup.find_all('p', class_='paragraph inline-placeholder vossi-paragraph')
        content = " ".join([p.get_text().strip() for p in paragraphs])
        # keep only a 200-character preview
        return content[:200] + "..." if len(content) > 200 else content
    except Exception as e:
        print(f"Error while extracting content from {article_url}: {e}")
        return "No content available."
Modified Code

def scrape_cnn_news_with_selenium(category_url, max_retries=3, retry_delay=5):
    options = webdriver.ChromeOptions()
    options.headless = True
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        retries = 0
        while retries < max_retries:
            try:
                driver.get(category_url)
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, "container__headline-text"))
                )
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                articles = []
                for article in soup.find_all('span', class_='container__headline-text')[:5]:
                    title = article.get_text().strip()
                    link_element = article.find_parent("a")
                    if link_element and "href" in link_element.attrs:
                        link = link_element['href']
                        if not link.startswith("http"):
                            link = f"https://edition.cnn.com{link}"
                        # extract the full article text
                        content = extract_article_content(driver, link)
                        articles.append({"title": title, "url": link, "content": content})
                return articles
            except Exception as e:
                # retry after a short delay when the page load or handshake fails
                retries += 1
                print(f"Attempt {retries}/{max_retries} failed for {category_url}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
        print(f"Max retries reached for {category_url}. Skipping...")
        return []
    except Exception as e:
        print(f"Error occurred while scraping {category_url}: {e}")
        return []
    finally:
        driver.quit()

# Extract the full CNN article text
def extract_article_content(driver, article_url, max_retries=3, retry_delay=5):
    retries = 0
    while retries < max_retries:
        try:
            driver.get(article_url)
            time.sleep(3)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            paragraphs = soup.find_all('p', class_='paragraph inline-placeholder vossi-paragraph')
            content = " ".join([p.get_text().strip() for p in paragraphs])
            return content[:200] + "..." if len(content) > 200 else content
        except Exception as e:
            retries += 1
            print(f"Attempt {retries}/{max_retries} failed for {article_url}. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
    print(f"Max retries reached for {article_url}. Returning 'No content available.'")
    return "No content available."
Similar to how "too many requests" errors are handled, a try-except block now retries the crawl up to a maximum number of attempts before giving up.
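For reference, a minimal usage sketch of the retry version; the category URL and the printing loop here are illustrative and not part of the original project code.

if __name__ == "__main__":
    # Illustrative CNN category URL (not necessarily the one used in the project)
    category_url = "https://edition.cnn.com/world"
    articles = scrape_cnn_news_with_selenium(category_url, max_retries=3, retry_delay=5)
    for item in articles:
        print(item["title"], item["url"])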
Result
The articles with pk==129 and pk==130, which previously failed to come through because of this error, were crawled correctly after the retries and saved to the DB.