Automated Blog RSS Visits Using Selenium and a Fake User Agent

from selenium import webdriver
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import requests
import random
import time


def get_rss(url):
    article_list = []
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.content, features='xml')  # XML parsing requires the lxml package
        articles = soup.find_all('item')
        for a in articles:
            title = a.find('title').text
            link = a.find('link').text
            published = a.find('pubDate').text
            article = {
                'title': title,
                'link': link,
                'published': published
                }
            article_list.append(article)
        return article_list
    except Exception as e:
        print('The scraping job failed. See exception: ')
        print(e)
        return article_list  # return whatever was collected so the caller always gets a list


def set_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("window-size=1400,600")
    ua = UserAgent()
    user_agent = ua.random  # random browser user-agent string for this session
    print(user_agent)
    options.add_argument(f'user-agent={user_agent}')
    return webdriver.Chrome(options=options)  # chrome_options= was removed in Selenium 4

if __name__ == '__main__':
    print('Starting scraping')
    data = get_rss('https://rss.blog.naver.com/peterscience.xml')
    # print(data)
    print('Finished scraping')
    for item in data:
        wd = set_driver()
        wd.get(item['link'])
        time.sleep(random.uniform(1, 10) * random.randint(1, 5))  # random pause of roughly 1-50 seconds
        wd.quit()
  1. get_rss() fetches the XML feed at the given URL and returns its entries as a list of dictionaries (title, link, published).
  2. set_driver() returns a webdriver with a fake user-agent option applied (a headless variant is sketched after this list).
  3. Finally, the script iterates over the returned RSS feed links, creating and quitting a webdriver for each link and visiting that URL (a more defensive version of this loop is also sketched below).
  4. After each visit, a random timer pauses execution before moving on to the next link.
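If the script is run unattended (for example from a scheduler), the driver can also be started without a visible browser window. Below is a minimal sketch of such a variant of set_driver(); the function name set_headless_driver is just illustrative, and the --headless=new flag assumes a reasonably recent Chrome (older versions use plain --headless).

from selenium import webdriver
from fake_useragent import UserAgent


def set_headless_driver():
    # Same options as set_driver(), but without opening a visible browser window.
    options = webdriver.ChromeOptions()
    options.add_argument("window-size=1400,600")
    options.add_argument("--headless=new")  # assumption: recent Chrome; older versions need --headless
    user_agent = UserAgent().random  # random user-agent string per driver instance
    print(user_agent)
    options.add_argument(f'user-agent={user_agent}')
    return webdriver.Chrome(options=options)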
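The main loop above stops at the first link that fails to load and leaves that browser window open. A minimal sketch of a more defensive loop, assuming the same get_rss() and set_driver() functions and the same feed URL, where a failed visit is simply logged and skipped:

if __name__ == '__main__':
    articles = get_rss('https://rss.blog.naver.com/peterscience.xml')
    for item in articles:
        wd = set_driver()
        try:
            wd.get(item['link'])
            time.sleep(random.uniform(1, 10) * random.randint(1, 5))  # random pause, roughly 1-50 seconds
        except Exception as e:
            print(f"Failed to visit {item['link']}: {e}")
        finally:
            wd.quit()  # always close the browser, even if the visit failed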