Web Scraping Example – Scraping Liberty Times Net (LTN) News

Scraping with requests and BeautifulSoup

I won't explain in detail how each part of the program works here; this is just a quick note on its structure (for me, writing on Medium is less a tutorial and more a way of pushing myself to take notes xD)

When I first started learning to scrape, what took the most effort was understanding how web pages are structured, and with my shaky HTML fundamentals I kept falling back on find_all to hunt for the elements I needed QQ

With the help of extensions such as Chrome's SelectorGadget, you can easily pinpoint the elements you need with CSS selectors, so most of the code below is written with `select()`.
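As a quick illustration of the difference, a minimal sketch (the HTML snippet here is made up for the example):

from bs4 import BeautifulSoup

html = '<div><a class="tit" href="/news/1">A headline</a></div>'
soup = BeautifulSoup(html, "lxml")

# find_all: filter by tag name plus an attribute dict
links_a = soup.find_all("a", {"class": "tit"})
# select: the same query written as a CSS selector (what SelectorGadget hands you)
links_b = soup.select("a.tit")

print(links_a[0]["href"], links_b[0]["href"])  # /news/1 /news/1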

The slightly fiddly part is that LTN's search results redirect to three different page layouts, and each layout puts the date in a different element, so the scraper has to try several selectors to pull the date out.
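The same idea can be packaged as a small helper that walks a list of candidate selectors and returns the first hit. The full script below keeps this logic inline in nested try/except blocks, but a sketch of the idea looks like this (try_selectors is my own name; subPage_soup comes from the main script):

def try_selectors(soup, selectors):
    """Return the text of the first CSS selector that matches, or None."""
    for sel in selectors:
        hits = soup.select(sel)
        if hits:
            return hits[0].text
    return None

# the three places LTN's layouts put the article date
date_text = try_selectors(subPage_soup,
                          ["div span.time", "div div.writer_date", "span div.mobile_none"])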

Running the scraper on a Raspberry Pi

Insisting on running Jupyter Lab on the Pi
What the saved dataframe looks like
The file structure as seen on the Pi

Sample Code

Note: please avoid sending a large number of requests in a short time and adding load to the news site's servers.
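One way to honor that is to wrap every request in a helper that pauses and retries. A minimal sketch of the idea (get_politely is a hypothetical name; the script below simply inlines a random sleep instead):

import random
import time
import requests

def get_politely(url, headers, retries=3):
    """GET a page, pausing 4-8 seconds before each attempt and retrying on failure."""
    for attempt in range(retries):
        time.sleep(random.uniform(4, 8))  # spread requests out
        try:
            res = requests.get(url, headers=headers, timeout=10)
            res.raise_for_status()  # treat HTTP errors as failures too
            return res
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # out of retries, surface the error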

import os
import calendar
import random
import time
from datetime import datetime

import requests
import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# candidate years, months, and days (2005 through 2018)
year_lst = range(2005, 2019)
month_lst = range(1, 13)
day_lst = range(1, 32)

valid_date_lst = set()
for year in year_lst:
    for month in month_lst:
        for day in day_lst:
            try:
                # keep only each month's first and last day as window boundaries
                a_certain_day = datetime(year, month, day)
                valid_date_lst.add(datetime(a_certain_day.year, a_certain_day.month, 1))
                valid_date_lst.add(datetime(a_certain_day.year, a_certain_day.month, calendar.monthrange(a_certain_day.year, a_certain_day.month)[1]))
            except ValueError:  # skip impossible dates such as Feb 30
                pass
valid_date_lst = list(valid_date_lst)
valid_date_lst.sort()
search_date = []
for day in valid_date_lst:
    search_date.append(day.strftime('%Y-%m-%d'))
# one search window per 3 months (each month contributed 2 dates: its first and last day)
starting_dates, ending_dates = [], []
for i in range(0, len(search_date), 6):
    starting_dates.append(search_date[i])     # first day of the window
    ending_dates.append(search_date[i+6-1])   # last day of the window
for i in range(0, len(starting_dates)):
    print(starting_dates[i], ending_dates[i])


df = pd.DataFrame()

# the keyword to search for ("房屋市場" means housing market)
search_key = "房屋市場"

# make sure the folder for the per-article copies exists; create it (and its parent) if missing
root = "text_file/" + search_key
if not os.path.isdir(root):
    os.makedirs(root)

# randomize the User-Agent for each run
ua = UserAgent()
headers = {'User-Agent': ua.random}


# Parameters: containers for the scraped fields
ID, Date, Title, Content, Link, Media, Year, Month, Day = [], [], [], [], [], [], [], [], []

# running article ID
count = 1

# loop over the search windows (dates formatted yyyy-mm-dd)

for i in range(0, len(starting_dates)):
    start_date = starting_dates[i]
    end_date = ending_dates[i]


    # grab the largest page number (i.e., how many pages of search results there are)
    res = requests.get("https://news.ltn.com.tw/search?keyword="+search_key+"&conditions=and&start_time="+start_date+"&end_time="+end_date+"&page=1",
                       headers = headers)
    soup = BeautifulSoup(res.text, "lxml")
    try:  # try to read the total number of result pages
        max_page_num = int(soup.select("div a.p_last")[0]['href'].split("&page=")[-1])
    except (IndexError, KeyError, ValueError):  # no "last page" link means there is only one page
        max_page_num = 1

    # iterate over the result pages
    for k in range(1, max_page_num+1):
        url = "https://news.ltn.com.tw/search?keyword="+search_key+"&conditions=and&start_time="+start_date+"&end_time="+end_date+"&page="+str(k)

        # request one page of search results
        res = requests.get(url, headers = headers)
        soup = BeautifulSoup(res.text, "lxml")

        # collect the article links from LTN's search results
        for link in soup.find_all("a", {"class":"tit"}):
            print(link.text, end = " ")
            subPage = link['href']
            Link.append(subPage)  # the link to this article
            subPage_res = requests.get(subPage, headers = headers)
            subPage_soup = BeautifulSoup(subPage_res.text, "lxml")

            # title
            Title.append(subPage_soup.select("div h1")[0].text)
            ID.append(count)

            # date: the three layouts place it in different elements, so try each in turn
            try:  # layout, possibility one
                date = subPage_soup.select("div span.time")[0].text
                if "\n    " in date:
                    date = date.split("\n    ")[1]
                date = date.split(" ")[0]
                try:  # the format may be "/"-separated
                    date = datetime.strptime(date, "%Y/%m/%d")
                except ValueError:
                    try:  # or "-"-separated
                        date = datetime.strptime(date, "%Y-%m-%d")
                    except ValueError:
                        pass
                Date.append(date)
            except IndexError:
                try:  # layout, possibility two
                    date = subPage_soup.select("div div.writer_date")[0].text
                    if "\n    " in date:
                        date = date.split("\n    ")[1]
                    date = date.split(" ")[0]
                    try:  # "/"-separated
                        date = datetime.strptime(date, "%Y/%m/%d")
                    except ValueError:
                        try:  # or "-"-separated
                            date = datetime.strptime(date, "%Y-%m-%d")
                        except ValueError:
                            pass
                    Date.append(date)
                except IndexError:
                    try:  # layout, possibility three
                        date = subPage_soup.select("span div.mobile_none")[0].text
                        if "\n    " in date:
                            date = date.split("\n    ")[1]
                        date = date.split(" ")[0]
                        try:  # "/"-separated
                            date = datetime.strptime(date, "%Y/%m/%d")
                        except ValueError:
                            try:  # or "-"-separated
                                date = datetime.strptime(date, "%Y-%m-%d")
                            except ValueError:
                                pass
                        Date.append(date)
                    except IndexError:
                        Date.append("NA")  # placeholder for a missing date

            # body text: each article gets one list whose elements are its paragraphs
            content_lst = []
            try:  # body layout, possibility one
                text = subPage_soup.select("div div.text")[0]
                for p in text.select("p"):
                    if len(p.select("a")) > 0:
                        pass  # skip paragraphs that are just links
                    elif "請繼續往下閱讀..." in p.text:  # skip the "please continue reading" banner
                        pass
                    else:
                        content_lst.append(p.text)  # push each paragraph into the list
            except IndexError:
                try:  # body layout, possibility two
                    for p in subPage_soup.select("div p"):
                        if len(p.select("a")) > 0:
                            pass
                        elif "請繼續往下閱讀..." in p.text:
                            pass
                        else:
                            content_lst.append(p.text)
                except Exception:
                    content_lst.append("NA")  # placeholder for a missing body
            Content.append(content_lst)  # one list of paragraphs per article row

            # media outlet
            Media.append("LTN")

            # build year/month/day subfolders from the article date to hold the csv copies
            if isinstance(Date[-1], datetime):
                dir_year = str(date.year)
                dir_mon = str(date.month)
                dir_day = str(date.day)
            else:
                dir_year, dir_mon, dir_day = "NA", "NA", "NA"

            saving_copy_dir = "text_file/"+search_key+"/"+dir_year+"/"+dir_mon+"/"+dir_day
            os.makedirs(saving_copy_dir, exist_ok=True)

            # save a per-article copy in the year/month/day folder, named with the
            # article's date plus its title (slashes swapped out so the name is a valid path)
            saving_copy_df = pd.DataFrame()
            saving_copy_df["ID"] = [ID[-1]]
            saving_copy_df["Date"] = [str(Date[-1])]
            saving_copy_df["Title"] = [Title[-1]]
            saving_copy_df["Content"] = [Content[-1]]
            saving_copy_df["Link"] = Link[-1]
            saving_copy_df["Media"] = Media[-1]
            try:
                saving_copy_df["Year"] = date.year
                saving_copy_df["Month"] = date.month
                saving_copy_df["Day"] = date.day
            except AttributeError:  # the date never parsed, so it has no year/month/day
                saving_copy_df["Year"] = "NA"
                saving_copy_df["Month"] = "NA"
                saving_copy_df["Day"] = "NA"
            saving_copy_df.to_csv(saving_copy_dir+"/"+dir_year+"-"+dir_mon+"-"+dir_day+"_"+" ".join(Title[-1].split("/"))+".csv")
            count += 1
            # append to the combined dataframe (DataFrame.append was removed in pandas 2.0)
            df = pd.concat([df, saving_copy_df], ignore_index=True)

            # sleep for a while to avoid hammering the server
            time.sleep(random.randint(4, 8))
            
# drop duplicate articles, sort by date, and save the combined csv
df2 = df.drop_duplicates(subset = ["Title"])
df2 = df2.sort_values(by=['Date'])
df2.to_csv("LTN_"+search_key+".csv")
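
One gotcha when reloading the combined file: the Content column holds Python lists, and to_csv serializes them as their string repr, so they need to be parsed back on read. A small usage sketch, assuming the file written above:

import ast
import pandas as pd

df3 = pd.read_csv("LTN_房屋市場.csv", index_col=0)
# lists round-tripped through csv come back as strings like "['p1', 'p2']"
df3["Content"] = df3["Content"].apply(ast.literal_eval)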
