Scraping with requests and BeautifulSoup
I won't explain in detail how every part of the program works here; this is just a quick note on its overall structure (for me, writing on Medium is less about teaching and more about pushing myself to keep notes xD).
When I first started learning to scrape, most of my effort went into understanding how web pages are structured, and with only a shaky grasp of HTML I kept calling find_all over and over to hunt down the elements I needed QQ
With the help of Chrome extensions such as SelectorGadget, it becomes easy to locate the elements you need with a CSS selector, so most of the code below uses `select()` instead, as in the short sketch that follows.
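As a minimal illustration of the two approaches, here is a self-contained sketch. The HTML snippet is made up for demonstration, but the `a.tit` class mirrors what the sample code further down actually targets on LTN's search results; `find_all` filters by tag name and attributes, while `select` takes a CSS selector.

```python
from bs4 import BeautifulSoup

# Hypothetical HTML snippet, only for illustration
html = """
<div class="list">
  <a class="tit" href="https://news.ltn.com.tw/news/1">Headline one</a>
  <a class="tit" href="https://news.ltn.com.tw/news/2">Headline two</a>
</div>
"""
soup = BeautifulSoup(html, "lxml")

# find_all: match by tag name plus an attribute dictionary
for a in soup.find_all("a", {"class": "tit"}):
    print(a.text, a["href"])

# select: the same elements located with a CSS selector
for a in soup.select("a.tit"):
    print(a.text, a["href"])
```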
The slightly annoying part is that Liberty Times (LTN) search results redirect to three different page templates, and each template marks up the article date differently, so the scraper has to try several selectors before it can grab the date.
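In the full script this fallback is written as nested try/except blocks; a trimmed-down sketch of the same idea is shown below. The three selectors are the ones used in the sample code, while the helper name `extract_date_text` and the `"NA"` fallback are assumptions made just for this sketch.

```python
from bs4 import BeautifulSoup

def extract_date_text(article_soup: BeautifulSoup) -> str:
    """Try the date selectors of the three LTN page templates in order."""
    selectors = ["div span.time", "div div.writer_date", "span div.mobile_none"]
    for css in selectors:
        found = article_soup.select(css)
        if found:
            # Keep only the date part, e.g. "2018-05-01 10:30" -> "2018-05-01"
            return found[0].text.strip().split(" ")[0]
    return "NA"  # none of the templates matched
```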
Running on a Raspberry Pi
Sample code
Note: please avoid sending a large number of requests in a short time and putting extra load on the news site's servers.
```python
import os
import requests
from bs4 import BeautifulSoup
import re
import time
import numpy as np
import pandas as pd
import random
from fake_useragent import UserAgent
import calendar
from datetime import datetime
from datetime import timedelta

# Build the first and last day of every month between 2005 and 2018
year_lst = range(2005, 2019)
month_lst = range(1, 13)
day_lst = range(1, 32)

valid_date_lst = set()
for year in year_lst:
    for month in month_lst:
        for day in day_lst:
            try:
                a_certain_day = datetime(year, month, day)
                valid_date_lst.add(datetime(a_certain_day.year, a_certain_day.month, 1))
                valid_date_lst.add(datetime(a_certain_day.year, a_certain_day.month,
                                            calendar.monthrange(a_certain_day.year, a_certain_day.month)[1]))
            except ValueError:
                pass  # skip impossible dates such as Feb 30

valid_date_lst = list(valid_date_lst)
valid_date_lst.sort()

search_date = []
for day in valid_date_lst:
    day = day.strftime('%Y-%m-%d')
    search_date.append(day)

# Pick one search window every 3 months (each month contributes 2 dates, so step = 6)
starting_dates, ending_dates = [], []
for i in range(0, 336, 6):
    starting_dates.append(search_date[i])
    ending_dates.append(search_date[i + 6 - 1])

for i in range(0, len(starting_dates)):
    print(starting_dates[i], ending_dates[i])

df = pd.DataFrame()

# Search keyword ("housing market")
search_key = "房屋市場"

# Make sure the folder for the per-article copies exists; create it if not
root = "text_file/" + search_key
if not os.path.isdir(root):
    os.mkdir(root)

ua = UserAgent()
headers = {'User-Agent': ua.random}

# Parameters
ID, Date, Title, Content, Link, Media, Year, Month, Day = [], [], [], [], [], [], [], [], []

# Running ID counter
count = 1

# Fallbacks so the copy can still be written if the very first article has no date
dir_year, dir_mon, dir_day = "NA", "NA", "NA"
saving_copy_dir = root

# Search period, format: yyyy-mm-dd
for i in range(0, len(starting_dates)):
    start_date = starting_dates[i]
    end_date = ending_dates[i]

    # Get the largest page number (i.e. how many pages of search results there are)
    res = requests.get("https://news.ltn.com.tw/search?keyword=" + search_key +
                       "&conditions=and&start_time=" + start_date +
                       "&end_time=" + end_date + "&page=1", headers=headers)
    soup = BeautifulSoup(res.text, "lxml")
    try:
        # Try to read the total number of result pages
        max_page_num = int(soup.select("div a.p_last")[0]['href'].split("&page=")[-1])
    except:
        # If that fails, there is only one page
        max_page_num = 1

    # Walk through every result page
    for k in range(1, max_page_num + 1):
        url = ("https://news.ltn.com.tw/search?keyword=" + search_key +
               "&conditions=and&start_time=" + start_date +
               "&end_time=" + end_date + "&page=" + str(k))
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.text, "lxml")

        # Collect the article links from the LTN search results
        for link in soup.find_all("a", {"class": "tit"}):
            print(link.text, end=" ")
            subPage = link['href']
            Link.append(subPage)

            # Fetch the article page itself
            subPage_res = requests.get(subPage, headers=headers)
            subPage_soup = BeautifulSoup(subPage_res.text, "lxml")

            # Title
            Title.append(subPage_soup.select("div h1")[0].text)
            ID.append(count)

            # Date
            try:
                # Date layout 1
                date = subPage_soup.select("div span.time")[0].text
                if "\n " in date:
                    date = date.split("\n ")[1]
                date = date.split(" ")[0]
                try:
                    # The date may use "/" as separator
                    date = datetime.strptime(date, "%Y/%m/%d")
                except:
                    try:
                        # ... or "-" as separator
                        date = datetime.strptime(date, "%Y-%m-%d")
                    except:
                        pass
                Date.append(date)
            except:
                try:
                    # Date layout 2
                    date = subPage_soup.select("div div.writer_date")[0].text
                    if "\n " in date:
                        date = date.split("\n ")[1]
                    date = date.split(" ")[0]
                    try:
                        date = datetime.strptime(date, "%Y/%m/%d")
                    except:
                        try:
                            date = datetime.strptime(date, "%Y-%m-%d")
                        except:
                            pass
                    Date.append(date)
                except:
                    try:
                        # Date layout 3
                        date = subPage_soup.select("span div.mobile_none")[0].text
                        if "\n " in date:
                            date = date.split("\n ")[1]
                        date = date.split(" ")[0]
                        try:
                            date = datetime.strptime(date, "%Y/%m/%d")
                        except:
                            try:
                                date = datetime.strptime(date, "%Y-%m-%d")
                            except:
                                pass
                        Date.append(date)
                    except:
                        p = "NA"
                        Date.append(p)  # empty date marker

            # Body text
            content_lst = []  # one list per article; the list holds the article's paragraphs
            try:
                # Body layout 1
                text = subPage_soup.select("div div.text")[0]
                for p in text.select("p"):
                    if len(p.select("a")) > 0:
                        pass
                    elif "請繼續往下閱讀..." in p.text:  # skip the "please keep reading" block
                        pass
                    else:
                        content_lst.append(p.text)  # push each paragraph into the list
            except:
                try:
                    # Body layout 2
                    for p in subPage_soup.select("div p"):
                        if len(p.select("a")) > 0:
                            pass
                        elif "請繼續往下閱讀..." in p.text:
                            pass
                        else:
                            content_lst.append(p.text)
                except:
                    p = "NA"
                    content_lst.append(p)  # empty paragraph
            Content.append(content_lst)  # the whole paragraph list becomes one cell of the row

            # Media
            Media.append("LTN")

            # Use the article date to create sub-folders for the per-article CSV copies
            if Date[-1] != "NA":
                dir_year = str(date.year)
                dir_mon = str(date.month)
                dir_day = str(date.day)
                saving_copy_dir = "text_file/" + search_key + "/" + dir_year
                if not os.path.isdir(saving_copy_dir):
                    os.mkdir(saving_copy_dir)
                saving_copy_dir = "text_file/" + search_key + "/" + dir_year + "/" + dir_mon
                if not os.path.isdir(saving_copy_dir):
                    os.mkdir(saving_copy_dir)
                saving_copy_dir = "text_file/" + search_key + "/" + dir_year + "/" + dir_mon + "/" + dir_day
                if not os.path.isdir(saving_copy_dir):
                    os.mkdir(saving_copy_dir)

            # Save a copy of the article in the year/month/day folder,
            # named with the article date plus a running number
            saving_copy_df = pd.DataFrame()
            saving_copy_df["ID"] = [ID[-1]]
            saving_copy_df["Date"] = [str(Date[-1])]
            saving_copy_df["Title"] = [Title[-1]]
            saving_copy_df["Content"] = [Content[-1]]
            saving_copy_df["Link"] = Link[-1]
            saving_copy_df["Media"] = Media[-1]
            try:
                saving_copy_df["Year"] = date.year
                saving_copy_df["Month"] = date.month
                saving_copy_df["Day"] = date.day
            except:
                saving_copy_df["Year"] = "NA"
                saving_copy_df["Month"] = "NA"
                saving_copy_df["Day"] = "NA"
            saving_copy_df.to_csv(saving_copy_dir + "/" + dir_year + "-" + dir_mon + "-" + dir_day +
                                  "_" + str(" ".join(Title[-1].split("/"))) + ".csv")
            count += 1

            # Combined dataframe
            df = pd.concat([df, saving_copy_df])

            # Sleep for a while to avoid hammering the server
            time.sleep(random.randint(4, 8))

df2 = df.drop_duplicates(subset=["Title"])
df2 = df2.sort_values(by=['Date'])
df2.to_csv("LTN_" + search_key + ".csv")
```