1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
| """ Project University Ranking
In order to research exchange information, students always need to search for university rankings from multiple sources. I propose to make a web crawler to crawl the top 200 universities from each source.
Website: QS ranking: https://www.topuniversities.com/
Data manipulation: 1. Integrate all the data into one file. 2. Data cleansing: organize all information from the different websites into a unified format: University Name, Ranking, Location (city & country), Source.
The crawled data for the website will be cleaned and submitted in csv format.
Debugging environment: Google Chrome: version 96.0.4664.45 (official build) (64-bit); Python interpreter: anaconda3; chromedriver_win32: 96.0.4664.45/2021-11-16T09:35:54.118Z, located in the anaconda3 root directory during testing.
"""
from selenium import webdriver from lxml import etree from bs4 import BeautifulSoup import re import time import csv import fake_useragent
def _extract_row(content, index):
    """Return the Rank / University / Overall Score texts of one ranking row.

    Args:
        content: lxml element of the ``ranking-data-load`` container.
        index: 1-based position of the ``row ind`` div to read.

    Raises:
        IndexError: if any of the three fields is missing for that row.
    """
    prefix = './/div[@class="row ind"][' + str(index) + ']/div/div/div'
    return {
        'Rank': content.xpath(prefix + '/div[1]/div[1]/text()')[0],
        'University': content.xpath(prefix + '/div[2]/div/div[1]/a/text()')[0],
        'Overall Score': content.xpath(prefix + '/div[3]/span[1]/text()')[0],
    }


def get(driver, url, pages=3):
    """Scrape the ranking table at ``url`` and append its rows to ``./QS排名.csv``.

    The page is loaded, the cookie-consent button is clicked if it can be
    found, and then for each result page the rendered HTML is parsed with
    lxml and rows 1..100 are written to the CSV file.

    Args:
        driver: Selenium WebDriver to drive. It is closed before this
            function returns, including when scraping fails.
        url: Address of the ranking page to open.
        pages: Number of result pages to walk through (defaults to 3, the
            value that used to be hard-coded).

    Raises:
        IndexError: if the ranking container or a row field is absent from
            the page source.
        Exception: Selenium errors from the dropdown / pagination clicks are
            propagated unchanged.
    """
    # Local import: ``By`` locators work on Selenium 3 and 4, whereas the
    # ``find_element_by_*`` helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    file_name = './QS排名.csv'
    fieldnames = ["Rank", "University", "Overall Score"]
    # NOTE(review): presumably the results-per-page dropdown and one of its
    # entries (100 rows would match the inner loop) - confirm on the live page.
    dropdown_xpath = ('//*[@id="block-tu-d8-content"]/div/article/div/div[3]'
                      '/div/div/div/div[3]/div[4]/div[1]/div[2]')
    option_xpath = dropdown_xpath + '/div[2]/div[4]'

    count = 0
    try:
        with open(file_name, "a", newline='', encoding='utf-8') as f:
            print(file_name, " 文件已创建...")
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            # The file is opened in append mode, so only emit the header when
            # the file is still empty; otherwise every rerun would insert a
            # second header row in the middle of the data.
            if f.tell() == 0:
                writer.writeheader()

            driver.get(url)
            driver.implicitly_wait(10)
            time.sleep(2)
            driver.execute_script("window.scrollTo(0,1000)")
            time.sleep(1)

            # Best effort: the consent popup is not always shown. Catch
            # ``Exception`` rather than everything so Ctrl-C still works.
            try:
                driver.find_element(By.XPATH, '//*[@id="popup-buttons"]/button').click()
                print("已点击我同意获取cookie")
            except Exception:
                print("没有点击...........................................")

            for _ in range(pages):
                driver.implicitly_wait(30)
                driver.execute_script("window.scrollTo(1000,2000)")
                time.sleep(2)

                driver.find_element(By.XPATH, dropdown_xpath).click()
                time.sleep(1)
                driver.find_element(By.XPATH, option_xpath).click()

                driver.implicitly_wait(20)
                time.sleep(1)
                driver.execute_script("window.scrollTo(0,document.body.scrollHeight-2000)")
                time.sleep(2)

                # Parse a snapshot of the rendered page instead of querying
                # the live DOM row by row.
                html = etree.HTML(driver.page_source)
                content = html.xpath('//*[@id="ranking-data-load"]')[0]

                for j in range(1, 101):
                    row = _extract_row(content, j)
                    for name in fieldnames:
                        print(name + ":", row[name])
                    print('**********' * 10)
                    count += 1
                    writer.writerow(row)

                # NOTE(review): presumably the "next page" control - confirm.
                driver.find_element(
                    By.CSS_SELECTOR, '#alt-style-pagination > li:nth-child(8) > a').click()
                time.sleep(1)

        print('已获取{}条数据!!!'.format(count))
    finally:
        # Always release the browser window, even when scraping failed.
        print("程序关闭!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        driver.close()
if __name__ == '__main__':
    # Script entry point: configure a Chrome session with a random
    # user-agent and a packed extension, then run the scraper.
    ua = fake_useragent.UserAgent()
    url = "https://www.topuniversities.com/university-rankings/world-university-rankings/2022"
    headers = {"user-agent": ua.random}

    options = webdriver.ChromeOptions()
    # Experimental flags that switch off Chrome's automation switch/extension.
    for option_name, option_value in (
        ("excludeSwitches", ['enable-automation']),
        ('useAutomationExtension', False),
    ):
        options.add_experimental_option(option_name, option_value)

    # Path is relative to the working directory the script is started from.
    extension_path2 = './2.0.2_0.crx'
    options.add_extension(extension_path2)
    options.add_argument('user-agent={}'.format(ua.random))

    driver = webdriver.Chrome(options=options)
    driver.maximize_window()
    get(driver, url)
|