Gaokao Admission Cutoff Crawler

# -*- coding: utf-8 -*-
'''
Author: dy
Created: 2021/6/15 17:15
'''
import asyncio
import sys
import time
from pathlib import Path

import aiohttp
import pandas as pd
from tqdm import tqdm

current_path = Path.cwd()


def get_url_list(max_id):
    url = 'https://static-data.eol.cn/www/2.0/school/%d/info.json'
    not_crawled = set(range(max_id))
    # Resume support: skip school ids already present in the CSV.
    if Path(current_path, 'college_info.csv').exists():
        df = pd.read_csv(Path(current_path, 'college_info.csv'))
        not_crawled -= set(df['学校id'].unique())
    return [url % school_id for school_id in not_crawled]


async def get_json_data(url, semaphore):
    async with semaphore:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
        }
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False),
                                         trust_env=True) as session:
            try:
                async with session.get(url=url, headers=headers,
                                       timeout=aiohttp.ClientTimeout(total=6)) as response:
                    # The await suspends this coroutine during I/O, letting the
                    # event loop run other tasks until the response arrives.
                    json_data = await response.json(encoding='utf-8')
                    if json_data:
                        return save_to_csv(json_data['data'])
            except Exception:
                # Non-existent ids, timeouts, and malformed responses are skipped.
                return None


def save_to_csv(json_info):
    save_info = {}
    save_info['学校id'] = json_info['school_id']    # school id
    save_info['学校名称'] = json_info['name']       # school name
    # School tier: the 985/211 flags take precedence over the generic level name.
    if json_info['f985'] == '1' and json_info['f211'] == '1':
        level = '985 211'
    elif json_info['f211'] == '1':
        level = '211'
    else:
        level = json_info['level_name']
    save_info['学校层次'] = level                                # school tier
    save_info['软科排名'] = json_info['rank']['ruanke_rank']    # Shanghai Ranking
    save_info['校友会排名'] = json_info['rank']['xyh_rank']     # Alumni Association ranking
    save_info['武书连排名'] = json_info['rank']['wsl_rank']     # Wu Shulian ranking
    save_info['QS世界排名'] = json_info['rank']['qs_world']     # QS world ranking
    save_info['US世界排名'] = json_info['rank']['us_rank']      # U.S. News world ranking
    save_info['学校类型'] = json_info['type_name']       # school type
    save_info['省份'] = json_info['province_name']       # province
    save_info['城市'] = json_info['city_name']           # city
    save_info['所处地区'] = json_info['town_name']       # district
    save_info['招生办电话'] = json_info['phone']         # admissions office phone
    save_info['招生办官网'] = json_info['site']          # admissions office website
    df = pd.DataFrame(save_info, index=[0])
    # Write the header only when the file is created for the first time.
    header = not Path(current_path, 'college_info.csv').exists()
    df.to_csv(Path(current_path, 'college_info.csv'), index=False, mode='a', header=header)


async def main(loop):
    # Build the url list, skipping anything crawled in a previous run.
    url_list = get_url_list(5000)
    # Cap the number of requests in flight at once.
    semaphore = asyncio.Semaphore(500)
    # Create one task per url and register it with the event loop.
    tasks = [loop.create_task(get_json_data(url, semaphore)) for url in url_list]
    # Await the tasks as they finish, with a progress bar.
    for t in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
        await t


if __name__ == '__main__':
    start = time.time()
    # The selector event loop policy exists (and is needed) only on Windows.
    if sys.platform == 'win32':
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))
    loop.close()
    # Post-process: drop duplicate rows, sort by school id, and push unranked
    # schools (软科排名 == 0) to the bottom by assigning 999.
    df = pd.read_csv(Path(current_path, 'college_info.csv'))
    df.drop_duplicates(keep='first', inplace=True)
    df.sort_values('学校id', inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.loc[df['软科排名'] == 0, '软科排名'] = 999
    df.to_csv(Path(current_path, 'college_info.csv'), index=False)
    print(f'Crawl finished in {round(time.time() - start, 2)} s')
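The manually created loop with loop.run_until_complete is the older asyncio style, and asyncio.get_event_loop() is deprecated outside a running loop since Python 3.10. Below is a minimal alternative entry point using asyncio.run; this is my own adaptation, not part of the original post, and it assumes main is rewritten to take no loop argument:

import asyncio
import sys

from tqdm import tqdm

# get_url_list and get_json_data are the functions defined in the script above.

async def main():
    url_list = get_url_list(5000)
    semaphore = asyncio.Semaphore(500)
    # asyncio.create_task needs a running loop, which asyncio.run provides.
    tasks = [asyncio.create_task(get_json_data(url, semaphore)) for url in url_list]
    for t in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
        await t

if __name__ == '__main__':
    if sys.platform == 'win32':
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(main())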
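After a run, a quick sanity check on college_info.csv confirms the crawl worked. This is a sketch of mine (not from the original post), using the column names written by save_to_csv:

import pandas as pd

df = pd.read_csv('college_info.csv')
print(f'{len(df)} schools collected')
# Unranked schools were rewritten to 999 above, so an ascending sort
# puts genuinely ranked schools first.
print(df.sort_values('软科排名')[['学校名称', '学校层次', '软科排名']].head(10))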

Reposted from: 1分钟爬取全国高校信息,制成大屏可视化! ("Crawl nationwide university info in one minute and build a dashboard visualization!"), from the CSDN blog 俊红的数据分析之路.

Original link: https://blog.csdn.net/jidawanghao/article/details/125300874
