Scraping web data with Python

Fetch all the data and save it to a CSV file

Imported libraries

  • import requests
  • from bs4 import BeautifulSoup
    • the library used to parse HTML
  • import csv
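
Put together, the top of the script is just these three imports (a restatement of the list above):

import requests                  # HTTP requests to pm25.in
from bs4 import BeautifulSoup    # HTML parsing
import csv                       # writing the result file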

The main function

def main():
    city_list = get_all_citys()

    header = ['city', 'AQI', 'pm2.5/h', 'pm10/h', 'co/h', 'no2/h', 'o3/h', 'o3/8h', 'so2/h']
    # Open the file for writing; it is created if it does not already exist
    with open('china_city_aqi.csv', 'w', encoding='utf-8', newline='') as f:
        # Get a csv writer
        writer = csv.writer(f)
        # Write the header row
        writer.writerow(header)
        # enumerate yields the index along with each city
        for i, city in enumerate(city_list):
            city_name = city[0]
            city_pinyin = city[1]
            city_aqi = get_city_aqi(city_pinyin)
            # List concatenation: the city name followed by its 8 values
            row = [city_name] + city_aqi
            writer.writerow(row)
            if (i + 1) % 10 == 0:
                print('Processed {} of {} records'.format(i + 1, len(city_list)))


if __name__ == '__main__':
    main()
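
As a quick sanity check (a minimal sketch, assuming the script above has already produced china_city_aqi.csv in the working directory), the file can be read back with the same csv module:

import csv

with open('china_city_aqi.csv', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    # Print the header plus the first few data rows
    for row in list(reader)[:4]:
        print(row)

Each data row should have 9 columns: the city name followed by the 8 values written in main().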

Getting the names and links of all cities: get_all_citys()

def get_all_citys():
    # URL of the front page
    url = 'http://pm25.in/'
    # Fetch the HTML of the whole page
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')
    city_div = soup.find('div', {'class': 'all'}).find('div', {'class': 'bottom'})
    city_link = city_div.find_all('a')
    city_list = []
    for link in city_link:
        city_name = link.text
        # href="/beijing"
        city_pinyin = link['href'][1:]  # strip the leading slash
        # Store each city as a (name, pinyin) tuple
        city_list.append((city_name, city_pinyin))
    return city_list
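
A quick way to inspect the result (a sketch; the exact number and order of cities depends on what pm25.in lists on its front page):

city_list = get_all_citys()
print(len(city_list))    # how many cities were found on the front page
print(city_list[:3])     # (name, pinyin) tuples, e.g. ('北京', 'beijing') judging by the href above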

Getting the data for a specific city: get_city_aqi(city)

def get_city_aqi(city):
    url = 'http://pm25.in/' + city
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')
    div_list = soup.find_all('div', {'class': 'span1'})

    city_aqi = []
    # There are 8 values to collect
    for i in range(8):
        div_content = div_list[i]
        # strip() removes leading/trailing whitespace
        caption = div_content.find('div', {'class': 'caption'}).text.strip()
        value = div_content.find('div', {'class': 'value'}).text.strip()
        # city_aqi.append((caption, value))
        city_aqi.append(value)
    return city_aqi
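
To see how the returned values line up with the CSV header used in main(), a small hypothetical check can pair them up ('beijing' is taken from the href example shown earlier; the pairing assumes the page lists the values in the same order as the header):

header = ['city', 'AQI', 'pm2.5/h', 'pm10/h', 'co/h', 'no2/h', 'o3/h', 'o3/8h', 'so2/h']
values = get_city_aqi('beijing')
# The 8 values correspond, in page order, to the 8 data columns after 'city'
for name, value in zip(header[1:], values):
    print(name, value)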