Scraping web data with Python

Fetch all the data and save it to a CSV file

Imported libraries

  • import requests
  • from bs4 import BeautifulSoup
    • the library used to parse HTML
  • import csv
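
Put together, the top of the script is just these three imports (a restatement of the list above):

import requests                  # HTTP requests to pm25.in
from bs4 import BeautifulSoup    # HTML parsing
import csv                       # writing the result file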

The main function

def main():
    city_list = get_all_citys()

    header = ['city', 'AQI', 'pm2.5/h', 'pm10/h', 'co/h', 'no2/h', 'o3/h', 'o3/8h', 'so2/h']
    # Open the file for writing; it is created if it does not already exist
    with open('china_city_aqi.csv', 'w', encoding='utf-8', newline='') as f:
        # Get a csv writer
        writer = csv.writer(f)
        # Write the header row
        writer.writerow(header)
        # enumerate yields the index along with each city
        for i, city in enumerate(city_list):
            city_name = city[0]
            city_pinyin = city[1]
            city_aqi = get_city_aqi(city_pinyin)
            # List concatenation: the city name followed by its 8 values
            row = [city_name] + city_aqi
            writer.writerow(row)
            if (i + 1) % 10 == 0:
                print('Processed {} of {} records'.format(i + 1, len(city_list)))


if __name__ == '__main__':
    main()
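
As a quick sanity check (a minimal sketch, assuming the script above has already produced china_city_aqi.csv in the working directory), the file can be read back with the same csv module:

import csv

with open('china_city_aqi.csv', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    # Print the header plus the first few data rows
    for row in list(reader)[:4]:
        print(row)

Each data row should have 9 columns: the city name followed by the 8 values written in main().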

Getting the names and links of all cities: get_all_citys()

def get_all_citys():
    # URL of the front page
    url = 'http://pm25.in/'
    # Fetch the HTML of the whole page
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')
    city_div = soup.find('div', {'class': 'all'}).find('div', {'class': 'bottom'})
    city_link = city_div.find_all('a')
    city_list = []
    for link in city_link:
        city_name = link.text
        # href="/beijing"
        city_pinyin = link['href'][1:]  # strip the leading slash
        # Store each city as a (name, pinyin) tuple
        city_list.append((city_name, city_pinyin))
    return city_list
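
A quick way to inspect the result (a sketch; the exact number and order of cities depends on what pm25.in lists on its front page):

city_list = get_all_citys()
print(len(city_list))    # how many cities were found on the front page
print(city_list[:3])     # (name, pinyin) tuples, e.g. ('北京', 'beijing') judging by the href above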

Getting the data for a specific city: get_city_aqi(city)

def get_city_aqi(city):
    url = 'http://pm25.in/' + city
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')
    div_list = soup.find_all('div', {'class': 'span1'})

    city_aqi = []
    # There are 8 values to collect
    for i in range(8):
        div_content = div_list[i]
        # strip() removes leading/trailing whitespace
        caption = div_content.find('div', {'class': 'caption'}).text.strip()
        value = div_content.find('div', {'class': 'value'}).text.strip()
        # city_aqi.append((caption, value))
        city_aqi.append(value)
    return city_aqi
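
To see how the returned values line up with the CSV header used in main(), a small hypothetical check can pair them up ('beijing' is taken from the href example shown earlier; the pairing assumes the page lists the values in the same order as the header):

header = ['city', 'AQI', 'pm2.5/h', 'pm10/h', 'co/h', 'no2/h', 'o3/h', 'o3/8h', 'so2/h']
values = get_city_aqi('beijing')
# The 8 values correspond, in page order, to the 8 data columns after 'city'
for name, value in zip(header[1:], values):
    print(name, value)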