用python实现抓取天气
第一步 发现API接口
经过分析发现,杭州天气API接口
是中国天气网的接口,其返回的数据如下图所示
接下来我们分析一下这个接口:
1. URL 的前半部分是固定的请求地址,后半部分是要查询城市的 ID
2. 在中国天气网上查找后,整理出主要城市的 ID 列表:
# Lookup table of major Chinese cities: each entry maps a weather.com.cn
# city code ('code') to the city's display name ('name').
# The four tier-1 cities are listed first and then repeated inside their
# regional groups; the duplicate entries carry identical codes.
cityList_main = [  # Major cities nationwide
    # Beijing / Shanghai / Guangzhou / Shenzhen
    {'code': "101010100", 'name': "北京"},
    {'code': "101020100", 'name': "上海"},
    {'code': "101280101", 'name': "广州"},
    {'code': "101280601", 'name': "深圳"},
    # North China
    {'code': "101010100", 'name': "北京"},
    {'code': "101030100", 'name': "天津"},
    {'code': "101090101", 'name': "石家庄"},
    {'code': "101100101", 'name': "太原"},
    {'code': "101080101", 'name': "呼和浩特"},
    {'code': "101090201", 'name': "保定"},
    {'code': "101100201", 'name': "大同"},
    {'code': "101080201", 'name': "包头"},
    {'code': "101090402", 'name': "承德市"},
    {'code': "101100401", 'name': "晋中"},
    {'code': "101080501", 'name': "通辽"},
    {'code': "101091101", 'name': "秦皇岛"},
    # Northeast China
    {'code': "101050101", 'name': "哈尔滨"},
    {'code': "101060101", 'name': "长春"},
    {'code': "101070101", 'name': "沈阳"},
    {'code': "101050201", 'name': "齐齐哈尔"},
    {'code': "101060201", 'name': "吉林"},
    {'code': "101070201", 'name': "大连"},
    {'code': "101050301", 'name': "牡丹江"},
    {'code': "101060301", 'name': "延吉"},
    {'code': "101070301", 'name': "鞍山"},
    {'code': "101050501", 'name': "绥化"},
    {'code': "101060601", 'name': "白城"},
    {'code': "101071401", 'name': "葫芦岛"},
    # South China
    {'code': "101280101", 'name': "广州"},
    {'code': "101300101", 'name': "南宁"},
    # The name below was a mis-encoded, unterminated string in the original;
    # 101310101 is the weather.com.cn code for Haikou.
    {'code': "101310101", 'name': "海口"},
    {'code': "101320101", 'name': "香港"},
    {'code': "101330101", 'name': "澳门"},
    {'code': "101280601", 'name': "深圳"},
    {'code': "101300501", 'name': "桂林"},
    {'code': "101310201", 'name': "三亚"},
    {'code': "101280701", 'name': "珠海"},
    {'code': "101281701", 'name': "中山"},
    {'code': "101301001", 'name': "百色"},
    {'code': "101310215", 'name': "万宁"},
    # Northwest China
    {'code': "101110101", 'name': "西安"},
    {'code': "101160101", 'name': "兰州"},
    {'code': "101150101", 'name': "西宁"},
    {'code': "101170101", 'name': "银川"},
    {'code': "101130101", 'name': "乌鲁木齐"},
    {'code': "101110300", 'name': "延安"},
    {'code': "101110901", 'name': "宝鸡"},
    {'code': "101160901", 'name': "天水"},
    {'code': "101170301", 'name': "吴忠"},
    {'code': "101130501", 'name': "吐鲁番"},
    {'code': "101160801", 'name': "酒泉"},
    {'code': "101170401", 'name': "固原"},
    # Southwest China
    {'code': "101040100", 'name': "重庆"},
    {'code': "101270101", 'name': "成都"},
    {'code': "101260101", 'name': "贵阳"},
    {'code': "101290101", 'name': "昆明"},
    {'code': "101140101", 'name': "拉萨"},
    {'code': "101270401", 'name': "绵阳"},
    {'code': "101260201", 'name': "遵义"},
    {'code': "101290201", 'name': "大理"},
    {'code': "101271401", 'name': "乐山"},
    {'code': "101260801", 'name': "六盘水"},
    {'code': "101291401", 'name': "丽江"},
    # East China
    {'code': "101020100", 'name': "上海"},
    {'code': "101230101", 'name': "福州"},
    {'code': "101220101", 'name': "合肥"},
    {'code': "101240101", 'name': "南昌"},
    {'code': "101120101", 'name': "济南"},
    {'code': "101210301", 'name': "嘉兴"},
    {'code': "101190101", 'name': "南京"},
    {'code': "101210401", 'name': "宁波"},
    {'code': "101210101", 'name': "杭州"},
    {'code': "101190401", 'name': "苏州"},
    {'code': "101120201", 'name': "青岛"},
    {'code': "101230201", 'name': "厦门"},
    {'code': "101340101", 'name': "台北市"},
    # Central China
    {'code': "101180101", 'name': "郑州"},
    {'code': "101200101", 'name': "武汉"},
    {'code': "101250101", 'name': "长沙"},
    {'code': "101180201", 'name': "安阳"},
    {'code': "101200201", 'name': "襄阳"},
    {'code': "101250201", 'name': "湘潭"},
    {'code': "101250301", 'name': "株洲"},
    {'code': "101180401", 'name': "许昌"},
    {'code': "101250601", 'name': "常德"},
    {'code': "101251101", 'name': "张家界"},
    {'code': "101200401", 'name': "孝感"},
    {'code': "101201401", 'name': "荆门"},
]
第二步 爬取相关数据
1. 因为这是服务器端的功能,需要获取访问者的 IP 地址,而前端通过 JS 获取的 IP 不够精确
2. 所以可以利用 web 请求的环境信息(web.ctx.env),获取发起网络请求的 IP 地址:
def check_ip_address():
    """Return the client IP address of the current web request.

    Reads ``HTTP_X_FORWARDED_FOR`` from ``web.ctx.env`` and falls back to
    ``HTTP_REMOTEIP`` when that header is missing or empty. If the value is
    a comma-separated list, the last entry is used, with surrounding
    whitespace removed.

    Returns:
        The IP address as a string. When no usable address is present, or
        when reading the request environment fails, the hard-coded default
        ``'115.236.171.18'`` is returned, so the result is never ``None``.
    """
    default_ip = '115.236.171.18'
    ip = ''
    try:
        env = web.ctx.env
        ips = env.get('HTTP_X_FORWARDED_FOR') or env.get('HTTP_REMOTEIP')
        if ips:
            # Entries are usually separated by ", ", so strip the padding.
            ip = ips.split(',')[-1].strip()
    except Exception:
        # Deliberate best-effort: any failure while reading the request
        # environment degrades to the default address instead of raising.
        ip = ''
    return ip or default_ip
===
1. 我们已经获取到 IP 地址了,但还无法得到城市名称,这时候就应该想到如何根据 IP 去查询所在地区
2. 我们通过站长之家提供的工具查询 IP 所属地区,获取相关信息,这样就知道了地区名;接下来就是通过地区名去查天气了
def get_urls(html):
    """Extract the text of every ``<span class="Whwtdhalf w50-0">`` in *html*.

    Args:
        html: Page source to scan.

    Returns:
        A list with the inner text of each matching span, in document
        order; empty when nothing matches.
    """
    span_re = re.compile('<span class="Whwtdhalf w50-0">(.*?)</span>')
    return span_re.findall(html)
def ip_change_city(ip_address):
    """Resolve an IP address to a city name via the ip.chinaz.com lookup page.

    The page is fetched with ``get_html`` and the span texts are extracted
    with ``get_urls``. The second extracted span is used as the location
    text (presumably the geographic description; confirm against the
    current page layout).

    Args:
        ip_address: IP address string appended to the lookup URL.

    Returns:
        The text between "省" and "市" in the location, or, failing that,
        the text before "市". ``None`` when the page yields fewer than two
        spans or neither pattern matches.
    """
    ip_info = get_urls(get_html('http://ip.chinaz.com/' + ip_address))
    if len(ip_info) < 2:
        # The page did not contain the expected spans; report "no city"
        # instead of raising IndexError.
        return None
    location = ip_info[1]
    # Try the "province + city" form first, then the bare "city" form.
    for pattern in ("省(.*)市.*", "(.*)市.*"):
        found = re.findall(pattern, location)
        if found:
            # If there are several matches, the last one is used.
            return found[-1]
    return None
===
根据地址去查天气信息
# Fetch the weather information for a city code and return it.
def get_weather(d):
    """Download and decode the weather record for one city.

    Args:
        d: City code string, inserted into the weather.com.cn URL.

    Returns:
        The value stored under the ``'weatherinfo'`` key of the decoded
        JSON response.

    Raises:
        KeyError: if the response has no ``'weatherinfo'`` key.
        ValueError: if the response body is not valid JSON.
        Errors raised by ``urllib2.urlopen`` propagate unchanged.
    """
    url = 'http://www.weather.com.cn/data/cityinfo/' + d + '.html'
    # Alternative endpoint left in the original as commented-out code:
    #   'http://www.weather.com.cn/data/sk/' + d + '.html'
    response = urllib2.urlopen(url)
    try:
        weatherHtml = response.read()
    finally:
        # Always release the connection, even if the read fails.
        response.close()
    weatherJSON = json.JSONDecoder().decode(weatherHtml)
    return weatherJSON['weatherinfo']
# Resolve a city name to its city code, then fetch the weather via get_weather.
def city_weather_info(cityname):
    """Return the weather info for the city called *cityname*.

    The code of the first ``cityList_main`` entry whose ``'name'`` equals
    *cityname* is used. When no entry matches, the code ``"101210101"``
    (listed in ``cityList_main`` as Hangzhou) is used instead.
    """
    matching_codes = (
        entry['code'] for entry in cityList_main if entry['name'] == cityname
    )
    return get_weather(next(matching_codes, "101210101"))
# Get the weather information for one city.
def print_weather_info(cityname):
    """Return the weather info for *cityname* after trimming its ends.

    The first and last characters of *cityname* are dropped before the
    lookup (presumably the caller passes the name wrapped in one
    delimiter on each side, such as quotes; confirm against the caller).
    Despite the name, nothing is printed; the info is returned.
    """
    return city_weather_info(cityname[1:-1])
def get_html(url):
    """Fetch *url* with a desktop-browser User-Agent and return the body.

    Args:
        url: Address to request.

    Returns:
        The raw response body as returned by ``response.read()``.

    Raises:
        Errors raised by ``urllib2.urlopen`` propagate unchanged.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Always release the connection, even if the read fails.
        response.close()