Abracadabra

Crawl the housing information and save it to an Excel file

数据来源

http://sh.fang.com/

项目目标

爬取二手房信息中的小区信息

实现步骤

【1】爬取小区信息(核心代码,下同)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
""" <A spider to crawl the house information.>
Copyright (C) <2017> Li W.H., Duan X
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
class HouseSpider(scrapy.Spider):
    """Crawl residential-estate ("xiaoqu") detail pages from sh.fang.com.

    Flow: the housing index page yields one request per district
    (parse), each district's listing pages are paged through while
    collecting detail-page URLs (parse_area), and finally every detail
    page is scraped and appended to a per-district text file
    (parse_house).
    """
    name = "house"
    head = "http://esf.sh.fang.com"
    allowed_domains = ["sh.fang.com"]
    start_urls = [
        "http://esf.sh.fang.com/housing/"
    ]
    # District id -> pinyin name; the ids come from the site's URL scheme.
    area_map = {25: 'pudong', 18: 'minhang', 19: 'xuhui', 30: 'baoshan',
                28: 'putuo', 20: 'changning', 26: 'yangpu', 586: 'songjiang',
                29: 'jiading', 23: 'hongkou', 27: 'zhabei', 21: 'jingan',
                24: 'huangpu', 22: 'luwan', 31: 'qingpu', 32: 'fengxian',
                35: 'jinshan', 996: 'chongming'}
    # Record separator written between houses in the output files.
    # (Name kept as-is -- downstream code splits on this exact value.)
    seperator = '=\n'

    def __init__(self, *args, **kwargs):
        # BUGFIX: call the scrapy.Spider initializer; the original skipped it.
        super().__init__(*args, **kwargs)
        # BUGFIX: the original mutated a class-level dict shared by every
        # spider instance; make the accumulator per-instance instead.
        # Maps district id -> list of collected detail-page URLs.
        self.estate_to_area_map = {key: [] for key in self.area_map}

    def parse(self, response):
        """Yield one request per district link on the housing index page."""
        area_lis = response.xpath('//*[@id="houselist_B03_02"]/div[1]')
        for a in area_lis.xpath('./a'):
            yield Request(self.head + a.xpath('@href').extract()[0],
                          callback=self.parse_area)

    def parse_area(self, response):
        """Collect estate detail URLs for one district and follow paging.

        Once the last listing page is reached (no "next" link), fan out
        requests to parse_house for every collected URL.
        """
        # Derive the district id from the request URL: the site encodes it
        # as the first '_'-separated token of the last path segment.
        area_index = str(response).split('/')[-2].split('_')[0]
        if area_index == '':
            return
        detail_str = 'xiangqing'
        estate_list = response.xpath('/html/body/div[4]/div[5]/div[4]')
        for a in estate_list.xpath('.//a[@class="plotTit"]'):
            estate_url = a.xpath('@href').extract()[0]
            # Rewrite the listing ("esf") URL into the detail page URL.
            if 'esf' in estate_url:
                estate_url = estate_url.replace('esf', detail_str)
            else:
                estate_url = estate_url + detail_str
            # Keep only absolute URLs.
            if 'http' in estate_url:
                self.estate_to_area_map[int(area_index)].append(estate_url)
        next_page = response.xpath('//*[@id="PageControl1_hlk_next"]')
        if len(next_page) != 0:
            yield Request(self.head +
                          next_page.xpath('@href').extract()[0],
                          callback=self.parse_area)
        else:
            # Last page reached: request every collected detail page.
            for url in self.estate_to_area_map[int(area_index)]:
                request = Request(url, callback=self.parse_house,
                                  dont_filter=True)
                # Carry the district id so parse_house knows the out-file.
                request.meta['index'] = int(area_index)
                yield request

    def parse_house(self, response):
        """Scrape one estate detail page and append it to <district>.txt.

        Records in the file are separated by ``self.seperator``.
        """
        area_index = response.meta['index']
        area_name = self.area_map[area_index]
        filename = area_name + '.txt'
        # Two detail-page layouts exist; only the common one is handled,
        # pages with the alternate layout are skipped.
        house_name = response.xpath(
            '/html/body/div[4]/div[2]/div[2]/h1/a/text()')
        if len(house_name) == 0:
            return
        house_name = house_name.extract()[0]
        # Strip the site's "小区网" suffix from the estate name.
        house_name = re.sub(r'小区网', '', house_name)
        result_str = '【小区名称】' + house_name + '\n'
        # NOTE(review): assumes the price node exists on this layout;
        # extract()[0] raises IndexError otherwise -- TODO confirm.
        avg_price_xpath = response.xpath(
            '/html/body/div[4]/div[4]/div[1]/div[1]/dl[1]/dd/span/text()')
        avg_price = avg_price_xpath.extract()[0]
        result_str = result_str + '【平均价格】' + avg_price + '\n'
        detail_block_list = response.xpath(
            '/html/body/div[4]/div[4]/div[1]')
        for headline in detail_block_list.xpath('.//h3'):
            head_str = headline.xpath('./text()').extract()[0]
            if head_str == '基本信息':
                result_str = result_str + '【' + head_str + '】\n'
                for item in headline.xpath(
                        '../../div[@class="inforwrap clearfix"]/dl/dd'):
                    # Emit "<label><value>" only when both parts exist.
                    if len(item.xpath('./strong/text()')) != 0:
                        if len(item.xpath('./text()')) != 0:
                            result_str = result_str + item.xpath(
                                './strong/text()').extract()[0]
                            result_str = result_str + item.xpath(
                                './text()').extract()[0] + '\n'
            elif head_str == '就近楼群':
                result_str = result_str + '【' + head_str + '】\n'
                for item in headline.xpath(
                        '../../div[@class="inforwrap clearfix"]/dl/dd'):
                    result_str = result_str + \
                        item.xpath('./a/text()').extract()[0] + '\n'
        result_str = result_str + self.seperator
        with open(filename, 'a', errors='ignore') as f:
            f.write(result_str)

【2】格式化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
""" <A formatter>
Copyright (C) <2017> Li W.H., Duan X
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
def GetDataFileList(path='.'):
    """ Get the houses data file list.

    Arguments:
        path: Directory to scan (defaults to the working directory).
    Returns:
        file_list: names (not full paths) of the '.txt' data files
            found directly inside `path`.
    """
    # BUGFIX: join the entry with `path` before the isfile() test.
    # The original checked bare names against the current working
    # directory, so any `path` other than '.' returned an empty list.
    file_list = [x for x in os.listdir(path)
                 if os.path.isfile(os.path.join(path, x))
                 and os.path.splitext(x)[1] == '.txt']
    return file_list
def Parse(file_list):
    """ Parse the txt files that hold houses data.

    Extract important information such as house name,
    average price, address and so on.

    Arguments:
        file_list: the list of data files that hold houses data.
    Returns:
        houses_dict_list: a list where each item is the detail
            dict of one house.
    """
    HOUSE_NAME = '小区名称'
    HOUSE_NAME_SPLITOR = '】'
    HOUSE_ADDRESS = '小区地址'
    HOUSE_ADDRESS_SPLITOR = ':'
    HOUSE_AVG_PRICE = '平均价格'
    HOUSE_AVG_PRICE_SPLITOR = '】'
    AREA_OF_HOUSE_BELONGS_TO = '所属区域'
    AREA_OF_HOUSE_BELONGS_TO_SPLITOR_1 = ':'
    AREA_OF_HOUSE_BELONGS_TO_SPLITOR_2 = ' '
    PROPERTY_CATEGORY = '物业类别'
    PROPERTY_CATEGORY_SPLITOR = ':'
    GREEN_RATE = '绿 化 率'
    GREEN_RATE_SPLITOR = ':'
    VOLUME_RATE = '容 积 率'
    VOLUME_RATE_SPLITOR = ':'
    PROPERTY_COSTS = '物 业 费'
    PROPERTY_COSTS_SPLITOR = ':'
    NO_INFO_NOW = '暂无信息'
    DETAIL_LIST = [HOUSE_NAME, HOUSE_AVG_PRICE, HOUSE_ADDRESS,
                   AREA_OF_HOUSE_BELONGS_TO, PROPERTY_CATEGORY,
                   GREEN_RATE, VOLUME_RATE, PROPERTY_COSTS]
    houses_dict_list = []
    for file_name in file_list:
        # IMPROVED: read the whole file at once instead of concatenating
        # line by line (string += in a loop is quadratic in the worst case).
        with open(file_name, 'r', errors='ignore') as f:
            raw_houses_string = f.read()
        # Records are separated by '=\n' (the spider's record separator).
        raw_houses_list = raw_houses_string.split('=\n')
        for raw_house in raw_houses_list:
            # Drop the trailing empty chunk produced by the final newline.
            raw_house_details = raw_house.split('\n')[:-1]
            if not raw_house_details:
                continue
            house_details_dict = {}
            # IMPROVED: plain substring tests replace re.search(); the
            # markers are fixed literals with no regex metacharacters,
            # so the matching behavior is identical. Branch order is
            # preserved: a line is consumed by the first marker found.
            for raw_detail in raw_house_details:
                if HOUSE_NAME in raw_detail:
                    house_details_dict[HOUSE_NAME] = raw_detail.split(
                        HOUSE_NAME_SPLITOR)[-1]
                elif HOUSE_AVG_PRICE in raw_detail:
                    house_details_dict[HOUSE_AVG_PRICE] = raw_detail.split(
                        HOUSE_AVG_PRICE_SPLITOR)[-1]
                elif HOUSE_ADDRESS in raw_detail:
                    house_details_dict[HOUSE_ADDRESS] = raw_detail.split(
                        HOUSE_ADDRESS_SPLITOR)[-1]
                elif AREA_OF_HOUSE_BELONGS_TO in raw_detail:
                    # The value sits after the fullwidth colon and before
                    # the first space.
                    temp_detail_value = raw_detail.split(
                        AREA_OF_HOUSE_BELONGS_TO_SPLITOR_1)[-1]
                    house_details_dict[AREA_OF_HOUSE_BELONGS_TO] = \
                        temp_detail_value.split(
                            AREA_OF_HOUSE_BELONGS_TO_SPLITOR_2)[0]
                elif PROPERTY_CATEGORY in raw_detail:
                    house_details_dict[PROPERTY_CATEGORY] = raw_detail.split(
                        PROPERTY_CATEGORY_SPLITOR)[-1]
                elif GREEN_RATE in raw_detail:
                    house_details_dict[GREEN_RATE] = raw_detail.split(
                        GREEN_RATE_SPLITOR)[-1]
                elif VOLUME_RATE in raw_detail:
                    house_details_dict[VOLUME_RATE] = raw_detail.split(
                        VOLUME_RATE_SPLITOR)[-1]
                elif PROPERTY_COSTS in raw_detail:
                    house_details_dict[PROPERTY_COSTS] = raw_detail.split(
                        PROPERTY_COSTS_SPLITOR)[-1]
            # Fill every detail that was not found with the placeholder.
            for detail_name in DETAIL_LIST:
                house_details_dict.setdefault(detail_name, NO_INFO_NOW)
            houses_dict_list.append(house_details_dict)
    return houses_dict_list

【3】通过高德地图api获取经纬度信息

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
""" <A toolto transfer position.>
Copyright (C) <2017> Li W.H., Duan X
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
def Geocode(address):
    """ A tool that calls the AMap (Gaode map) geocoding api.

    Arguments:
        address: the address to transfer.
    Returns:
        answer: the decoded JSON response; on request failure, a stub
            with an empty 'geocodes' list so callers can proceed.
    """
    CITY_NAME = '上海'
    # NOTE(review): 'your key' is a placeholder -- a real AMap api key
    # must be supplied before this works.
    parameters = {'address': address,
                  'key': 'your key',
                  'city': CITY_NAME}
    base = 'http://restapi.amap.com/v3/geocode/geo'
    try:
        response = requests.get(base, parameters)
    except Exception as e:
        # BUGFIX: the original only printed here and then fell through to
        # response.json(), raising NameError on an unbound `response`.
        # Return an empty result instead so callers see "not found".
        print('error!', e)
        return {'geocodes': []}
    answer = response.json()
    return answer
def GETGodMapLocation(houses):
    """ Get the location that corresponds to each house name.

    Uses the AMap (Gaode map) api to geocode the house name; on a miss,
    reworks the address once and retries.

    Arguments:
        houses: the houses info (list of detail dicts).
    Returns:
        houses_dict_list: a shallow copy of `houses` in which every
            dict gained a '经纬度' (location) entry, or '暂无信息'
            when geocoding failed twice. NOTE: the copy is shallow, so
            the dicts inside the caller's list are mutated too.
    """
    HOUSE_NAME = '小区名称'
    HOUSE_LOCATION = '经纬度'
    NO_INFO_NOW = '暂无信息'
    houses_dict_list = houses.copy()
    error_count = 0
    size = len(houses)
    for count, house_dict in enumerate(houses_dict_list, start=1):
        # Progress indicator for long runs.
        if count % 1000 == 0:
            print(count, '/', size)
        address = house_dict[HOUSE_NAME]
        answer = Geocode(address)
        if len(answer['geocodes']) != 0:
            house_dict[HOUSE_LOCATION] = answer['geocodes'][0]['location']
            continue
        # Rework the address the geocoder rejected, then retry once.
        if re.search(r'别墅', address):
            # BUGFIX: the original discarded re.sub's return value, so
            # the retry was issued with the unchanged address.
            address = re.sub(r'别墅', '', address)
        else:
            address = address + '小区'
        answer = Geocode(address)
        if len(answer['geocodes']) != 0:
            house_dict[HOUSE_LOCATION] = answer['geocodes'][0]['location']
        else:
            error_count += 1
            house_dict[HOUSE_LOCATION] = NO_INFO_NOW
    print('error counts: ', error_count)
    return houses_dict_list

【4】存储成excel文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
""" <A tool to save the excel file.>
Copyright (C) <2017> Li W.H., Duan X
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
def Save2ExcelFile(houses):
    """ Save the python based list of house dicts to an excel file.

    Writes 'houses.xls' with one column per detail key and one row per
    house; the first row holds the key names as headers.

    Arguments:
        houses: the houses list (list of detail dicts).
    """
    # ROBUSTNESS: nothing to save -- the original crashed on houses[0].
    if not houses:
        return
    # Transpose into columns: house_list[c] is the column for key c,
    # starting with the key itself as the header cell.
    keys = houses[0].keys()
    house_list = []
    for key in keys:
        column = [key]
        for house_dict in houses:
            # ROBUSTNESS: .get() guards against a dict missing a key
            # (the original raised KeyError).
            column.append(house_dict.get(key, ''))
        house_list.append(column)
    xls = ExcelWrite.Workbook()
    sheet = xls.add_sheet('小区信息')
    for col in range(len(house_list)):
        for row in range(len(house_list[0])):
            # xlwt signature is write(row, col, value).
            sheet.write(row, col, house_list[col][row])
    xls.save('houses.xls')

结果展示

spider_house_txt_result

spider_house_excel_result