Fetching the Data
Scrape the second-hand housing listings and store them in MongoDB.
The code was written rather casually, so bear with it. (April 11, 2019)
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

client = MongoClient()
collection = client.lianjia.secondHouse

prefix = "https://nj.lianjia.com"
areas = ["/ershoufang/gulou/", "/ershoufang/jianye/", "/ershoufang/qinhuai/",
         "/ershoufang/xuanwu/", "/ershoufang/yuhuatai/", "/ershoufang/qixia/", "/ershoufang/jiangning/",
         "/ershoufang/pukou/", "/ershoufang/liuhe/", "/ershoufang/lishui/", "/ershoufang/gaochun/",
         ]
areaChineses = ['鼓楼', '建邺', '秦淮', '玄武', '雨花台', '栖霞', '江宁', '浦口', '六合', '溧水', '高淳']
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
city = '南京'
curtime = datetime.now()  # timestamp of the crawl (not stored with the records below)

Aindex = 0  # running count of inserted listings
for area, areaCN in zip(areas, areaChineses):
    for index in range(1, 101):  # Lianjia shows at most 100 result pages per district
        page = "" if index == 1 else ("pg" + str(index))
        url = prefix + area + page
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text, 'html.parser')
        houses = soup.find_all('li', class_='clear LOGCLICKDATA')
        try:
            for house in houses:
                title = str(house.find('div', 'title').find('a').text).strip()
                url = str(house.find('div', 'title').find('a')['href']).strip()
                # houseInfo is roughly "小区名 | 2室1厅 | 75.5平米 | ...", split on '|'
                basicInfo = str(house.find('div', 'houseInfo').text).strip().split('|')
                address = basicInfo[0].strip()
                houseType = basicInfo[1].strip()
                size = float(basicInfo[2].strip()[:-2])  # strip the trailing "平米"
                # followInfo is roughly "95人关注 / 共13次带看 / ...", pull out the two counts
                followers = int(str(house.find('div', 'followInfo').text.split("/")[0].split("人")[0]).strip())
                hasSeen = int(str(house.find('div', 'followInfo').text.split("/")[1].split("次")[0][2:]).strip())
                price = float(str(house.find('div', 'totalPrice').find("span").text).strip())  # total price (万元)
                meanPrice = float(str(house.find('div', 'unitPrice').find("span").text.split("元")[0][2:]).strip())  # unit price (元/平米)
                data = dict(
                    city=city,
                    area=areaCN,
                    title=title,
                    url=url,
                    address=address,
                    houseType=houseType,
                    size=size,
                    followers=followers,
                    hasSeen=hasSeen,
                    price=price,
                    meanPrice=meanPrice,
                )
                collection.insert_one(data)
                Aindex += 1
        except Exception:
            print("Failed to insert listing", Aindex)
        finally:
            time.sleep(10)  # throttle requests between pages
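After the crawl finishes, a quick sanity check (a minimal sketch, assuming the same local MongoDB instance as above) is to compare the number of documents in the collection with the Aindex counter:

from pymongo import MongoClient

client = MongoClient()
# Should roughly match the Aindex counter tracked during the crawl
print(client.lianjia.secondHouse.count_documents({}))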
Read the data back from MongoDB and export it to a local file
import pandas as pd
from pymongo import MongoClient

client = MongoClient()
collection = client.lianjia.secondHouse

df = pd.DataFrame(list(collection.find()))
df.to_csv('lianjia.csv', index=False)
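As a side note, the _id field that gets deleted later on could also be excluded at query time with a MongoDB projection; a small variation on the export above, assuming the same collection:

# Exclude MongoDB's ObjectId column directly in the query
df = pd.DataFrame(list(collection.find({}, {"_id": 0})))
df.to_csv('lianjia.csv', index=False)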
Load the data from the local file
The data is quite clean.
import pandas as pd

df = pd.read_csv("./lianjia.csv")
del df["_id"]  # drop the MongoDB ObjectId column
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29300 entries, 0 to 29299
Data columns (total 11 columns):
address 29300 non-null object
area 29300 non-null object
city 29300 non-null object
followers 29300 non-null int64
hasSeen 29300 non-null int64
houseType 29300 non-null object
meanPrice 29300 non-null float64
price 29300 non-null float64
size 29300 non-null float64
title 29300 non-null object
url 29300 non-null object
dtypes: float64(3), int64(2), object(6)
memory usage: 2.5+ MB
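One extra check that is not in the original notebook but is cheap to run: because the crawler walks up to 100 result pages per district, the detail-page URL can be used to look for (and drop) duplicated listings.

# Number of rows whose detail-page URL appears more than once
print(df.duplicated(subset="url").sum())
df = df.drop_duplicates(subset="url")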
Data Analysis
Perform some basic analysis to surface the explicit relationships in the data.
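For the numeric columns, a correlation matrix gives a quick first look at those relationships; a minimal sketch on the DataFrame loaded above:

# Pairwise Pearson correlations between the numeric columns
num_cols = ["size", "price", "meanPrice", "followers", "hasSeen"]
print(df[num_cols].corr())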
Layout (houseType) Analysis
Popularity Analysis
The most popular layout is 2室1厅 (2 bedrooms, 1 living room), followed by 3室2厅 and 2室2厅; the least popular are 2室0厅, 3室0厅, 4室1厅, and 5室3厅.
[Interactive Plotly bar chart of listing counts per houseType: https://plot.ly/~Jansora/5.embed]
The Python code that produces the chart:
import plotly.plotly as py       # legacy Plotly online API (newer releases moved this to chart_studio.plotly)
import plotly.graph_objs as go

x = []
y = []
for label, _df in df.groupby(by='houseType'):
    x.append(label)          # layout label, e.g. "2室1厅"
    y.append(_df.shape[0])   # number of listings with that layout
py.iplot([go.Bar(x=x, y=y)], filename='linajia-houseType')
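The ranking behind the chart can also be read off directly with value_counts, which sorts layouts by listing count; a minimal check of the claim above:

counts = df["houseType"].value_counts()
print(counts.head(3))   # most common layouts, e.g. 2室1厅
print(counts.tail(4))   # rarest layouts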