# 2018-08-01
# Python爬虫代码——拉勾数据分析师岗位数据分析


import pandas as pd
import numpy as np

# Path to the '@@@'-delimited text dump of job postings.  A raw string keeps
# the Windows backslashes literal instead of relying on unrecognised escape
# sequences such as '\c' and '\p', which newer Python versions warn about.
fi = r'F:\cs\python\code\lagou_dataanalysis_craping\lagou3.0.txt'

# read_table() loads the file using the 'gbk' encoding; each line ends up in
# a single column as long as it contains no tab characters.
# NOTE(review): read_table() treats the first line as a header by default, so
# if the dump has no header row its first record is consumed as the column
# name -- confirm against the file.
df = pd.read_table(fi, encoding='gbk')

df.head()
# Split the single raw column on the '@@@' separator, one column per field.
df = df.iloc[:, 0].str.split('@@@', expand=True)
df.columns = ['city','companyFullName','companyId','companyLabelList','companyShortName','companySize','businessZones','firstType','secondType','education','industryField','positionId','positionAdvantage','positionName','positionLables','salary','workYear']
# Preview the first five rows (only displayed in an interactive session).
df.head()



# Keep the first row seen for each positionId and drop later repeats
# (keep='last' would keep the final occurrence instead).  The explicit copy
# gives an independent frame, so adding columns to it later in the script does
# not trigger pandas' SettingWithCopyWarning.
df_duplicates = df.drop_duplicates(subset='positionId', keep='first').copy()
def cut_word(word, method):
    """Extract the lower or upper bound from a salary string such as "7k-8k".

    Args:
        word: Salary text, e.g. "10k-15k", "10K-15K" or "7k以上".
        method: "bottom" returns the lower bound; any other value returns
            the upper bound.

    Returns:
        The requested bound as a string without the "k" unit.  When the text
        is not a "low-high" range, the single figure is used for both bounds.

    Raises:
        ValueError: If ``word`` contains no number at all.
    """
    # Imported locally: the script's own ``import re`` only appears after the
    # first call to this function.
    import re

    # Pull the numbers out directly instead of slicing by character position,
    # so surrounding whitespace, upper/lower-case units and trailing text do
    # not corrupt the result.
    numbers = re.findall(r'\d+(?:\.\d+)?', word)
    if not numbers:
        raise ValueError("no salary figure found in %r" % (word,))

    bottomsalary = numbers[0]
    # Only a dash-separated range carries a distinct upper bound.
    if '-' in word and len(numbers) > 1:
        topsalary = numbers[1]
    else:
        topsalary = bottomsalary

    if method == "bottom":
        return bottomsalary
    return topsalary
# Derive both salary bounds from the raw salary text.  Series.apply() forwards
# the extra keyword argument ``method`` to cut_word for every value.
df_duplicates['topsalary'] = df_duplicates.salary.apply(cut_word, method="top")
df_duplicates["bottomsalary"] = df_duplicates.salary.apply(cut_word, method="bottom")
# astype() returns a new Series instead of converting in place, so the result
# has to be stored back for the columns to actually become integers.
df_duplicates['bottomsalary'] = df_duplicates['bottomsalary'].astype('int')
df_duplicates['topsalary'] = df_duplicates['topsalary'].astype('int')
# Midpoint of each posting's salary range, computed column-wise.
df_duplicates["avgsalary"] = (df_duplicates['bottomsalary'] + df_duplicates['topsalary']) / 2
df_duplicates

# Keep only the columns used by the analyses below.
df_clean = df_duplicates[['city','companyShortName','companySize','education','positionName','positionLables','workYear','avgsalary','industryField']]

# Overall salary distribution.
import matplotlib.pyplot as plt
# In a Jupyter notebook, ``%matplotlib inline`` renders figures inside cells.
plt.style.use("ggplot")    # ggplot2-like colour scheme, purely cosmetic
from matplotlib.font_manager import FontProperties
# Font able to render the Chinese labels; the path points at the SimSun font
# of a Windows installation and must exist on the machine running the script.
zh_font = FontProperties(fname="C:\\WINDOWS\\Fonts\\simsun.ttc")
fig = plt.figure(figsize=(8, 5))
ax = plt.subplot(111)
rect = ax.hist(df_duplicates["avgsalary"], bins=30)
# The Text property is spelled ``fontproperties`` (all lower case); the
# mixed-case spelling is not accepted by current Matplotlib releases.
ax.set_title(u'薪酬分布', fontproperties=zh_font)
ax.set_xlabel(u'K/月', fontproperties=zh_font)
plt.xticks(range(5, 100, 5))     # place x-axis ticks at 5, 10, ..., 95
plt.show()

# Salary distribution per city (one box per city).
ax = df_clean.boxplot(column='avgsalary', by='city', figsize=(9, 7))
# Apply the Chinese-capable font to every x tick label in one call.
plt.setp(ax.get_xticklabels(), fontproperties=zh_font)

# Salary distribution per education level.
ax = df_clean.boxplot(column='avgsalary', by='education', figsize=(9, 7))
plt.setp(ax.get_xticklabels(), fontproperties=zh_font)

# Number of postings per city and education level; unstack() pivots the
# education level from the row index into columns.
df_clean.groupby(['city', 'education']).avgsalary.count().unstack()

# Salary by work experience, restricted to Shanghai and Beijing.
in_sh_or_bj = df_clean['city'].isin(['上海','北京'])
df_bj_sh = df_clean[in_sh_or_bj]
ax = df_bj_sh.boxplot(column='avgsalary', by=['workYear', 'city'], figsize=(19, 6))
plt.setp(ax.get_xticklabels(), fontproperties=zh_font)
    
#北上广深对数据分析职位需求量
def topN(df, n=5):
    """Return the ``n`` most frequent values together with their counts.

    Args:
        df: A pandas object exposing ``value_counts()``; the script passes
            each city's ``positionName`` Series.
        n: Maximum number of entries to return (default 5).

    Returns:
        The first ``n`` entries of ``df.value_counts()``, i.e. the values
        with the highest occurrence counts, in descending order of count.
    """
    # value_counts() counts how often each distinct value occurs and already
    # sorts by descending count, so no second sort is needed; head() makes the
    # positional "first n" selection explicit.
    return df.value_counts().head(n)
# Restrict to Shanghai, Beijing, Guangzhou and Shenzhen, then list the most
# frequent position names within each city via topN.
in_first_tier_city = df_clean['city'].isin(['上海','北京','广州','深圳'])
df_bj_sh_gz_sz = df_clean[in_first_tier_city]
df_bj_sh_gz_sz.groupby('city')['positionName'].apply(topN)

# Word cloud of the industry fields the hiring companies belong to.
import re  # regular-expression support
import jieba as jb
from wordcloud import WordCloud
# Merge every industryField entry into one comma-separated string.  Missing
# values are dropped first because str.join() raises TypeError on non-strings.
word_str = ','.join(df_clean['industryField'].dropna())
# Segment the text into words with jieba (cut() defaults to accurate mode).
word_split = jb.cut(word_str)
# Re-join the segments using '|' as the separator.
word_split1 = "|".join(word_split)
# Keywords of interest; findall() returns every match as a list of strings.
pattern=re.compile("移动|互联网|其他|金融|企业|服务|电子商务|O2O|数据|服务|医疗健康|游戏|社交网络|招聘|生活服务|文化娱乐|旅游|广告营销|教育|硬件|信息安全")
word_w = pattern.findall(word_split1)
# Hand the keywords over as space-separated text.  str(list) would embed the
# brackets, quotes and commas of the list repr in the text that WordCloud
# tokenises.
word_s = ' '.join(word_w)
my_wordcloud = WordCloud(font_path="C:\\WINDOWS\\Fonts\\simsun.ttc",width=900,height=400,background_color="white").generate(word_s)
plt.imshow(my_wordcloud)
plt.axis("off")    # hide the axes around the image
plt.show()
# RP's Blog
#
# 学习总结思考感悟
#
# Python爬虫代码——拉勾数据分析师岗位数据分析