Web Crawler Assignment
Published: 2019-06-22


1. Pick a topic you are interested in.

2. Write a crawler in Python to scrape data on that topic from the web.

3. Run text analysis on the scraped data and generate a word cloud (a minimal sketch of this pipeline appears right after this list).

4. Explain the results of the text analysis.

5. Write a complete blog post describing the implementation, the problems encountered and how they were solved, and the ideas and conclusions of the data analysis.

6. Finally, submit all the scraped data together with the crawler and analysis source code.
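
Before the full scripts below, here is a minimal sketch of the pipeline from steps 2 and 3 (fetch a page, strip it down to Chinese text, segment with jieba, render a word cloud). The URL is the campus news site used later in this post; the font path is a placeholder for whichever Chinese font is installed on your machine:

import re
import requests
import jieba
from bs4 import BeautifulSoup
from wordcloud import WordCloud

# Fetch one list page and reduce it to plain text
res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
res.encoding = 'utf-8'
text = BeautifulSoup(res.text, 'html.parser').get_text()

# Keep Chinese characters only, segment, and drop single-character tokens
text = ''.join(re.findall('[\u4e00-\u9fa5]', text))
words = ' '.join(w for w in jieba.lcut(text) if len(w) > 1)

# A Chinese font is required or the words render as boxes; this path is a placeholder
wc = WordCloud(font_path='C:\\Windows\\Fonts\\simhei.ttf', background_color='white').generate(words)
wc.to_file('overview.png')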

 

What I crawled were the campus news articles and news from the Duowan LOL section.

Problems encountered while completing the assignment:

Installing the wordcloud package.

Mainly, I did not know how to export the word cloud as an image.
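
For reference, both problems have standard fixes: wordcloud normally installs with pip install wordcloud (on Windows, installing a prebuilt .whl file is a common fallback when the compiler step fails), and the finished cloud can be exported either with WordCloud.to_file or by saving the matplotlib figure. A minimal sketch, with the font path as a placeholder:

import matplotlib.pyplot as plt
from wordcloud import WordCloud

wc = WordCloud(background_color='white',
               font_path='C:\\Windows\\Fonts\\simhei.ttf')  # placeholder; any Chinese font works
wc.generate('词云 词云 词云 导出 导出 测试')  # toy input just to have something to draw

wc.to_file('cloud.png')        # export directly to an image file

plt.imshow(wc)                 # or render through matplotlib...
plt.axis('off')
plt.savefig('cloud_plt.png')   # ...and save the figure instead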

Crawling the campus news and generating a word cloud

 
import re
import requests
import jieba
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
res = requests.get(newsurl)   # returns a Response object
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

def getKeynews(content):
    # Keep only Chinese characters via a regex and join them into punctuation-free text
    content = ''.join(re.findall('[\u4e00-\u9fa5]', content))
    newSet = set(jieba.lcut(content))   # segment the text; the set removes duplicate words
    newDict = {}
    for i in newSet:
        newDict[i] = content.count(i)   # frequency of each word
    deleteList, keynews = [], []
    for i in newDict.keys():
        if len(i) < 2:
            deleteList.append(i)        # collect meaningless single-character tokens
    for i in deleteList:
        del newDict[i]
    dictList = list(newDict.items())
    dictList.sort(key=lambda item: item[1], reverse=True)  # sort by frequency, descending
    for d in dictList:
        keynews.append(d[0])
    return keynews

def writeFilekeynews(keywords):
    # Write to the file the word cloud reads below (the original wrote to a mismatched filename)
    f = open('keyword.txt', 'a', encoding='utf-8')
    for word in keywords:
        f.write(' ' + word)
    f.close()

def writeNewsDetail(content):
    f = open('gzccNews.txt', 'a', encoding='utf-8')
    f.write('\n' + content)
    f.close()

def getNewsDetail(newsUrl):
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    content = soupd.select('.show-content')[0].text.strip()
    writeNewsDetail(content)
    keynews = getKeynews(content)
    writeFilekeynews(keynews)

# def getWordCloud():
#     keynewsTowordcloud = open('keyword.txt', 'r', encoding='utf-8').read()
#     print(keynewsTowordcloud)
#     backgroud_Image = plt.imread('bg.jpg')
#     wc = WordCloud(background_color='white',   # background colour
#                    mask=backgroud_Image,       # background (mask) image
#                    stopwords=STOPWORDS,
#                    max_words=80,               # maximum number of words shown
#                    font_path='C:\\Windows\\Fonts\\AdobeKaitiStd-Regular.otf',  # without a Chinese font, Chinese text will not render
#                    max_font_size=80,           # maximum font size
#                    random_state=30)            # number of random layouts, i.e. colour schemes
#     wc.generate(keynewsTowordcloud)
#     image_colors = ImageColorGenerator(backgroud_Image)
#     wc.recolor(color_func=image_colors)
#     plt.imshow(wc)
#     plt.axis('off')
#     plt.show()

def getListPage(listPageUrl):
    res = requests.get(listPageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            a = news.select('a')[0].attrs['href']
            getNewsDetail(a)

firstPage = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
getListPage(firstPage)
for i in range(2, 3):
    listpageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    getListPage(listpageUrl)   # the original built this URL but never fetched it

f = open('keyword.txt', 'r', encoding='utf-8').read()
wordcloud = WordCloud(background_color='white', width=1000, height=860, margin=2,
                      font_path='C:\\Windows\\Fonts\\AdobeKaitiStd-Regular.otf').generate(f)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.to_file('111.png')
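
A side note on getKeynews above: content.count(i) counts substring occurrences in the joined text, so a short word is also counted every time it appears inside a longer word, and the whole text is rescanned once per distinct word. A sketch of the same frequency step using collections.Counter over the segmented tokens instead (my own variation, not part of the original assignment code):

import re
from collections import Counter
import jieba

def get_keywords_counter(content):
    # Segment first, then count tokens, so each word is counted once per occurrence
    content = ''.join(re.findall('[\u4e00-\u9fa5]', content))
    words = [w for w in jieba.lcut(content) if len(w) >= 2]   # drop single characters
    counts = Counter(words)                                   # one pass over the tokens
    return [word for word, _ in counts.most_common()]         # sorted by frequency, descending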
 

Crawling the Duowan LOL news section

# -*- coding: UTF-8 -*-
import re
import requests
import jieba
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from wordcloud import WordCloud

# Write the crawled news to a file
def write_news_to_document(filename, content):
    f = open(filename, 'a', encoding='utf-8')   # append, so earlier pages are kept (the original used 'w' and overwrote them)
    for detail in content:
        f.write(detail['content'])
    f.close()

# Write the extracted keywords to a file
def write_keywords_to_document(filename, keywords):
    f = open(filename, 'w', encoding='utf-8')
    for word in keywords:
        f.write('  ' + word)
    f.close()

# Extract keywords via jieba word segmentation
def get_keywords(filename):
    f = open(filename, 'r', encoding='utf-8')
    content = f.read()
    f.close()
    # Keep Chinese characters, letters and digits, join them into punctuation-free text,
    # segment it, then turn the result into a set of distinct words
    word_set = set(jieba.lcut(''.join(re.findall('[\u4e00-\u9fa5_a-zA-Z0-9]', content))))
    word_dict = {}
    delete_list = []
    keywords = []
    for a in word_set:
        word_dict[a] = content.count(a)      # build the frequency dictionary
    for j in word_dict.keys():
        if len(j) < 2:
            delete_list.append(j)            # collect meaningless single-character tokens
    for k in delete_list:
        del word_dict[k]                     # and delete them from the dictionary
    dict_list = list(word_dict.items())
    dict_list.sort(key=lambda item: item[1], reverse=True)
    for d in dict_list:
        keywords.append(d[0])
    print(keywords)
    write_keywords_to_document('NewsKeyword.txt', keywords)

# Get the full text of one news article
def get_news_detail(news_url):
    res_d = requests.get(news_url)
    res_d.encoding = 'UTF-8'
    soup_d = BeautifulSoup(res_d.text, 'html.parser')
    content = ''
    for i in range(3, 15):
        content += soup_d.select('p')[i].text + '\n'
    detail = {'content': content}
    return detail

# Get all articles on one list page
def get_news_list(list_url):
    res = requests.get(list_url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    page_detail = []
    for newsList in soup.select('.m-list')[0].select('li'):
        a = newsList.select('a')[0].attrs['href']
        b = 'http://lol.duowan.com/' + a
        news_detail = get_news_detail(b)
        page_detail.append(news_detail)
    return page_detail

# Main program
url = 'http://lol.duowan.com/tag/172578469745.html'
Page_detail = get_news_list(url)
print(Page_detail)
write_news_to_document('News.txt', Page_detail)
for i in range(2, 9):
    news_url = 'http://lol.duowan.com/tag/172578469745_{}.html'.format(i)
    Page_detail = get_news_list(news_url)   # the original passed url here, refetching page 1 every time
    write_news_to_document('News.txt', Page_detail)
get_keywords('News.txt')

f = open('NewsKeyword.txt', 'r', encoding='utf-8').read()
wordcloud = WordCloud(background_color='white', width=1000, height=860, margin=2,
                      font_path='C:\\Windows\\Fonts\\AdobeKaitiStd-Regular.otf').generate(f)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.to_file('777.png')
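
The paging loop above requests seven more list pages in a row with no error handling, so one timeout or HTTP error aborts the whole run. A sketch of a slightly more defensive loop; the timeout value and the skip-on-failure behaviour are my own choices, not part of the assignment:

import requests

def fetch_list_safely(i):
    news_url = 'http://lol.duowan.com/tag/172578469745_{}.html'.format(i)
    try:
        res = requests.get(news_url, timeout=10)  # fail fast instead of hanging
        res.raise_for_status()                    # turn HTTP error codes into exceptions
    except requests.RequestException as e:
        print('skipping page {}: {}'.format(i, e))
        return None
    res.encoding = 'utf-8'
    return res.text

for i in range(2, 9):
    html = fetch_list_safely(i)
    if html is None:
        continue
    # ...parse html with BeautifulSoup exactly as get_news_list does above...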

This was crawled just after RNG won the LOL MSI (Mid-Season Invitational) final, so the RNG players UZI, Letme, Karsa, Ming, Xiaohu and Mlxg all show up with high heat; UZI, Letme and Karsa in particular had standout performances, which is why their word frequencies are so high.

 

Reposted from: https://www.cnblogs.com/swxvico/p/8973164.html
