1. Pick a topic you are interested in.
2. Write a crawler in Python that scrapes data on that topic from the web.
3. Run text analysis on the scraped data and generate a word cloud.
4. Explain and interpret the results of the text analysis.
5. Write a complete blog post describing the implementation, the problems encountered and how they were solved, the data-analysis approach and the conclusions.
6. Finally, submit all of the scraped data together with the crawler and the data-analysis source code.
What I scraped was campus news from our school's news site plus articles from the Duowan LOL news section.
Problems encountered while doing the assignment:
Installing the wordcloud package.
Mainly, I did not know how to export the generated word cloud as an image.
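Both problems turned out to have fairly simple answers. The sketch below is not part of my crawler, just a minimal illustration of the two steps: wordcloud can be installed from PyPI (on Windows a pre-built .whl can be used if the source build fails), and the rendered cloud is saved with WordCloud.to_file. The font path and the output file name test.png are placeholders; adjust them to your machine.

# Install first (in a shell, not inside Python):
#   pip install wordcloud
# On Windows, if compilation fails, download a matching pre-built .whl and run
#   pip install <downloaded .whl>
from wordcloud import WordCloud

wc = WordCloud(background_color='white', width=1000, height=860,
               font_path=r'C:\Windows\Fonts\AdobeKaitiStd-Regular.otf')  # any Chinese-capable font works
wc.generate('some sample text for the cloud')   # build the cloud from a plain string
wc.to_file('test.png')                          # export the rendered cloud as an image file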
Scraping the campus news and generating a word cloud
import requests
import re
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from bs4 import BeautifulSoup

def getKeynews(content):
    # Keep only the Chinese characters and join them into one punctuation-free string
    content = ''.join(re.findall('[\u4e00-\u9fa5]', content))
    # Segment the text with jieba and de-duplicate the resulting words
    newSet = set(jieba.lcut(content))
    newDict = {}
    for i in newSet:
        newDict[i] = content.count(i)        # word-frequency dictionary
    deleteList, keynews = [], []
    for i in newDict.keys():
        if len(i) < 2:
            deleteList.append(i)             # single characters carry little meaning
    for i in deleteList:
        del newDict[i]
    dictList = list(newDict.items())
    dictList.sort(key=lambda item: item[1], reverse=True)   # sort by frequency, descending
    for pair in dictList:
        keynews.append(pair[0])
    return keynews

def writeFilekeynews(keywords):
    f = open('keyword.txt', 'a', encoding='utf-8')
    for word in keywords:
        f.write(" " + word)
    f.close()

def writeNewsDetail(content):
    f = open('gzccNews.txt', 'a', encoding='utf-8')
    f.write("\n" + content)
    f.close()

def getNewsDetail(newsUrl):
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    content = soupd.select('.show-content')[0].text.strip()
    writeNewsDetail(content)
    keynews = getKeynews(content)
    writeFilekeynews(keynews)

# def getWordCloud():
#     keynewsTowordcloud = open('keyword.txt', 'r', encoding='utf-8').read()
#     print(keynewsTowordcloud)
#     backgroud_Image = plt.imread('bg.jpg')
#     wc = WordCloud(background_color='white',     # background colour
#                    mask=backgroud_Image,          # background image used as the mask
#                    stopwords=STOPWORDS,
#                    max_words=80,                  # maximum number of words to display
#                    font_path=r'C:\Windows\Fonts\AdobeKaitiStd-Regular.otf',  # a Chinese font is required, otherwise Chinese cannot be shown
#                    max_font_size=80,              # maximum font size
#                    random_state=30)               # number of random colour schemes
#     wc.generate(keynewsTowordcloud)
#     image_colors = ImageColorGenerator(backgroud_Image)
#     wc.recolor(color_func=image_colors)
#     plt.imshow(wc)
#     plt.axis('off')
#     plt.show()

def getListPage(listPageUrl):
    res = requests.get(listPageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            a = news.select('a')[0].attrs['href']
            getNewsDetail(a)

firstPage = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
getListPage(firstPage)
for i in range(2, 3):
    listpageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    getListPage(listpageUrl)

f = open('keyword.txt', 'r', encoding='utf-8').read()
wordcloud = WordCloud(background_color="white", width=1000, height=860, margin=2,
                      font_path=r'C:\Windows\Fonts\AdobeKaitiStd-Regular.otf').generate(f)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
wordcloud.to_file('111.png')
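As a side note, getKeynews above ranks words by counting raw frequencies. jieba also ships a TF-IDF based extractor, jieba.analyse.extract_tags, which could be used instead; the sketch below is only an alternative idea, not what the script above does, and it reuses the gzccNews.txt / keyword.txt file names from that script.

import jieba.analyse

# Read the raw news text saved by writeNewsDetail and let jieba rank keywords by TF-IDF.
text = open('gzccNews.txt', 'r', encoding='utf-8').read()
keywords = jieba.analyse.extract_tags(text, topK=100)    # top 100 keywords
with open('keyword.txt', 'w', encoding='utf-8') as f:
    f.write(' '.join(keywords))                          # space-separated, like writeFilekeynews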
Scraping the Duowan LOL news section
# -*- coding: UTF-8 -*-
import requests
import re
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from bs4 import BeautifulSoup

# Write the scraped news content to a file
def write_news_to_document(filename, content):
    f = open(filename, 'w', encoding='utf-8')
    for detail in content:
        f.write(detail['content'])
    f.close()

# Write the extracted keywords to a file
def write_keywords_to_document(filename, keywords):
    f = open(filename, 'w', encoding='utf-8')
    for word in keywords:
        f.write(' ' + word)
    f.close()

# Extract keywords with jieba word segmentation
def get_keywords(filename):
    f = open(filename, 'r', encoding='utf-8')
    content = f.read()
    f.close()
    # Keep only Chinese characters, letters and digits, join them into one punctuation-free
    # string, segment it with jieba and turn the result into a set of unique words
    word_set = set(jieba.lcut(''.join(re.findall("[\u4e00-\u9fa5_a-zA-Z0-9]", content))))
    word_dict = {}
    delete_list = []
    keywords = []
    for a in word_set:
        word_dict[a] = content.count(a)      # word-frequency dictionary
    for j in word_dict.keys():
        if len(j) < 2:
            delete_list.append(j)            # collect meaningless single characters
    for k in delete_list:
        del word_dict[k]                     # and drop them from the dictionary
    dict_list = list(word_dict.items())
    dict_list.sort(key=lambda item: item[1], reverse=True)
    for pair in dict_list:
        keywords.append(pair[0])
    print(keywords)
    write_keywords_to_document("NewsKeyword.txt", keywords)

# Get the full text of a single news article
def get_news_detail(news_url):
    res_d = requests.get(news_url)
    res_d.encoding = 'UTF-8'
    soup_d = BeautifulSoup(res_d.text, 'html.parser')
    content = ''
    for i in range(3, 15):                   # the article body sits in these <p> tags
        content += soup_d.select('p')[i].text + '\n'
    detail = {'content': content}
    return detail

# Get all articles linked from one list page
def get_news_list(list_url):
    res = requests.get(list_url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    page_detail = []
    for newsList in soup.select('.m-list')[0].select('li'):
        a = newsList.select('a')[0].attrs['href']
        b = 'http://lol.duowan.com/' + a
        news_detail = get_news_detail(b)
        page_detail.append(news_detail)
    return page_detail

# Main
url = "http://lol.duowan.com/tag/172578469745.html"
all_detail = get_news_list(url)
print(all_detail)
for i in range(2, 9):
    news_url = "http://lol.duowan.com/tag/172578469745_{}.html".format(i)
    all_detail += get_news_list(news_url)    # follow the remaining list pages
write_news_to_document("News.txt", all_detail)
get_keywords("News.txt")

f = open('NewsKeyword.txt', 'r', encoding='utf-8').read()
wordcloud = WordCloud(background_color="white", width=1000, height=860, margin=2,
                      font_path=r'C:\Windows\Fonts\AdobeKaitiStd-Regular.otf').generate(f)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
wordcloud.to_file('777.png')
This was scraped right after RNG won the LOL MSI (Mid-Season Invitational) final, so the RNG players UZI, Letme, Karsa, Ming, Xiaohu and Mlxg all show up with very high heat in the cloud; and because UZI, Letme and Karsa put in particularly eye-catching performances, their word frequencies are especially high.
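As a quick sanity check on that reading, one can simply count how often each player id appears in the scraped text. This is just a minimal sketch, assuming News.txt is the file produced by the Duowan script above; the id list mirrors the names mentioned in the conclusion.

# Count how often each RNG player id occurs in the scraped Duowan news text.
content = open('News.txt', 'r', encoding='utf-8').read().lower()
for player in ['uzi', 'letme', 'karsa', 'ming', 'xiaohu', 'mlxg']:
    # note: a short id like 'ming' may over-count if it appears inside other English words
    print(player, content.count(player))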