爬取百度热搜榜的热搜词并且生成词云

使用Python中的requests库或urllib库发送HTTP请求,获取网页源代码。本文以requests库为例发送请求。

使用BeautifulSoup库、lxml库、re库等解析HTML,获取需要的信息。

使用wordcloud库生成需要的词云图片。下面附上完整的代码:

import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Fetch the Baidu real-time hot-search board, collect the hot-search terms,
# and render them as a word cloud.
url = 'https://top.baidu.com/board?tab=realtime'
# Send a browser-like User-Agent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
# requests applies no timeout by default; set one so a stalled connection
# cannot hang the script forever.
response = requests.get(url, headers=headers, timeout=10)
# Fail fast on an HTTP error instead of parsing an error page.
response.raise_for_status()
# Decode the body using the encoding detected from its content.
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, 'html.parser')
# Elements carrying this CSS class are treated as the hot-search titles.
hot_items = soup.select('.c-single-text-ellipsis')

# Extract the hot-search terms, dropping surrounding whitespace and blanks.
hot_words = [item.text.strip() for item in hot_items]
hot_words = [word for word in hot_words if word]
if not hot_words:
    # WordCloud.generate() cannot plot zero words; report the real cause
    # (nothing matched the selector) rather than its generic error.
    raise ValueError(
        "No hot-search terms found at %s; the page layout may have changed." % url
    )

# Join the terms into one space-separated string for WordCloud.
text = ' '.join(hot_words)
print(text)
# Build the word cloud image.
# NOTE(review): font_path is presumably set so Chinese glyphs render; the
# simhei.ttf file must be locatable at run time -- confirm on the target machine.
wordcloud = WordCloud(
    background_color="white",
    font_path="simhei.ttf",
    max_words=100,
    min_font_size=12,
    width=1000,
    height=700,
).generate(text)
# Uncomment the next line to also save the image to a local file.
# wordcloud.to_file('baidu.png')
# Display the word cloud without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

 

 

THE END