Douban Books Top 250
# Scrape the Douban Books Top 250 list and save the cleaned data to book.csv.
import re
import time
import random
import urllib.request

import pandas as pd
from lxml.html import fromstring
from bs4 import BeautifulSoup


def download(url):
    print("Downloading:", url)
    request = urllib.request.Request(url)
    request.add_header(
        "User-agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    )
    resp = urllib.request.urlopen(request)
    html = resp.read().decode("utf-8")
    time.sleep(random.randint(1, 5))  # throttle requests to avoid being blocked
    return html


name = []
rate = []
info = []

# The Top 250 list spans 10 pages of 25 books each
for k in range(10):
    html = download("https://book.douban.com/top250?start={}".format(k * 25))
    tree = fromstring(html)
    soup = BeautifulSoup(html, "html.parser")
    for i in range(25):
        name.append(
            tree.xpath(
                '//*[@id="content"]/div/div[1]/div/table[{}]/tr/td[2]/div[1]/a/text()'.format(i + 1)
            )[0].strip()
        )
        rate.append(soup.find_all("span", {"class": "rating_nums"})[i].get_text())
        info.append(soup.find_all("p", {"class": "pl"})[i].get_text())

name_pd = pd.DataFrame(name)
rate_pd = pd.DataFrame(rate)
info_pd = pd.DataFrame(info)
book_data = pd.concat([name_pd, rate_pd, info_pd], axis=1)
book_data.columns = ["书名", "评分", "信息"]
book_data.head()  # preview (useful in a notebook)

# Split the info string "作者 / 出版社 / 出版年 / 定价" into separate columns
Info = book_data["信息"].apply(lambda x: x.split("/"))
book_data["作家"] = Info.apply(lambda x: x[0])
book_data["出版社"] = Info.apply(lambda x: x[-3])
book_data["出版年"] = Info.apply(lambda x: x[-2])
book_data["定价"] = Info.apply(lambda x: x[-1])

# Patch a few rows whose info strings do not follow the usual format
book_data.iloc[9, 4] = "群众出版社"
book_data.iloc[9, 5] = "1981"
book_data.iloc[184, 5] = "1996"
book_data.iloc[184, 6] = "0"

# Keep only the four-digit year and the numeric price
f = lambda x: re.search("[0-9]{4}", x).group()
book_data["出版年"] = book_data["出版年"].apply(f)
g = lambda x: re.search(r"([0-9]+\.[0-9]+|[0-9]+)", x).group()
book_data["定价"] = book_data["定价"].apply(g)

book_data = book_data.drop(["信息"], axis=1)
book_data.to_csv("book.csv", sep=",", index=False, header=True, encoding="utf_8_sig")
Hottest Cities
# Map the hottest cities: ranking from weather_rank, coordinates from lat_lon,
# plotting via map_weather (all local modules defined in the sections below).
import pandas as pd

import weather_rank
import lat_lon
import map_weather

markers = []
ranks = weather_rank.get_hot_cities()
for rank in ranks:
    lng_lat = lat_lon.getlnglat(rank["城市"])
    city_temperature = rank["城市"] + ":" + rank["温度"]
    marker_size = 60 - int(rank["排名"]) * 5  # higher-ranked cities get larger markers
    if rank["排名"] == "1":
        marker_color = "rgb(255,0,0)"
    elif rank["排名"] == "2":
        marker_color = "rgb(0,255,0)"
    elif rank["排名"] == "3":
        marker_color = "rgb(0,0,255)"
    else:
        marker_color = "rgb(100,100,100)"
    markers.append([marker_size, city_temperature, lng_lat[0], lng_lat[1], marker_color])

places = pd.DataFrame(markers, columns=["marker", "name", "lon", "lat", "color"])
map_weather.save_map(places=places)
Getting Longitude and Latitude
# Geocode an address with the Baidu Maps geocoding API (module lat_lon).
import json
from urllib.request import urlopen, quote


def getlnglat(address):
    url = "http://api.map.baidu.com/geocoding/v3/"
    output = "json"
    ak = "***********************"  # your Baidu Maps API key
    add = quote(address)  # URL-encode the (Chinese) address
    uri = url + "?" + "address=" + add + "&output=" + output + "&ak=" + ak
    req = urlopen(uri)
    res = req.read().decode()
    temp = json.loads(res)
    lng = temp["result"]["location"]["lng"]
    lat = temp["result"]["location"]["lat"]
    return [lng, lat]
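A minimal usage sketch for getlnglat, assuming this module is saved as lat_lon.py (the name imported by the hottest-cities script above) and a valid Baidu Maps ak has been filled in; the city name is only an example:

# Example only: geocode one city and print its coordinates
if __name__ == "__main__":
    lng, lat = getlnglat("北京")
    print("北京:", lng, lat)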
Adding Markers to the Map
# Plot the city markers on a Mapbox map with Plotly and save it as an HTML file (module map_weather).
import plotly as py
import plotly.graph_objects as go


def save_map(places):
    pyplt = py.offline.plot
    fig = go.Figure(
        go.Scattermapbox(
            mode="markers",
            lon=places.lon,
            lat=places.lat,
            hovertext=places.name,
            hoverinfo="text",
            marker=go.scattermapbox.Marker(
                size=places.marker,
                color=places.color,
                opacity=0.7,
            ),
        )
    )
    fig.update_layout(
        mapbox={
            "accesstoken": "*****************",  # your Mapbox access token
            "center": {"lat": 35.3, "lon": 100.6},  # roughly the centre of China
            "zoom": 4,
            "style": "outdoors",
        },
        title=dict(x=0.5, xref="paper"),
        margin={"l": 10, "r": 10, "t": 50, "b": 50},
    )
    pyplt(fig, filename="1.html")
Character Relationships in a Novel
# Build a character co-occurrence network from the novel 天龙八部 and draw it with networkx.
import codecs

import networkx as nx
import matplotlib.pyplot as plt
import jieba
import jieba.posseg as pseg

names = {}           # how many times each character name appears
relationships = {}   # co-occurrence counts between pairs of names
lineNames = []       # character names found in each paragraph


def build_relationship(user_dict, novel_name, code):
    # jieba.load_userdict(user_dict)
    with codecs.open(novel_name, encoding=code) as f:
        for line in f.readlines():
            poss = pseg.cut(line, use_paddle=True)  # segment the line and tag each word's part of speech
            lineNames.append([])  # start a name list for the newly read paragraph
            for w in poss:
                # A word shorter than 2 characters, or not tagged "nr", is not treated as a person name
                if w.flag != "nr" or len(w.word) < 2:
                    continue
                # Record this character for the current paragraph
                lineNames[-1].append(w.word)
                if names.get(w.word) is None:
                    names[w.word] = 0
                    relationships[w.word] = {}
                names[w.word] += 1
    # For every paragraph ...
    for line in lineNames:
        # ... and every pair of characters within it
        for name1 in line:
            for name2 in line:
                if name1 == name2:
                    continue
                # Create the entry if the two have not co-occurred before
                if relationships[name1].get(name2) is None:
                    relationships[name1][name2] = 1
                # Otherwise increment their co-occurrence count by 1
                else:
                    relationships[name1][name2] = relationships[name1][name2] + 1


def write_file():
    # Keep only pairs that co-occur at least 10 times, one space-separated edge per line
    with codecs.open("tlbb_relationship.txt", "a+", "utf-8") as f:
        for name, edges in relationships.items():
            for v, w in edges.items():
                if w > 9:
                    f.write(name + " " + v + " " + str(w) + "\n")


build_relationship("tlbb_names.txt", "天龙八部.txt", "gb18030")
write_file()

a = []
f = open("tlbb_relationship.txt")
line = f.readline()
while line:
    if not line.isspace():  # fields in the saved file are space-separated
        a.append(line.split())
    line = f.readline()
f.close()

G = nx.Graph()
G.add_weighted_edges_from(a)
plt.rcParams["font.sans-serif"] = ["Arial Unicode MS"]  # render Chinese labels correctly
nx.draw(G, with_labels=True, font_size=9, node_size=800, node_color="r")
plt.show()
Bilibili Video Rankings
# Scrape the Bilibili popular ranking page and save the top videos to a CSV file.
import csv

import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://www.bilibili.com/v/popular/rank/all"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")

all_products = []
products = soup.select("li.rank-item")
for product in products:
    rank = product.select("div.num")[0].text
    name = product.select("div.info > a")[0].text.strip()
    play = product.select("span.data-box")[0].text
    comment = product.select("span.data-box")[1].text
    up = product.select("span.data-box")[2].text
    link = product.select("div.info > a")[0].attrs["href"]
    all_products.append({
        "视频排名": rank,
        "视频名": name,
        "播放量": play,
        "弹幕量": comment,
        "up主": up,
        "视频链接": link,
    })

keys = all_products[0].keys()
# Write the ranking with csv.DictWriter ...
with open("B站视频热榜TOP100.csv", "w", newline="", encoding="utf-8-sig") as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(all_products)
# ... then rewrite the same file via pandas (this second write overwrites the first)
pd.DataFrame(all_products, columns=keys).to_csv("B站视频热榜TOP100.csv", encoding="utf-8-sig")
Douban Movies
# Scrape the short comments of a Douban movie and draw a word cloud from them.
import re
import math
import time
import random
import collections

import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import jieba


def getfilminfo(url, headers, name):
    # Fetch one page of short comments and append them to <name>.txt
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = "utf-8"
    soup = BeautifulSoup(r.text, "html.parser")
    comments = soup.find_all(attrs={"class": "short"})
    with open(name + ".txt", "a", encoding="utf-8") as f:
        f.write(str(comments))


def all_comments(url, headers):
    # Read the total number of comments from the active tab, e.g. "看过(12345)"
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = "utf-8"
    soup = BeautifulSoup(r.text, "html.parser")
    number = soup.find(attrs={"class": "is-active"})
    p1 = re.compile(r"[(](.*?)[)]", re.S)
    number = re.findall(p1, str(number))
    return number


def load_data(num, name):
    url = "https://movie.douban.com/subject/" + num + "/comments?start=0&status=F&sort=new_score"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0",
        "Host": "movie.douban.com",
        "Origin": "movie.douban.com",
        "Cookie": "*********",
    }
    number = all_comments(url, headers)
    pages = math.ceil(int(number[0]) / 20)  # 20 comments per page
    for i in range(pages):
        url = "https://movie.douban.com/subject/" + num + "/comments?start={}&status=F&sort=new_score".format(i * 20)
        time.sleep(random.randint(1, 5))  # throttle requests
        getfilminfo(url, headers, name)


def draw_word_cloud(path):
    with open(path, encoding="utf-8") as fp:
        text = fp.read()
    seg_list_words = jieba.cut(text)
    obj_list = []
    # Filter out stop words before counting word frequencies
    remove_words = [
        line.strip()
        for line in open("/Users/universe/Documents/python/Excel基础操作/词云/stopWords.txt", encoding="UTF-8").readlines()
    ]
    for word in seg_list_words:
        if word not in remove_words:
            obj_list.append(word)
    word_counts = collections.Counter(obj_list)
    wc = WordCloud(
        font_path="/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
        max_words=50,
        max_font_size=100,
        mode="RGBA",
    ).generate_from_frequencies(word_counts)
    image_pro = wc.to_image()
    image_pro.show()


draw_word_cloud("狙击手.txt")
draw_word_cloud("长津湖之水门桥.txt")
Novel Character Word Cloud
# Draw a masked word cloud of the novel 天龙八部.
from wordcloud import WordCloud
import jieba
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image as image

file_name = "天龙八部.txt"
with open(file_name, encoding="gb18030") as f:
    txt = f.read()

cut = ""
# Filter out stop words while joining the segmented words into one long string
remove_words = [
    line.strip()
    for line in open("/Users/universe/Documents/python/Excel基础操作/词云/stopWords.txt", encoding="UTF-8").readlines()
]
for item in jieba.cut(txt):
    if item == "萧峰":
        item = "乔峰"  # merge two spellings of the same character
    if item not in remove_words:
        cut += item + " "

mask = np.array(image.open("kongfu.jpeg"))  # use this image as the word-cloud shape
wordcloud = WordCloud(
    mask=mask,
    font_path="/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
    background_color="white",
).generate(cut)

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
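If the word cloud should also be kept on disk rather than only shown on screen, the WordCloud object can be written straight to a file; the filename below is arbitrary:

# Optionally save the rendered word cloud as an image file
wordcloud.to_file("tlbb_wordcloud.png")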
Douban Top 250 Movies
# Scrape the Douban Top 250 movie list into TOP250.xlsx and visualise it with pyecharts.
import os
import re
import time
import random

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook, load_workbook
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Bar


def getonepagelist(url, headers):
    # Collect the detail-page links on one list page and scrape each film
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, "html.parser")
        lsts = soup.find_all(attrs={"class": "hd"})
        for lst in lsts:
            href = lst.a["href"]
            time.sleep(random.randint(1, 5))  # throttle requests
            getfilminfo(href, headers)
    except Exception:
        print("getonepagelist error!")


def getfilminfo(url, headers):
    # Scrape one film's detail page and append the record to the Excel file
    filminfo = []
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = "utf-8"
    soup = BeautifulSoup(r.text, "html.parser")
    # Title
    name = soup.find(attrs={"property": "v:itemreviewed"}).text.split(" ")[0]
    # Release year
    year = soup.find(attrs={"class": "year"}).text.replace("(", "").replace(")", "")
    # Rating
    score = soup.find(attrs={"property": "v:average"}).text
    # Number of ratings
    votes = soup.find(attrs={"property": "v:votes"}).text
    infos = soup.find(attrs={"id": "info"}).text.split("\n")[1:11]
    # Director
    director = infos[0].split(": ")[1]
    # Screenwriter
    scriptwriter = infos[1].split(": ")[1]
    # Starring
    actor = infos[2].split(": ")[1]
    # Genre
    filmtype = infos[3].split(": ")[1]
    # Country/region: some pages insert an extra line (e.g. an official site) before it
    area = infos[4].split(": ")[1]
    if "." in area:
        area = infos[5].split(": ")[1].split(" / ")[0]
        # Language
        language = infos[6].split(": ")[1].split(" / ")[0]
    else:
        area = infos[4].split(": ")[1].split(" / ")[0]
        # Language
        language = infos[5].split(": ")[1].split(" / ")[0]
    if "大陆" in area or "香港" in area or "台湾" in area:
        area = "中国"
    if "戛纳" in area:
        area = "法国"
    # Runtime in minutes
    times0 = soup.find(attrs={"property": "v:runtime"}).text
    times = re.findall(r"\d+", times0)[0]
    filminfo.append(name)
    filminfo.append(year)
    filminfo.append(score)
    filminfo.append(votes)
    filminfo.append(director)
    filminfo.append(scriptwriter)
    filminfo.append(actor)
    filminfo.append(filmtype)
    filminfo.append(area)
    filminfo.append(language)
    filminfo.append(times)
    filepath = "./TOP250.xlsx"
    insert2excel(filepath, filminfo)


def insert2excel(filepath, allinfo):
    # Append one row to TOP250.xlsx, creating the file with a header row if needed
    try:
        if not os.path.exists(filepath):
            tableTitle = ["片名", "上映年份", "评分", "评价人数", "导演", "编剧", "主演", "类型", "国家/地区", "语言", "时长(分钟)"]
            wb = Workbook()
            ws = wb.active
            ws.title = "sheet1"
            ws.append(tableTitle)
            wb.save(filepath)
        wb = load_workbook(filepath)
        ws = wb.active
        ws.title = "sheet1"
        ws.append(allinfo)
        wb.save(filepath)
        return True
    except Exception:
        return False


def load_data():
    # The Top 250 list spans 10 pages of 25 films each
    for i in range(10):
        print(f"正在爬取第{i}页,请稍等...")
        url = "https://movie.douban.com/top250?start={}&filter=".format(i * 25)
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
            "Host": "movie.douban.com",
            "Origin": "movie.douban.com",
            "Cookie": "",
        }
        getonepagelist(url, headers)


data = pd.read_excel("TOP250.xlsx")


def getzoombar(data):
    # Bar chart: number of listed films per release year
    year_counts = data["上映年份"].value_counts()
    year_counts = year_counts.sort_index()
    c = (
        Bar()
        .add_xaxis(list(year_counts.index))
        .add_yaxis("上映数量", year_counts.values.tolist())
        .set_global_opts(
            title_opts=opts.TitleOpts(title="各年份上映电影数量"),
            yaxis_opts=opts.AxisOpts(name="上映数量"),
            xaxis_opts=opts.AxisOpts(name="上映年份"),
            datazoom_opts=[opts.DataZoomOpts(), opts.DataZoomOpts(type_="inside")],
        )
        .render("各年份上映电影数量.html")
    )


def getscorebar(data):
    # Horizontal bar chart: the 20 most-rated films
    df = data.sort_values(by="评价人数", ascending=True)
    c = (
        Bar()
        .add_xaxis(df["片名"].values.tolist()[-20:])
        .add_yaxis("评价人数", df["评价人数"].values.tolist()[-20:])
        .reversal_axis()
        .set_global_opts(
            title_opts=opts.TitleOpts(title="电影评价人数"),
            yaxis_opts=opts.AxisOpts(name="片名"),
            xaxis_opts=opts.AxisOpts(name="人数"),
            datazoom_opts=opts.DataZoomOpts(type_="inside"),
        )
        .set_series_opts(label_opts=opts.LabelOpts(position="right"))
        .render("电影评价人数前二十.html")
    )


def getcountrybar(data):
    # Horizontal bar chart: the 10 regions with the most listed films
    country_counts = data["国家/地区"].value_counts()
    country_counts = country_counts.sort_values(ascending=True)
    c = (
        Bar()
        .add_xaxis(list(country_counts.index)[-10:])
        .add_yaxis("地区上映数量", country_counts.values.tolist()[-10:])
        .reversal_axis()
        .set_global_opts(
            title_opts=opts.TitleOpts(title="地区上映电影数量"),
            yaxis_opts=opts.AxisOpts(name="国家/地区"),
            xaxis_opts=opts.AxisOpts(name="上映数量"),
        )
        .set_series_opts(label_opts=opts.LabelOpts(position="right"))
        .render("各地区上映电影数量前十.html")
    )


getcountrybar(data)
getscorebar(data)
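Note that getzoombar is defined but never called above; if the per-year chart is also wanted, it can be invoked in the same way as the other two charts:

# Also render the per-year chart (writes 各年份上映电影数量.html)
getzoombar(data)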
Scraping City Temperatures
# Scrape the hottest-cities ranking from the China Meteorological Administration site (module weather_rank).
import requests
from bs4 import BeautifulSoup


def get_weather_content():
    # Download the weather.cma.cn home page and return the parsed document
    url = "https://weather.cma.cn/"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html5lib")
    return soup


def get_hot_cities():
    # Return the national "hottest cities" ranking as a list of dicts
    soup = get_weather_content()
    hot_cities = []
    ranks = soup.select("div.hb")[0].select("div.tab-content")[0]
    hot_rank = ranks.find_all(id="HOT")[0]
    hot_links = hot_rank.select("a")
    for link in hot_links:
        rank = link.select("div.rank")[0].get_text(separator=" ", strip=True)
        city = link.select("div.sname")[0].get_text(separator=" ", strip=True)
        province = link.select("div.pname")[0].get_text(separator=" ", strip=True)
        temperature = link.select("div.value")[0].get_text(separator=" ", strip=True)
        if rank != "排名":  # skip the table header row
            hot_cities.append({
                "排名": rank,
                "城市": city,
                "省市自治区": province,
                "温度": temperature,
            })
    return hot_cities
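A minimal usage sketch for get_hot_cities, assuming this module is saved as weather_rank.py (the name imported by the hottest-cities script above):

# Example only: print the current hottest-city ranking
if __name__ == "__main__":
    for city in get_hot_cities():
        print(city["排名"], city["省市自治区"], city["城市"], city["温度"])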