Douban Books Top 250

import re
import pandas as pd
import time
import urllib.request
import random
from lxml.html import fromstring
from bs4 import BeautifulSoup

def download(url):
    print("Downloading:", url)
    request = urllib.request.Request(url)
    request.add_header("User-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36")
    resp = urllib.request.urlopen(request)
    html = resp.read().decode("utf-8")
    time.sleep(random.randint(1,5))
    return html

name = []
rate = []
info = []

for page in range(10):
    html = download("https://book.douban.com/top250?start={}".format(page * 25))
    tree = fromstring(html)
    soup = BeautifulSoup(html, "html.parser")

    for k in range(25):
        # Each book sits in its own <table>; td[2] holds the title link
        name.append(tree.xpath('//*[@id="content"]/div/div[1]/div/table[{}]/tr/td[2]/div[1]/a/text()'.format(k + 1))[0].strip())
        rate.append(soup.find_all("span", {"class": "rating_nums"})[k].get_text())
        info.append(soup.find_all("p", {"class": "pl"})[k].get_text())
name_pd = pd.DataFrame(name)
rate_pd = pd.DataFrame(rate)
info_pd = pd.DataFrame(info)

book_data = pd.concat([name_pd, rate_pd, info_pd], axis=1)
book_data.columns=["书名", "评分", "信息"]
book_data.head()

Info = book_data["信息"].apply(lambda x: x.split("/"))

book_data["作家"] = Info.apply(lambda x: x[0])
book_data["出版社"] = Info.apply(lambda x: x[-3])
book_data["出版年"] = Info.apply(lambda x: x[-2])
book_data["定价"] = Info.apply(lambda x: x[-1])

# A few rows have an irregular 信息 field, so patch them by hand
# (column positions: 0 书名, 1 评分, 2 信息, 3 作家, 4 出版社, 5 出版年, 6 定价)
book_data.iloc[9,4] = "群众出版社"
book_data.iloc[9,5] = "1981"
book_data.iloc[184,5] = "1996"
book_data.iloc[184,6] = "0"

f = lambda x: re.search(r"[0-9]{4}", x).group()
book_data["出版年"] = book_data["出版年"].apply(f)
g = lambda x: re.search(r"[0-9]+\.[0-9]+|[0-9]+", x).group()
book_data["定价"] = book_data["定价"].apply(g)

book_data = book_data.drop(["信息"], axis =1)

book_data.to_csv("book.csv",sep=",",index=False,header=True,encoding="utf_8_sig")

Hottest Cities

import weather_rank
import lat_lon
import map_weather
import pandas as pd

markers = []
ranks = weather_rank.get_hot_cities()
for rank in ranks:
    lng_lat = lat_lon.getlnglat(rank["城市"])
    city_temperature = rank["城市"] + ":" + rank["温度"]
    marker_size = 60 - int(rank["排名"]) * 5   # higher-ranked (hotter) cities get bigger markers
    if rank["排名"] == "1":
        marker_color = "rgb(255,0,0)"
    elif rank["排名"] == "2":
        marker_color = "rgb(0,255,0)"
    elif rank["排名"] == "3":
        marker_color = "rgb(0,0,255)"
    else:
        marker_color = "rgb(100,100,100)"
    markers.append([marker_size, city_temperature, lng_lat[0], lng_lat[1], marker_color])
    
places = pd.DataFrame(markers, columns=["marker","name","lon", "lat", "color"])
map_weather.save_map(places=places)

Getting Longitude and Latitude

import json
from urllib.request import urlopen, quote

def getlnglat(address):
    # Look up the longitude/latitude of a place name via the Baidu geocoding API
    url = "http://api.map.baidu.com/geocoding/v3/"
    output = "json"
    ak = "***********************"
    add = quote(address)   # URL-encode the Chinese place name
    uri = url + "?" + "address=" + add + "&output=" + output + "&ak=" + ak
    req = urlopen(uri)
    res = req.read().decode()
    temp = json.loads(res)
    lng = temp["result"]["location"]["lng"]
    lat = temp["result"]["location"]["lat"]
    return [lng, lat]
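A minimal way to try the helper on its own, assuming a valid ak has been filled in above (the coordinates in the comment are only indicative):

if __name__ == "__main__":
    # Expect something like [116.4..., 39.9...] for Beijing
    print(getlnglat("北京"))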

Adding Points to the Map

import pandas as pd
import plotly as py
import plotly.graph_objects as go

def save_map(places):
    pyplt = py.offline.plot
    fig = go.Figure(go.Scattermapbox(mode="markers", 
                                 lon = places.lon,
                                 lat = places.lat,
                                 hovertext = places.name,
                                 hoverinfo = "text",
                                 marker=go.scattermapbox.Marker(
                                    size=places.marker,
                                    color=places.color,
                                    opacity=0.7
                                ),
                            ))
    
    # Center the view roughly on China; the "outdoors" style requires a valid Mapbox access token
    fig.update_layout(mapbox = {"accesstoken": "*****************",
                        "center": {"lat": 35.3, "lon": 100.6}, "zoom": 4, "style": "outdoors"},
                  title = dict(x=0.5, xref="paper"),
                  margin = {"l": 10, "r": 10, "t": 50, "b": 50})

    pyplt(fig, filename="1.html")
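A minimal sketch of calling save_map directly with a hand-built DataFrame, assuming the Mapbox access token above is valid (the cities, temperatures and coordinates are illustrative):

if __name__ == "__main__":
    demo = pd.DataFrame(
        [[40, "北京:35℃", 116.40, 39.90, "rgb(255,0,0)"],
         [30, "上海:33℃", 121.47, 31.23, "rgb(100,100,100)"]],
        columns=["marker", "name", "lon", "lat", "color"])
    save_map(places=demo)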

Novel Character Co-occurrence

import networkx as nx
import matplotlib.pyplot as plt
import jieba
import codecs
import jieba.posseg as pseg

names = {}          # how many times each character's name appears
relationships = {}  # co-occurrence counts between pairs of characters
lineNames = []      # character names found in each paragraph

def build_relationship(user_dict, novel_name, code):
    # jieba.load_userdict(user_dict)
    with codecs.open(novel_name, encoding=code) as f:
        for line in f.readlines():
            # Segment the line and tag each word's part of speech
            # (paddle mode needs the paddlepaddle package; without it jieba falls back to its default mode)
            poss = pseg.cut(line, use_paddle=True)
            # Start a fresh list of character names for the newly read paragraph
            lineNames.append([])
            for w in poss:
                # Treat a word as a person's name only if its POS tag is "nr" and it is at least 2 characters long
                if w.flag != "nr" or len(w.word) < 2:
                    continue
                # Add the character to the current paragraph's list
                lineNames[-1].append(w.word)
                if names.get(w.word) is None:
                    names[w.word] = 0
                    relationships[w.word] = {}
                names[w.word] += 1

    # For each paragraph
    for line in lineNames:
        # For every pair of distinct characters appearing in the same paragraph
        for name1 in line:
            for name2 in line:
                if name1 == name2:
                    continue
                # First time the two appear together: create the entry
                if relationships[name1].get(name2) is None:
                    relationships[name1][name2] = 1
                # Otherwise add 1 to their co-occurrence count
                else:
                    relationships[name1][name2] = relationships[name1][name2] + 1

def write_file():
    with codecs.open("tlbb_relationship.txt", "a+", "utf-8") as f:
        for name, edges in relationships.items():
            for v, w in edges.items():
                if w > 9:
                    f.write(name + " " + v + " " + str(w) + "\n")

build_relationship("tlbb_names.txt", "天龙八部.txt", "gb18030")
write_file()
a = []
f = open("tlbb_relationship.txt")
line = f.readline()
while line:
    if not line.isspace():
        #保存文件是以空格分离的
        a.append(line.split())   
    line = f.readline()
f.close()

G = nx.Graph()
G.add_weighted_edges_from(a)
# Set a CJK-capable font before drawing so the Chinese labels render correctly
plt.rcParams["font.sans-serif"] = ["Arial Unicode MS"]
nx.draw(G, with_labels=True, font_size=9, node_size=800, node_color="r")
plt.show()
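Since relationships is still in memory after build_relationship runs, the intermediate text file can also be skipped; a sketch of building the weighted edge list directly, with the same threshold of more than 9 co-occurrences:

edges = [(n1, n2, w) for n1, nbrs in relationships.items() for n2, w in nbrs.items() if w > 9]
G2 = nx.Graph()
G2.add_weighted_edges_from(edges)

G2 can then be drawn in exactly the same way as G above.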

Bilibili Video Rankings

import requests 
from bs4 import BeautifulSoup 
import csv 
import pandas as pd 
 
url = "https://www.bilibili.com/v/popular/rank/all"
page = requests.get(url) 
soup = BeautifulSoup(page.content, "html.parser") 
 
all_products = [] 
 
products = soup.select("li.rank-item") 
for product in products: 
    rank = product.select("div.num")[0].text 
    name = product.select("div.info > a")[0].text.strip() 
    play = product.select("span.data-box")[0].text 
    comment = product.select("span.data-box")[1].text 
    up = product.select("span.data-box")[2].text 
    url = product.select("div.info > a")[0].attrs["href"] 
 
    all_products.append({ 
        "视频排名":rank, 
        "视频名": name, 
        "播放量": play, 
        "弹幕量": comment, 
        "up主": up, 
        "视频链接": url 
    }) 
 
 
keys = all_products[0].keys()

# Write the results with csv.DictWriter ...
with open("B站视频热榜TOP100.csv", "w", newline="", encoding="utf-8-sig") as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(all_products)

# ... or, equivalently, with pandas (this overwrites the file written above)
pd.DataFrame(all_products, columns=keys).to_csv("B站视频热榜TOP100.csv", index=False, encoding="utf-8-sig")

Douban Movies

import requests
from bs4 import BeautifulSoup
import pandas as pd
from wordcloud import WordCloud
import jieba
import re
import math
import random
import time
import collections

def getfilminfo(url, headers, name):
    # Fetch one page of short comments and append their text to <name>.txt
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = "utf-8"
    soup = BeautifulSoup(r.text, "html.parser")
    comments = soup.find_all(attrs={"class": "short"})
    with open(name + ".txt", "a", encoding="utf-8") as f:
        for comment in comments:
            f.write(comment.get_text() + "\n")

def all_comments(url, headers):
    # Read the total comment count shown in parentheses on the active tab
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = "utf-8"
    soup = BeautifulSoup(r.text, "html.parser")
    number = soup.find(attrs={"class": "is-active"})
    p1 = re.compile(r"[(](.*?)[)]", re.S)
    number = re.findall(p1, str(number))
    return number

def load_data(num, name):
    # num is the film's Douban subject id; crawl all of its short-comment pages (20 per page)
    url = "https://movie.douban.com/subject/" + num + "/comments?start=0&status=F&sort=new_score"
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0",
        "Host":"movie.douban.com",
        "Origin":"movie.douban.com",
        "Cookie": "*********"
    }
    number = all_comments(url, headers)
    pages = math.ceil(int(number[0])/20)
    for i in range(pages):
        url = "https://movie.douban.com/subject/" + num + "/comments?start={}&status=F&sort=new_score".format(i * 20)
        time.sleep(random.randint(1,5))
        getfilminfo(url, headers, name)

def draw_word_cloud(path):
    with open(path, encoding="utf-8") as fp:
        text = fp.read()
        seg_list_words = jieba.cut(text)
        obj_list = []
        remove_words = [line.strip() for line in open("/Users/universe/Documents/python/Excel基础操作/词云/stopWords.txt", encoding="UTF-8").readlines()]
        for word in seg_list_words:
            if word not in remove_words:
                obj_list.append(word)
        word_counts = collections.Counter(obj_list)
        wc = WordCloud(
            font_path="/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
            max_words=50,
            max_font_size=100,
            mode="RGBA"
        ).generate_from_frequencies(word_counts)
        image_pro = wc.to_image()
        image_pro.show()

draw_word_cloud("狙击手.txt")
draw_word_cloud("长津湖之水门桥.txt")

Novel Character Word Cloud

from wordcloud import WordCloud
import jieba
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image as image 

file_name = "天龙八部.txt"
with open(file_name, encoding="gb18030") as f:
    txt = f.read()
cut = ""
remove_words = [line.strip() for line in open("/Users/universe/Documents/python/Excel基础操作/词云/stopWords.txt", encoding="UTF-8").readlines()]
for item in jieba.cut(txt):
    # "萧峰" and "乔峰" are the same character; merge them under one name
    if item == "萧峰":
        item = "乔峰"
    if item not in remove_words:
        cut += (item + " ")

mask = np.array(image.open("kongfu.jpeg"))   # the word cloud takes the shape of this image
wordcloud = WordCloud(
                mask=mask,
                font_path= "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
                background_color="white",
            ).generate(cut)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

Douban Top 250 Movies

import os
import re
import time
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook, load_workbook
import pandas as pd
import random
from pyecharts import options as opts
from pyecharts.charts import Bar

def getonepagelist(url,headers):
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, "html.parser")
        lsts = soup.find_all(attrs={"class": "hd"})
        for lst in lsts:
            href = lst.a["href"]
            time.sleep(random.randint(1,5))
            getfilminfo(href, headers)
    except Exception as e:
        print("getonepagelist error!", e)

def getfilminfo(url,headers):
    filminfo = []
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = "utf-8"
    soup = BeautifulSoup(r.text, "html.parser")
    # Title (the first token of the reviewed item name)
    name = soup.find(attrs={"property": "v:itemreviewed"}).text.split(" ")[0]
    # Release year
    year = soup.find(attrs={"class": "year"}).text.replace("(", "").replace(")", "")
    # Rating
    score = soup.find(attrs={"property": "v:average"}).text
    # Number of ratings
    votes = soup.find(attrs={"property": "v:votes"}).text
    infos = soup.find(attrs={"id": "info"}).text.split("\n")[1:11]
    # Director
    director = infos[0].split(": ")[1]
    # Screenwriter
    scriptwriter = infos[1].split(": ")[1]
    # Cast
    actor = infos[2].split(": ")[1]
    # Genre
    filmtype = infos[3].split(": ")[1]
    # Country/region: some pages insert an extra info line containing a URL (hence the "." check),
    # which shifts the remaining fields down by one
    area = infos[4].split(": ")[1]
    if "." in area:
        area = infos[5].split(": ")[1].split(" / ")[0]
        # Language
        language = infos[6].split(": ")[1].split(" / ")[0]
    else:
        area = infos[4].split(": ")[1].split(" / ")[0]
        # Language
        language = infos[5].split(": ")[1].split(" / ")[0]

    if "大陆" in area or "香港" in area or "台湾" in area:
        area = "中国"
    if "戛纳" in area:
        area = "法国"
    # Runtime in minutes
    times0 = soup.find(attrs={"property": "v:runtime"}).text
    times = re.findall(r"\d+", times0)[0]
    filminfo.append(name)
    filminfo.append(year)
    filminfo.append(score)
    filminfo.append(votes)
    filminfo.append(director)
    filminfo.append(scriptwriter)
    filminfo.append(actor)
    filminfo.append(filmtype)
    filminfo.append(area)
    filminfo.append(language)
    filminfo.append(times)
    filepath = "./TOP250.xlsx"
    insert2excel(filepath,filminfo)

def insert2excel(filepath,allinfo):
    try:
        if not os.path.exists(filepath):
            tableTitle = ["片名","上映年份","评分","评价人数","导演","编剧","主演","类型","国家/地区","语言","时长(分钟)"]
            wb = Workbook()
            ws = wb.active
            ws.title = "sheet1"
            ws.append(tableTitle)
            wb.save(filepath)
        wb = load_workbook(filepath)
        ws = wb.active
        ws.title = "sheet1"
        ws.append(allinfo)
        wb.save(filepath)
        return True
    except:
        return False

def load_data():
    # The Top 250 list spans 10 pages of 25 films each
    for i in range(10):
        print(f"正在爬取第{i+1}页,请稍等...")
        url = "https://movie.douban.com/top250?start={}&filter=".format(i * 25)
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
            "Host":"movie.douban.com",
            "Origin":"movie.douban.com",
            "Cookie": ""
        }
        getonepagelist(url, headers)

# load_data()   # run once to crawl all Top 250 detail pages and generate TOP250.xlsx
data = pd.read_excel("TOP250.xlsx")

def getzoombar(data):
    # Bar chart with a data-zoom slider: number of Top 250 films released each year
    year_counts = data["上映年份"].value_counts()
    year_counts = year_counts.sort_index()
    c = (
        Bar()
        .add_xaxis(list(year_counts.index))
        .add_yaxis("上映数量", year_counts.values.tolist())
        .set_global_opts(
            title_opts=opts.TitleOpts(title="各年份上映电影数量"),
            yaxis_opts=opts.AxisOpts(name="上映数量"),
            xaxis_opts=opts.AxisOpts(name="上映年份"),
            datazoom_opts=[opts.DataZoomOpts(), opts.DataZoomOpts(type_="inside")],)
        .render("各年份上映电影数量.html")
        )

def getscorebar(data):
    # Horizontal bar chart of the 20 films with the most ratings
    df = data.sort_values(by="评价人数", ascending=True)
    c = (
        Bar()
        .add_xaxis(df["片名"].values.tolist()[-20:])
        .add_yaxis("评价人数", df["评价人数"].values.tolist()[-20:])
        .reversal_axis()
        .set_global_opts(
            title_opts=opts.TitleOpts(title="电影评价人数"),
            yaxis_opts=opts.AxisOpts(name="片名"),
            xaxis_opts=opts.AxisOpts(name="人数"),
            datazoom_opts=opts.DataZoomOpts(type_="inside"),
            )
        .set_series_opts(label_opts=opts.LabelOpts(position="right"))
        .render("电影评价人数前二十.html")
        )

def getcountrybar(data):
    # Horizontal bar chart of the 10 countries/regions with the most Top 250 films
    country_counts = data["国家/地区"].value_counts()
    country_counts = country_counts.sort_values(ascending=True)
    c = (
        Bar()
        .add_xaxis(list(country_counts.index)[-10:])
        .add_yaxis("地区上映数量", country_counts.values.tolist()[-10:])
        .reversal_axis()
        .set_global_opts(
        title_opts=opts.TitleOpts(title="地区上映电影数量"),
        yaxis_opts=opts.AxisOpts(name="国家/地区"),
        xaxis_opts=opts.AxisOpts(name="上映数量"),
        )
        .set_series_opts(label_opts=opts.LabelOpts(position="right"))
        .render("各地区上映电影数量前十.html")
        )


getcountrybar(data)
getscorebar(data)

City Temperature Scraping

import requests
from bs4 import BeautifulSoup

def get_weather_content():
    # Fetch and parse the China Meteorological Administration home page
    url = "https://weather.cma.cn/"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html5lib")
    return soup
    
def get_hot_cities():
    soup = get_weather_content()
    hot_cities = []
    ranks = soup.select("div.hb")[0].select("div.tab-content")[0]
    hot_rank = ranks.find_all(id="HOT")[0]
    hot_links = hot_rank.select("a")
    for link in hot_links:
        rank = link.select("div.rank")[0].get_text(separator=" ", strip=True)
        city = link.select("div.sname")[0].get_text(separator=" ", strip=True)
        province = link.select("div.pname")[0].get_text(separator=" ", strip=True)
        temperature = link.select("div.value")[0].get_text(separator=" ", strip=True)
        # Skip the header row of the ranking table
        if rank != "排名":
            hot_cities.append({
                "排名": rank,
                "城市": city,
                "省市自治区": province,
                "温度": temperature,
            })
    return hot_cities
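A quick way to sanity-check the scraper on its own; the sample output in the comment is illustrative, not real data:

if __name__ == "__main__":
    # Expect entries like {"排名": "1", "城市": "...", "省市自治区": "...", "温度": "38.5℃"}
    for city in get_hot_cities()[:3]:
        print(city)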