词云

import re # 正则表达式库
import collections # 词频统计库
import numpy as np # numpy数据处理库
import jieba # 结巴分词
import wordcloud # 词云展示库
from PIL import Image # 图像处理库
import matplotlib.pyplot as plt # 图像展示库

# 读取文件
fn = open("./《呐喊》全集.txt") # 打开文件
string_data = fn.read() # 读出整个文件
fn.close() # 关闭文件

# 文本预处理
pattern = re.compile(u"	|
|.|-|:|;|)|(|?|") # 定义正则表达式匹配模式
string_data = re.sub(pattern, "", string_data) # 将符合模式的字符去除

# 文本分词
seg_list_exact = jieba.cut(string_data, cut_all = False) # 精确模式分词
object_list = []
remove_words = [line.strip() for line in open("./stopWords.txt", encoding="UTF-8").readlines()]

for word in seg_list_exact: # 循环读出每个分词
    if word not in remove_words: # 如果不在去除词库中
        object_list.append(word) # 分词追加到列表

# 词频统计
word_counts = collections.Counter(object_list) # 对分词做词频统计
word_counts_top10 = word_counts.most_common(500) # 获取前10最高频的词

# 词频展示
mask = np.array(Image.open("./bg.png")) # 定义词频背景
wc = wordcloud.WordCloud(
    font_path="/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
    mask=mask, # 设置背景图
    max_words=50, # 最多显示词数
    max_font_size=100, # 字体最大值
    mode="RGBA"
)

wc.generate_from_frequencies(word_counts) # 从字典生成词云
image_colors = wordcloud.ImageColorGenerator(mask) # 从背景图建立颜色方案
wc.recolor(color_func=image_colors) # 将词云颜色设置为背景图方案
wc.to_file("./cloudResult.png")
plt.imshow(wc) # 显示词云
plt.axis("off") # 关闭坐标轴
plt.show() # 显示图像

PPT自动填充

from openpyxl import load_workbook
from pptx import Presentation

workbook = load_workbook("list.xlsx")
sheet = workbook.active

for row in sheet.rows:
    class_id = row[0].value
    name = row[1].value
    title = row[2].value

pptx = Presentation("temp.pptx")
slide = pptx.slides.add_slide(pptx.slide_layouts[0])
slide = pptx.slides.add_slide(pptx.slide_layouts[0])
for shape in slide.placeholders:
    shape.text = str(shape.placeholder_format.idx)

for row in sheet.rows:
    class_id = row[0].value
    name = row[1].value
    title = row[2].value
    slide = pptx.slides.add_slide(pptx.slide_layouts[0])
    slide.placeholders[10].text = f"{class_id} 班 {name} 小朋友:"
    slide.placeholders[11].text = "在2021-2022学年度第一学期获得"
    slide.placeholders[12].text = f"“{title}”称号。"
    slide.placeholders[13].text = "特发此证,以资鼓励。"
    slide.placeholders[14].text = "太阳市幼儿园"
    slide.placeholders[15].text = "2022年1月"

pptx.save("results.pptx")

Excel增删行

from openpyxl import load_workbook

def delColumns(fileName):
    wb = load_workbook(fileName)
    ws = wb["Sheet1"]
    ws.insert_rows(2,2)
    ws.delete_rows(10)
    wb.save(fileName)

delColumns("./test1.xlsx")

Excel插入图片

import xlwings as xw

def addImage(fileName):
    app = xw.App(visible=True, add_book=False)
    app.display_alerts = False
    app.screen_updating = False
    wb = app.books.open(fileName)
    sheet = wb.sheets.active
    sheet.pictures.add(r"/Users/universe/Desktop/1.png", left=sheet.range("J2").left, top=sheet.range("J2").top, width=100, height=100)
    wb.save()
    wb.close() 
    app.quit()

addImage("./test1.xlsx")

Excel应用公式

from openpyxl import load_workbook

def useFormular(fileName):
    wb =load_workbook(fileName)
    ws = wb["Sheet1"]
    ws["H45"] = "=SUM(H2:H44)"
    wb.save(fileName)

useFormular("./test1.xlsx")

Excel分类计算

import datetime
from openpyxl import load_workbook
import re

class GetInformation(object):
    def __init__(self, id):
        self.id = id
        self.birth_year = int(self.id[6:10])
        self.birth_month = int(self.id[10:12])
        self.birth_day = int(self.id[12:14])

    def get_birthday(self):
        birthday = "{0}-{1}-{2}".format(self.birth_year, self.birth_month, self.birth_day)
        return birthday

    def get_sex(self):
        # 男生:1 女生:0
        num = int(self.id[16:17])
        if num % 2 == 0:
            return 0
        else:
            return 1

    def get_age(self):
        now = (datetime.datetime.now() + datetime.timedelta(days=1))
        year = now.year
        month = now.month
        day = now.day

        if year == self.birth_year:
            return 0
        else:
            if self.birth_month > month or (self.birth_month == month and self.birth_day > day):
                return year - self.birth_year - 1
            else:
                return year - self.birth_year


workbook = load_workbook("identification.xlsx")
sheet = workbook.active

i=2
while i<=25:
    cell_ai = sheet.cell(row=i,column=2)
    ai = str(cell_ai.value)
    id = re.sub(u"([^u4e00-u9fa5u0030-u0039u0041-u005au0061-u007a])", "", ai)
    birthday = GetInformation(id).get_birthday()
    age = GetInformation(id).get_age()
    ci = sheet.cell(row=i,column=3)
    ci.value = age
    i+=1
workbook.save("identification.xlsx")

Excel根据范围自动增添分类

import pandas as pd

def exchangeGradeToLevel(point):
    if point >= 90:
        return "A"
    elif point >= 80:
        return "B"
    elif point >= 70:
        return "C"
    elif point >= 60:
        return "D"
    else:
        return "E"

def calToNewColumn(fileName):
    df = pd.read_excel(fileName)
    colNames = df.columns.tolist()
    colNames.insert(3, "等级")
    df.reindex(columns=colNames)
    df["等级"] = df["成绩"].apply(exchangeGradeToLevel)
    df.to_excel(fileName, index=False)


calToNewColumn("./grades.xlsx")

Excel根据已有结果计算新行

import pandas as pd

def calToNewColumn(fileName):
    df = pd.read_excel(fileName)
    colNames = df.columns.tolist()
    colNames.insert(12, "Total")
    df.reindex(columns=colNames)
    df["Total"] = df["Quantity"] * df["Profit"]
    df.to_excel(fileName)

calToNewColumn("./Sample-sales-data-excel.xlsx")

Excel更换字体

import openpyxl
from openpyxl.styles import Font, colors

def changeFontColor(fileName):
    wb = openpyxl.load_workbook(fileName)
    ws = wb["Sheet1"]
    ws["A2"].font = Font(
        color="FF0000",
        size=16,
        bold=True,
        italic=True)
    wb.save(fileName)

changeFontColor("./test1.xlsx")

Excel折叠行

from openpyxl import load_workbook

def collapseColumnRows(fileName):
    wb = load_workbook(fileName)
    ws = wb["Sheet1"]
    ws.column_dimensions.group("A","D", hidden=True)
    ws.row_dimensions.group(1,10, hidden=True)
    wb.save(fileName)

collapseColumnRows("./test1.xlsx")

Excel条件查询

import pandas as pd

def conditionFile(fileName, sheetName, newFile):
    data = pd.read_excel(fileName, sheet_name=sheetName)
    newExcel = pd.ExcelWriter(newFile)
    newResult = data[(data["Item"]=="Pencil") & (data["Total"]>100.0)]
    newResult.to_excel(newExcel, sheet_name="newSale", index=False) 
    newExcel.save()

conditionFile("./test1.xlsx", "newSale", "./test2.xlsx")

PDF转为Excel

import tabula

tabula.convert_into("./data-1.pdf", "convertPDF.csv", output_format="csv", pages="all")

视频下载

import requests
from requests.adapters import HTTPAdapter
from urllib3.poolmanager import PoolManager
import ssl
import urllib.request
 
class MyAdapter(HTTPAdapter):
    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=ssl.PROTOCOL_TLSv1)
 
 
if __name__ == "__main__":
    s = requests.Session()
    s.mount("https://", MyAdapter())

urllib.request.urlretrieve("https://apd-c5ae3aef7f5b4f0fc16245f9606a098a.v.smtcdns.com/vhot2.qqvideo.tc.qq.com/AdNRfBv1S2Cqy8NoleDXGbUDn2w9xIkJhbmI4N6ftZxc/uwMROfz2r57EIaQXGdGnC2ddDmY0gCKxs_YQn4ev8GMVk_hh/svp_50001/shg_2376712853_50001_2167864de25342ddb7f4f9f4da1dc5f3.f622.mp4?sdtfrom=v1010&guid=791a0f0881f18d073fc989d5498ca0fb&vkey=35D4EB6143B5339E4A3A1B5F070119608349CE85C18D18B7FF9E86A0911EBFC6E8F7A16E08DA926FC31A31B8440D1B50195B685B9BEE4A43524EE060BBDBA3F2146636DB2D50942C58F22A88550E8697B29AD2600B517C686C2BE27909F948038709A4940A0D942D7A7AAD48F8010ECCB5EEDE6CF615BFD2A285E68B9B8DD1159AD24125AB1579AB", "test.mov")

Excel计算总价并分类

import pandas as pd

def groupBySum(fileName, newFileName):
    df = pd.read_excel(fileName)
    result = df.groupby(["Item","Unit Cost"]).sum().reset_index()
    newExcel = pd.ExcelWriter(newFileName)
    result.to_excel(newExcel, index=False)
    newExcel.save()

groupBySum("./test1.xlsx", "./result.xlsx")

Excel合并表格

import openpyxl

def mergeCells(fileName):
    wb = openpyxl.load_workbook(fileName)
    ws = wb["Sheet1"]
    ws.merge_cells("G2:G8")
    wb.save(fileName)

mergeCells("./test1.xlsx")

Excel合并多个文件

import os
import pandas as pd

def get_files(path):
    fs = []
    for root, dirs, files in os.walk(path):
        for file in files:
            fs.append(os.path.join(root, file))
    return fs

def merge(oldFilePath, newFileName, newSheetName):
    files = get_files(oldFilePath)
    arr = []
    for i in files:
      arr.append(pd.read_excel(i))
    writer = pd.ExcelWriter(newFileName)
    pd.concat(arr).to_excel(writer, newSheetName, index=False)
    writer.save()

merge("./excels", "./merge.xlsx", "newSheet")

Excel依据列明合并

import pandas as pd

def merge(oldTable, newTable, mergeColumn, newFileName):
    df1 = pd.read_excel(oldTable)
    df2 = pd.read_excel(newTable)
    data = df1.merge(df2, on=mergeColumn, left_index=False, right_index=False, sort=False)
    data.to_excel(newFileName, index=False)

merge("./1.xlsx", "./11.xlsx", "姓名", "result.xlsx")

Excel移动指定范围数据

from openpyxl import load_workbook

def moveRange(fileName):
    wb =load_workbook(fileName)
    ws = wb["Sheet1"]
    ws.move_range("E2:H5", rows=3, cols=7)
    wb.save(fileName)

moveRange("./test1.xlsx")

PDF转位Word

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
from docx import Document

def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = "utf-8"
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, "rb")
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text

def valid_xml_char_ordinal(c):
    codepoint = ord(c)
    # conditions ordered by presumed frequency
    return (
        0x20 <= codepoint <= 0xD7FF or
        codepoint in (0x9, 0xA, 0xD) or
        0xE000 <= codepoint <= 0xFFFD or
        0x10000 <= codepoint <= 0x10FFFF
        )

def writeToWord():
    document = Document()
    result = convert_pdf_to_txt("./test.pdf")
    # for line in str(result).splitlines:
    cleaned_string = "".join(c for c in result if valid_xml_char_ordinal(c))
    document.add_paragraph(cleaned_string)
    document.save("test.docx")

writeToWord()

Excel重命名sheet

import openpyxl

def renameSheets(oldFile):
    wb = openpyxl.load_workbook(oldFile)
    sheet_names=wb.sheetnames
    for sheet_name in sheet_names:
        ws=wb[sheet_name]
        ws.title=sheet_name.replace("s", "")
    wb.save(oldFile)

renameSheets("./SampleData.xlsx")

Excel写入

import pandas as pd

def saveToExcel(oldFile, sheetName, newFile):
    data = pd.read_excel(oldFile, sheet_name=sheetName)
    newExcel = pd.ExcelWriter(newFile)
    data.to_excel(newExcel, sheet_name="newSale", index=False)
    newExcel.save()


saveToExcel("./SampleData.xlsx", "SalesOrders", "./test1.xlsx")

Excel搜索满足条件结果

import pandas as pd

def searchSpecificColumns(fileName, newName):
    df = pd.read_excel(fileName)
    newFile = pd.ExcelWriter(newName)
    df.loc[(df["Item"]== "Pencil") & (df["Total"] > 100.0), ["OrderDate","Region","Rep"]].to_excel(newFile)
    newFile.save()

searchSpecificColumns("./test1.xlsx", "./result.xlsx")

Excel设置背景色

import openpyxl 
from openpyxl.styles import PatternFill, colors

def setBackground(fileName):
    wb = openpyxl.load_workbook(fileName)
    ws = wb["Sheet1"]
    ws["A1"].fill = PatternFill("solid", fgColor="1874CD")
    ws["B2"].fill = PatternFill("lightVertical", fgColor="0000FF")
    ws["C3"].fill = PatternFill("darkHorizontal", fgColor="00FF00")
    wb.save(fileName)

setBackground("./test1.xlsx")

Excel设置列宽

import openpyxl

def setColumnWidth(fileName):
    wb = openpyxl.load_workbook(fileName)
    ws = wb["Sheet1"]
    ws.column_dimensions["B"].width = 50
    wb.save(fileName)

setColumnWidth("./test1.xlsx")

Excel设置行高

import openpyxl

def setRowHeight(fileName):
    wb = openpyxl.load_workbook(fileName)
    ws = wb["Sheet1"]
    ws.row_dimensions[1].height = 100
    wb.save(fileName)

setRowHeight("./test1.xlsx")

Excel排序

import pandas as pd

def sortByValues(fileName, columnName, newFileName):
    df = pd.read_excel(fileName)
    newFile = pd.ExcelWriter(newFileName)
    df.sort_values(by=columnName,ascending=False).to_excel(newFile)
    newFile.save()

sortByValues("./test1.xlsx", "Total", "./ordered.xlsx")

Excel拆分表格

import pandas as pd

def splitMultipleExels(oldFile):
    df = pd.read_excel(oldFile, "SalesOrders")
    uniValues = df["Item"].unique()
    for s in uniValues:
        itemData = df[df["Item"] == s]
        itemData.to_excel(r"%s.xlsx" % s, index=False)

splitMultipleExels("./SampleData.xlsx")

Excel拆分合并项

import openpyxl

def mergeCells(fileName):
    wb = openpyxl.load_workbook(fileName)
    ws = wb["Sheet1"]
    ws.unmerge_cells("G2:G8")
    wb.save(fileName)

mergeCells("./test1.xlsx")

Word写入

from docx import Document
from docx.shared import Pt
from docx.shared import Inches
from docx.oxml.ns import qn
from docx.shared import RGBColor

doc1 = Document()
doc1.add_heading("如何使用 Python 创建 Word",0)
doc1.add_paragraph("我们平时使用 Word 用来做文章的处理,可能没想过它可以用 Python 生成,下面我们就介绍具体如何操作……")
doc1.add_heading("安装 python-docx 库",1)
doc1.add_paragraph("现在开始我们来介绍如何安装 python-docx 库,具体需要以下两步操作:")
doc1.add_heading("第一步:安装 Python",2)
paragraph = doc1.add_paragraph("这是第一步的安装描述!")
run = paragraph.add_run("(注意:这里设置了字号为20)")
run.font.size = Pt(20)
run = doc1.add_paragraph("这里设置中文字体:").add_run("当前字体为黑体")
run.font.name="黑体"
r = run._element
r.rPr.rFonts.set(qn("w:eastAsia"), "黑体")
run = doc1.add_paragraph("这段设置:").add_run("文字的是斜体 ")
run.italic = True
run = doc1.add_paragraph("这段再设置:").add_run("这里设置粗体").bold = True
run = doc1.add_paragraph("这段为下划线:").add_run("这里设置带下划线").underline = True
run = doc1.add_paragraph("这段字体为红色:").add_run("这里设置字体为红色")
run.font.color.rgb = RGBColor(0xFF, 0x00, 0x00)
doc1.add_paragraph("这里是我们引用的一段话:人生苦短,我用Python。", style="Intense Quote")
doc1.save("VPower.docx")