词云
import re # 正则表达式库 import collections # 词频统计库 import numpy as np # numpy数据处理库 import jieba # 结巴分词 import wordcloud # 词云展示库 from PIL import Image # 图像处理库 import matplotlib.pyplot as plt # 图像展示库 # 读取文件 fn = open("./《呐喊》全集.txt") # 打开文件 string_data = fn.read() # 读出整个文件 fn.close() # 关闭文件 # 文本预处理 pattern = re.compile(u" | |.|-|:|;|)|(|?|") # 定义正则表达式匹配模式 string_data = re.sub(pattern, "", string_data) # 将符合模式的字符去除 # 文本分词 seg_list_exact = jieba.cut(string_data, cut_all = False) # 精确模式分词 object_list = [] remove_words = [line.strip() for line in open("./stopWords.txt", encoding="UTF-8").readlines()] for word in seg_list_exact: # 循环读出每个分词 if word not in remove_words: # 如果不在去除词库中 object_list.append(word) # 分词追加到列表 # 词频统计 word_counts = collections.Counter(object_list) # 对分词做词频统计 word_counts_top10 = word_counts.most_common(500) # 获取前10最高频的词 # 词频展示 mask = np.array(Image.open("./bg.png")) # 定义词频背景 wc = wordcloud.WordCloud( font_path="/System/Library/Fonts/Supplemental/Arial Unicode.ttf", mask=mask, # 设置背景图 max_words=50, # 最多显示词数 max_font_size=100, # 字体最大值 mode="RGBA" ) wc.generate_from_frequencies(word_counts) # 从字典生成词云 image_colors = wordcloud.ImageColorGenerator(mask) # 从背景图建立颜色方案 wc.recolor(color_func=image_colors) # 将词云颜色设置为背景图方案 wc.to_file("./cloudResult.png") plt.imshow(wc) # 显示词云 plt.axis("off") # 关闭坐标轴 plt.show() # 显示图像
PPT自动填充
from openpyxl import load_workbook from pptx import Presentation workbook = load_workbook("list.xlsx") sheet = workbook.active for row in sheet.rows: class_id = row[0].value name = row[1].value title = row[2].value pptx = Presentation("temp.pptx") slide = pptx.slides.add_slide(pptx.slide_layouts[0]) slide = pptx.slides.add_slide(pptx.slide_layouts[0]) for shape in slide.placeholders: shape.text = str(shape.placeholder_format.idx) for row in sheet.rows: class_id = row[0].value name = row[1].value title = row[2].value slide = pptx.slides.add_slide(pptx.slide_layouts[0]) slide.placeholders[10].text = f"{class_id} 班 {name} 小朋友:" slide.placeholders[11].text = "在2021-2022学年度第一学期获得" slide.placeholders[12].text = f"“{title}”称号。" slide.placeholders[13].text = "特发此证,以资鼓励。" slide.placeholders[14].text = "太阳市幼儿园" slide.placeholders[15].text = "2022年1月" pptx.save("results.pptx")
Excel增删行
from openpyxl import load_workbook def delColumns(fileName): wb = load_workbook(fileName) ws = wb["Sheet1"] ws.insert_rows(2,2) ws.delete_rows(10) wb.save(fileName) delColumns("./test1.xlsx")
Excel插入图片
import xlwings as xw def addImage(fileName): app = xw.App(visible=True, add_book=False) app.display_alerts = False app.screen_updating = False wb = app.books.open(fileName) sheet = wb.sheets.active sheet.pictures.add(r"/Users/universe/Desktop/1.png", left=sheet.range("J2").left, top=sheet.range("J2").top, width=100, height=100) wb.save() wb.close() app.quit() addImage("./test1.xlsx")
Excel应用公式
from openpyxl import load_workbook def useFormular(fileName): wb =load_workbook(fileName) ws = wb["Sheet1"] ws["H45"] = "=SUM(H2:H44)" wb.save(fileName) useFormular("./test1.xlsx")
Excel分类计算
import datetime from openpyxl import load_workbook import re class GetInformation(object): def __init__(self, id): self.id = id self.birth_year = int(self.id[6:10]) self.birth_month = int(self.id[10:12]) self.birth_day = int(self.id[12:14]) def get_birthday(self): birthday = "{0}-{1}-{2}".format(self.birth_year, self.birth_month, self.birth_day) return birthday def get_sex(self): # 男生:1 女生:0 num = int(self.id[16:17]) if num % 2 == 0: return 0 else: return 1 def get_age(self): now = (datetime.datetime.now() + datetime.timedelta(days=1)) year = now.year month = now.month day = now.day if year == self.birth_year: return 0 else: if self.birth_month > month or (self.birth_month == month and self.birth_day > day): return year - self.birth_year - 1 else: return year - self.birth_year workbook = load_workbook("identification.xlsx") sheet = workbook.active i=2 while i<=25: cell_ai = sheet.cell(row=i,column=2) ai = str(cell_ai.value) id = re.sub(u"([^u4e00-u9fa5u0030-u0039u0041-u005au0061-u007a])", "", ai) birthday = GetInformation(id).get_birthday() age = GetInformation(id).get_age() ci = sheet.cell(row=i,column=3) ci.value = age i+=1 workbook.save("identification.xlsx")
Excel根据范围自动增添分类
import pandas as pd def exchangeGradeToLevel(point): if point >= 90: return "A" elif point >= 80: return "B" elif point >= 70: return "C" elif point >= 60: return "D" else: return "E" def calToNewColumn(fileName): df = pd.read_excel(fileName) colNames = df.columns.tolist() colNames.insert(3, "等级") df.reindex(columns=colNames) df["等级"] = df["成绩"].apply(exchangeGradeToLevel) df.to_excel(fileName, index=False) calToNewColumn("./grades.xlsx")
Excel根据已有结果计算新行
import pandas as pd def calToNewColumn(fileName): df = pd.read_excel(fileName) colNames = df.columns.tolist() colNames.insert(12, "Total") df.reindex(columns=colNames) df["Total"] = df["Quantity"] * df["Profit"] df.to_excel(fileName) calToNewColumn("./Sample-sales-data-excel.xlsx")
Excel更换字体
import openpyxl from openpyxl.styles import Font, colors def changeFontColor(fileName): wb = openpyxl.load_workbook(fileName) ws = wb["Sheet1"] ws["A2"].font = Font( color="FF0000", size=16, bold=True, italic=True) wb.save(fileName) changeFontColor("./test1.xlsx")
Excel折叠行
from openpyxl import load_workbook def collapseColumnRows(fileName): wb = load_workbook(fileName) ws = wb["Sheet1"] ws.column_dimensions.group("A","D", hidden=True) ws.row_dimensions.group(1,10, hidden=True) wb.save(fileName) collapseColumnRows("./test1.xlsx")
Excel条件查询
import pandas as pd def conditionFile(fileName, sheetName, newFile): data = pd.read_excel(fileName, sheet_name=sheetName) newExcel = pd.ExcelWriter(newFile) newResult = data[(data["Item"]=="Pencil") & (data["Total"]>100.0)] newResult.to_excel(newExcel, sheet_name="newSale", index=False) newExcel.save() conditionFile("./test1.xlsx", "newSale", "./test2.xlsx")
PDF转为Excel
import tabula tabula.convert_into("./data-1.pdf", "convertPDF.csv", output_format="csv", pages="all")
视频下载
import requests from requests.adapters import HTTPAdapter from urllib3.poolmanager import PoolManager import ssl import urllib.request class MyAdapter(HTTPAdapter): def init_poolmanager(self, connections, maxsize, block=False): self.poolmanager = PoolManager(num_pools=connections, maxsize=maxsize, block=block, ssl_version=ssl.PROTOCOL_TLSv1) if __name__ == "__main__": s = requests.Session() s.mount("https://", MyAdapter()) urllib.request.urlretrieve("https://apd-c5ae3aef7f5b4f0fc16245f9606a098a.v.smtcdns.com/vhot2.qqvideo.tc.qq.com/AdNRfBv1S2Cqy8NoleDXGbUDn2w9xIkJhbmI4N6ftZxc/uwMROfz2r57EIaQXGdGnC2ddDmY0gCKxs_YQn4ev8GMVk_hh/svp_50001/shg_2376712853_50001_2167864de25342ddb7f4f9f4da1dc5f3.f622.mp4?sdtfrom=v1010&guid=791a0f0881f18d073fc989d5498ca0fb&vkey=35D4EB6143B5339E4A3A1B5F070119608349CE85C18D18B7FF9E86A0911EBFC6E8F7A16E08DA926FC31A31B8440D1B50195B685B9BEE4A43524EE060BBDBA3F2146636DB2D50942C58F22A88550E8697B29AD2600B517C686C2BE27909F948038709A4940A0D942D7A7AAD48F8010ECCB5EEDE6CF615BFD2A285E68B9B8DD1159AD24125AB1579AB", "test.mov")
Excel计算总价并分类
import pandas as pd def groupBySum(fileName, newFileName): df = pd.read_excel(fileName) result = df.groupby(["Item","Unit Cost"]).sum().reset_index() newExcel = pd.ExcelWriter(newFileName) result.to_excel(newExcel, index=False) newExcel.save() groupBySum("./test1.xlsx", "./result.xlsx")
Excel合并表格
import openpyxl def mergeCells(fileName): wb = openpyxl.load_workbook(fileName) ws = wb["Sheet1"] ws.merge_cells("G2:G8") wb.save(fileName) mergeCells("./test1.xlsx")
Excel合并多个文件
import os import pandas as pd def get_files(path): fs = [] for root, dirs, files in os.walk(path): for file in files: fs.append(os.path.join(root, file)) return fs def merge(oldFilePath, newFileName, newSheetName): files = get_files(oldFilePath) arr = [] for i in files: arr.append(pd.read_excel(i)) writer = pd.ExcelWriter(newFileName) pd.concat(arr).to_excel(writer, newSheetName, index=False) writer.save() merge("./excels", "./merge.xlsx", "newSheet")
Excel依据列明合并
import pandas as pd def merge(oldTable, newTable, mergeColumn, newFileName): df1 = pd.read_excel(oldTable) df2 = pd.read_excel(newTable) data = df1.merge(df2, on=mergeColumn, left_index=False, right_index=False, sort=False) data.to_excel(newFileName, index=False) merge("./1.xlsx", "./11.xlsx", "姓名", "result.xlsx")
Excel移动指定范围数据
from openpyxl import load_workbook def moveRange(fileName): wb =load_workbook(fileName) ws = wb["Sheet1"] ws.move_range("E2:H5", rows=3, cols=7) wb.save(fileName) moveRange("./test1.xlsx")
PDF转位Word
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.converter import TextConverter from pdfminer.layout import LAParams from pdfminer.pdfpage import PDFPage from io import StringIO from docx import Document def convert_pdf_to_txt(path): rsrcmgr = PDFResourceManager() retstr = StringIO() codec = "utf-8" laparams = LAParams() device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) fp = open(path, "rb") interpreter = PDFPageInterpreter(rsrcmgr, device) password = "" maxpages = 0 caching = True pagenos=set() for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True): interpreter.process_page(page) text = retstr.getvalue() fp.close() device.close() retstr.close() return text def valid_xml_char_ordinal(c): codepoint = ord(c) # conditions ordered by presumed frequency return ( 0x20 <= codepoint <= 0xD7FF or codepoint in (0x9, 0xA, 0xD) or 0xE000 <= codepoint <= 0xFFFD or 0x10000 <= codepoint <= 0x10FFFF ) def writeToWord(): document = Document() result = convert_pdf_to_txt("./test.pdf") # for line in str(result).splitlines: cleaned_string = "".join(c for c in result if valid_xml_char_ordinal(c)) document.add_paragraph(cleaned_string) document.save("test.docx") writeToWord()
Excel重命名sheet
import openpyxl def renameSheets(oldFile): wb = openpyxl.load_workbook(oldFile) sheet_names=wb.sheetnames for sheet_name in sheet_names: ws=wb[sheet_name] ws.title=sheet_name.replace("s", "") wb.save(oldFile) renameSheets("./SampleData.xlsx")
Excel写入
import pandas as pd def saveToExcel(oldFile, sheetName, newFile): data = pd.read_excel(oldFile, sheet_name=sheetName) newExcel = pd.ExcelWriter(newFile) data.to_excel(newExcel, sheet_name="newSale", index=False) newExcel.save() saveToExcel("./SampleData.xlsx", "SalesOrders", "./test1.xlsx")
Excel搜索满足条件结果
import pandas as pd def searchSpecificColumns(fileName, newName): df = pd.read_excel(fileName) newFile = pd.ExcelWriter(newName) df.loc[(df["Item"]== "Pencil") & (df["Total"] > 100.0), ["OrderDate","Region","Rep"]].to_excel(newFile) newFile.save() searchSpecificColumns("./test1.xlsx", "./result.xlsx")
Excel设置背景色
import openpyxl from openpyxl.styles import PatternFill, colors def setBackground(fileName): wb = openpyxl.load_workbook(fileName) ws = wb["Sheet1"] ws["A1"].fill = PatternFill("solid", fgColor="1874CD") ws["B2"].fill = PatternFill("lightVertical", fgColor="0000FF") ws["C3"].fill = PatternFill("darkHorizontal", fgColor="00FF00") wb.save(fileName) setBackground("./test1.xlsx")
Excel设置列宽
import openpyxl def setColumnWidth(fileName): wb = openpyxl.load_workbook(fileName) ws = wb["Sheet1"] ws.column_dimensions["B"].width = 50 wb.save(fileName) setColumnWidth("./test1.xlsx")
Excel设置行高
import openpyxl def setRowHeight(fileName): wb = openpyxl.load_workbook(fileName) ws = wb["Sheet1"] ws.row_dimensions[1].height = 100 wb.save(fileName) setRowHeight("./test1.xlsx")
Excel排序
import pandas as pd def sortByValues(fileName, columnName, newFileName): df = pd.read_excel(fileName) newFile = pd.ExcelWriter(newFileName) df.sort_values(by=columnName,ascending=False).to_excel(newFile) newFile.save() sortByValues("./test1.xlsx", "Total", "./ordered.xlsx")
Excel拆分表格
import pandas as pd def splitMultipleExels(oldFile): df = pd.read_excel(oldFile, "SalesOrders") uniValues = df["Item"].unique() for s in uniValues: itemData = df[df["Item"] == s] itemData.to_excel(r"%s.xlsx" % s, index=False) splitMultipleExels("./SampleData.xlsx")
Excel拆分合并项
import openpyxl def mergeCells(fileName): wb = openpyxl.load_workbook(fileName) ws = wb["Sheet1"] ws.unmerge_cells("G2:G8") wb.save(fileName) mergeCells("./test1.xlsx")
Word写入
from docx import Document from docx.shared import Pt from docx.shared import Inches from docx.oxml.ns import qn from docx.shared import RGBColor doc1 = Document() doc1.add_heading("如何使用 Python 创建 Word",0) doc1.add_paragraph("我们平时使用 Word 用来做文章的处理,可能没想过它可以用 Python 生成,下面我们就介绍具体如何操作……") doc1.add_heading("安装 python-docx 库",1) doc1.add_paragraph("现在开始我们来介绍如何安装 python-docx 库,具体需要以下两步操作:") doc1.add_heading("第一步:安装 Python",2) paragraph = doc1.add_paragraph("这是第一步的安装描述!") run = paragraph.add_run("(注意:这里设置了字号为20)") run.font.size = Pt(20) run = doc1.add_paragraph("这里设置中文字体:").add_run("当前字体为黑体") run.font.name="黑体" r = run._element r.rPr.rFonts.set(qn("w:eastAsia"), "黑体") run = doc1.add_paragraph("这段设置:").add_run("文字的是斜体 ") run.italic = True run = doc1.add_paragraph("这段再设置:").add_run("这里设置粗体").bold = True run = doc1.add_paragraph("这段为下划线:").add_run("这里设置带下划线").underline = True run = doc1.add_paragraph("这段字体为红色:").add_run("这里设置字体为红色") run.font.color.rgb = RGBColor(0xFF, 0x00, 0x00) doc1.add_paragraph("这里是我们引用的一段话:人生苦短,我用Python。", style="Intense Quote") doc1.save("VPower.docx")