pptxファイルからテキストと画像を抽出

pptx形式のトーク資料から画像とテキストを抽出したい。

PythonスクリプトをGeminiに書いてもらいました。

code:extract-pptx.py

# /// script

# requires-python = ">=3.12"

# dependencies = [

# "python-pptx",

# ]

# ///

from pptx import Presentation

import os

from PIL import Image

def extract_from_pptx(pptx_path, output_dir):
    """Extract text and images from a pptx file into a directory.

    Every shape whose text frame holds non-empty text is written to
    ``text_<slide>_<shape_id>.txt`` (UTF-8). Every shape exposing an
    embedded image is written to ``image_<slide>_<shape_id>.<ext>``;
    images that are not already JPEG are then re-encoded to ``.jpg`` and
    the originally written file is removed. If the re-encoding fails, the
    error is printed and the file in its original format is kept.

    Args:
        pptx_path (str): Path of the pptx file.
        output_dir (str): Path of the output directory (created if missing).
    """
    os.makedirs(output_dir, exist_ok=True)

    presentation = Presentation(pptx_path)

    for slide_number, slide in enumerate(presentation.slides, start=1):
        for shape in slide.shapes:
            # shape_id is only unique within a single slide, so the slide
            # number has to be part of the file name; otherwise shapes from
            # later slides overwrite files written for earlier slides.
            stem = f"{slide_number}_{shape.shape_id}"

            if shape.has_text_frame:
                text = shape.text_frame.text
                if text:
                    text_filename = os.path.join(output_dir, f"text_{stem}.txt")
                    with open(text_filename, "w", encoding="utf-8") as f:
                        f.write(text)

            # Only some shapes expose an ``image`` attribute. A picture
            # without an embedded image (e.g. linked only) is expected to
            # raise ValueError on access; skip it instead of aborting.
            try:
                image = getattr(shape, "image", None)
            except ValueError:
                image = None
            if not image:
                continue

            image_filename = os.path.join(
                output_dir, f"image_{stem}.{image.ext}"
            )
            with open(image_filename, "wb") as f:
                f.write(image.blob)

            if image.ext.lower() in ("jpg", "jpeg"):
                # Already JPEG: keep the bytes exactly as stored in the pptx.
                continue

            jpg_filename = os.path.splitext(image_filename)[0] + ".jpg"
            try:
                # Close the source image before deleting its file; an open
                # handle makes os.remove fail on Windows.
                with Image.open(image_filename) as img:
                    img.convert("RGB").save(jpg_filename, "JPEG")
                os.remove(image_filename)
            except Exception as e:
                # Best effort: formats Pillow cannot decode stay on disk in
                # their original format and the run continues.
                print(f"画像処理エラー: {e}")

# Usage example: runs only when the file is executed as a script
# (e.g. `uv run extract-pptx.py`), not when it is imported.
if __name__ == "__main__":
    pptx_file = "slide.pptx"
    output_directory = "output"

    extract_from_pptx(pptx_file, output_directory)

実行

code:shell

uv run extract-pptx.py