Ctrl K

Extract text from a PDF and return it as a .txt file

Published Apr 17, 2024

This script takes a PDF file as bytes and extracts the text from each of its pages. It then encodes the extracted text (as UTF-8) into a base64 string to ensure safe transmission or storage. Finally, it returns a dictionary whose `file` entry holds the encoded text as `content` together with the filename `content.txt`, i.e. a text file with the content of the original PDF.

Script windmill

Submitted by henri186 Python3

Created 801 days ago

All edits

Permalink

import io
import base64
from PyPDF2 import PdfReader

def main(pdf: bytes) -> dict:
    """Extract the text of a PDF and return it as a base64-encoded text file.

    Args:
        pdf: Raw bytes of the PDF document.

    Returns:
        A dictionary of the form
        ``{"file": {"content": <base64 str>, "filename": "content.txt"}}``.
        ``content`` is the UTF-8 encoded text of the document, base64-encoded.
        Every page that yields text contributes that text followed by a
        newline; pages for which ``extract_text()`` returns nothing are
        skipped.
    """
    # Wrap the bytes in an in-memory stream so PdfReader can read from it.
    reader = PdfReader(io.BytesIO(pdf))

    # Pull the text page by page and join once at the end rather than growing
    # a string with repeated ``+=`` (which can degrade to quadratic time).
    page_texts = (page.extract_text() for page in reader.pages)
    full_text = "".join(text + "\n" for text in page_texts if text)

    # b64encode returns bytes; decode them so the value is a plain str.
    encoded_text = base64.b64encode(full_text.encode("utf-8")).decode("utf-8")

    # Return the file content and filename in the expected nested format.
    return {
        "file": {
            "content": encoded_text,
            "filename": "content.txt",
        },
    }

`1`	`import io`
`2`	`import base64`
`3`	`from PyPDF2 import PdfReader`
`4`
`5`	`def main(pdf: bytes) -> dict:`
`6`	`# Create a PdfReader instance`
`7`	`reader = PdfReader(io.BytesIO(pdf))`
`8`
`9`	`# Initialize an empty string to collect all the text`
`10`	`full_text = ""`
`11`
`12`	`# Iterate through all the pages and extract text`
`13`	`for page in reader.pages:`
`14`	`page_text = page.extract_text()`
`15`	`if page_text:`
`16`	`full_text += page_text + "\n" # Add a newline character to separate pages`
`17`
`18`	`# Encode the full text to a byte stream`
`19`	`encoded_text = base64.b64encode(full_text.encode('utf-8')).decode('utf-8')`
`20`
`21`	`# Return the file content and filename in the desired format`
`22`	`return {`
`23`	`"file": {`
`24`	`"content": encoded_text,`
`25`	`"filename": "content.txt"`
`26`	`}`
`27`	`}`