Script 'Extract text from PDF and return it as txt file' for windmill

Extract text from PDF and return it as txt file

This script takes a PDF file as bytes and extracts the text from every page. It then encodes the extracted text (as UTF-8) into a base64 string to ensure safe transmission or storage. Finally, it returns a dictionary with a single `file` entry containing the encoded text as `content` and the fixed filename `content.txt`, i.e. the text content of the original PDF packaged as a txt file.

Created by henri186 481 days ago Viewed 0 times

Submitted by henri186 Python3

Created 481 days ago

All edits

Permalink

import io
import base64
from PyPDF2 import PdfReader

def main(pdf: bytes) -> dict:
    """Extract the text of a PDF and return it as a base64-encoded txt file.

    Args:
        pdf: Raw bytes of the PDF document.

    Returns:
        A dictionary of the form
        ``{"file": {"content": <base64 str>, "filename": "content.txt"}}``.
        ``content`` is the extracted text of all pages, encoded as UTF-8 and
        then base64. Every page that yields text is followed by a newline;
        pages that yield no text are skipped.
    """
    # Wrap the raw bytes in an in-memory stream for PdfReader.
    reader = PdfReader(io.BytesIO(pdf))

    # Extract lazily, page by page, and assemble the result with a single
    # join instead of repeated string concatenation in a loop.
    page_texts = (page.extract_text() for page in reader.pages)
    full_text = "".join(text + "\n" for text in page_texts if text)

    # base64 works on bytes: encode the text as UTF-8 first, then decode the
    # base64 output (plain ASCII) back to str for the result payload.
    encoded_text = base64.b64encode(full_text.encode("utf-8")).decode("utf-8")

    # Return the file content and filename in the desired format.
    return {
        "file": {
            "content": encoded_text,
            "filename": "content.txt",
        }
    }

`1`	`import io`
`2`	`import base64`
`3`	`from PyPDF2 import PdfReader`
`4`
`5`	`def main(pdf: bytes) -> dict:`
`6`	`# Create a PdfReader instance`
`7`	`reader = PdfReader(io.BytesIO(pdf))`
`8`
`9`	`# Initialize an empty string to collect all the text`
`10`	`full_text = ""`
`11`
`12`	`# Iterate through all the pages and extract text`
`13`	`for page in reader.pages:`
`14`	`page_text = page.extract_text()`
`15`	`if page_text:`
`16`	`full_text += page_text + "\n" # Add a newline character to separate pages`
`17`
`18`	`# Encode the full text to a byte stream`
`19`	`encoded_text = base64.b64encode(full_text.encode('utf-8')).decode('utf-8')`
`20`
`21`	`# Return the file content and filename in the desired format`
`22`	`return {`
`23`	`"file": {`
`24`	`"content": encoded_text,`
`25`	`"filename": "content.txt"`
`26`	`}`
`27`	`}`