Edit history of script submission #6063 for 'Extract text from PDF and return it as txt file (windmill)'

  • python3
    import io
    import base64
    from PyPDF2 import PdfReader
    
    def main(pdf: bytes) -> dict:
        """Extract the text of a PDF and return it as a base64-encoded text file.

        Args:
            pdf: Raw bytes of the PDF document.

        Returns:
            A dict of the form
            ``{"file": {"content": <str>, "filename": "content.txt"}}`` where
            ``content`` is the base64 encoding (as a ``str``) of the UTF-8
            encoded text. Every page that yields non-empty text contributes
            that text followed by a newline; pages with no extractable text
            are skipped entirely.
        """
        # PdfReader is handed a file-like object, so wrap the raw bytes in an
        # in-memory stream.
        reader = PdfReader(io.BytesIO(pdf))

        # Join once instead of growing a string with += per page; the trailing
        # "\n" after each page keeps page boundaries visible in the output.
        page_texts = (page.extract_text() for page in reader.pages)
        full_text = "".join(f"{text}\n" for text in page_texts if text)

        # The result must be a JSON-friendly string, so base64-encode the
        # UTF-8 bytes and decode the (pure ASCII) base64 output back to str.
        encoded_text = base64.b64encode(full_text.encode("utf-8")).decode("utf-8")

        # Return the file content and filename in the desired format
        return {
            "file": {
                "content": encoded_text,
                "filename": "content.txt",
            }
        }

    Submitted by henri186 771 days ago