First commit
This commit is contained in:
1
packages/document-converter/.python-version
Normal file
1
packages/document-converter/.python-version
Normal file
@@ -0,0 +1 @@
|
||||
3.10
|
||||
0
packages/document-converter/README.md
Normal file
0
packages/document-converter/README.md
Normal file
20
packages/document-converter/pyproject.toml
Normal file
20
packages/document-converter/pyproject.toml
Normal file
@@ -0,0 +1,20 @@
|
||||
[project]
|
||||
name = "document-converter"
|
||||
version = "0.1.0"
|
||||
description = "Convert PDF documents to Markdown"
|
||||
readme = "README.md"
|
||||
authors = [
|
||||
{ name = "Anibal Angulo", email = "a8065384@banorte.com" }
|
||||
]
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"markitdown[pdf]>=0.1.2",
|
||||
"pypdf>=6.1.2",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
convert-md = "document_converter.markdown:app"
|
||||
|
||||
[build-system]
|
||||
requires = ["uv_build>=0.8.3,<0.9.0"]
|
||||
build-backend = "uv_build"
|
||||
@@ -0,0 +1,2 @@
|
||||
def hello() -> str:
    """Return the greeting string for the document-converter package."""
    greeting = "Hello from document-converter!"
    return greeting
|
||||
35
packages/document-converter/src/document_converter/base.py
Normal file
35
packages/document-converter/src/document_converter/base.py
Normal file
@@ -0,0 +1,35 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List
|
||||
|
||||
|
||||
class BaseConverter(ABC):
    """
    Abstract interface for converters that turn a file into a string result.

    Subclasses must implement ``process_file``. ``process_files`` is provided
    here and simply applies ``process_file`` to every entry of a list.
    """

    @abstractmethod
    def process_file(self, file: str) -> str:
        """
        Process one file and return the outcome as a string.

        Args:
            file: Path of the file to process.

        Returns:
            The processing result for that file.
        """

    def process_files(self, files: List[str]) -> List[str]:
        """
        Process several files one after another, in the order given.

        Args:
            files: Paths of the files to process.

        Returns:
            One result string per input path, in the same order as ``files``.
        """
        return list(map(self.process_file, files))
|
||||
131
packages/document-converter/src/document_converter/markdown.py
Normal file
131
packages/document-converter/src/document_converter/markdown.py
Normal file
@@ -0,0 +1,131 @@
|
||||
from pathlib import Path
|
||||
from typing import Annotated, BinaryIO, Union
|
||||
|
||||
import typer
|
||||
from markitdown import MarkItDown
|
||||
from rich.console import Console
|
||||
from rich.progress import Progress
|
||||
|
||||
from .base import BaseConverter
|
||||
|
||||
|
||||
class MarkdownConverter(BaseConverter):
    """Converts documents (e.g. PDFs) to Markdown text using MarkItDown."""

    def __init__(self) -> None:
        """Create the underlying MarkItDown instance with plugins disabled."""
        self.markitdown = MarkItDown(enable_plugins=False)

    def process_file(self, file_stream: Union[str, Path, BinaryIO]) -> str:
        """
        Convert a single document to Markdown.

        Args:
            file_stream: Either a filesystem path (``str`` or ``Path``) or an
                open binary stream containing the document.

        Returns:
            The Markdown text produced by the conversion.
        """
        return self.markitdown.convert(file_stream).text_content
|
||||
|
||||
|
||||
# --- CLI Application ---
|
||||
|
||||
# Typer application object; CLI commands are registered on it via ``@app.command()``.
app = typer.Typer()
|
||||
|
||||
|
||||
def _escape_markup(value: object) -> str:
    """Return ``str(value)`` with Rich markup escaped so it is printed literally."""
    # Imported locally so this block does not need a new module-level import.
    from rich.markup import escape

    return escape(str(value))


def _convert_directory(
    converter: MarkdownConverter,
    console: Console,
    input_dir: Path,
    output_dir: Path,
) -> None:
    """
    Convert every ``*.pdf`` file under ``input_dir`` into ``output_dir``.

    The relative layout of ``input_dir`` is mirrored below ``output_dir`` and
    each output file gets a ``.md`` suffix. A failure on one file is reported
    and the remaining files are still processed.

    Args:
        converter: Converter used to turn each PDF into Markdown text.
        console: Rich console used for all user-facing output.
        input_dir: Directory searched recursively for PDF files.
        output_dir: Directory that receives the Markdown files.

    Raises:
        typer.Exit: With code 1 if ``output_dir`` is an existing file, or if
            at least one PDF could not be converted.
    """
    console.print(
        f"[bold green]Processing directory:[/bold green] {_escape_markup(input_dir)}"
    )

    if output_dir.exists() and not output_dir.is_dir():
        console.print(
            "[bold red]Error:[/bold red] Input is a directory, but output path "
            f"'{_escape_markup(output_dir)}' is an existing file."
        )
        raise typer.Exit(code=1)

    # rglob also matches directories whose name ends in ".pdf"; keep files only.
    pdf_files = sorted(p for p in input_dir.rglob("*.pdf") if p.is_file())
    if not pdf_files:
        console.print("[yellow]No PDF files found in the input directory.[/yellow]")
        return

    console.print(f"Found {len(pdf_files)} PDF files to convert.")
    output_dir.mkdir(parents=True, exist_ok=True)

    failures = 0
    with Progress(console=console) as progress:
        task = progress.add_task("[cyan]Converting...", total=len(pdf_files))
        for pdf_file in pdf_files:
            relative_path = pdf_file.relative_to(input_dir)
            output_md_path = (output_dir / relative_path).with_suffix(".md")

            progress.update(
                task, description=f"Processing {_escape_markup(pdf_file.name)}"
            )
            try:
                # Creating the parent directory is part of the per-file work so
                # that a failure here does not abort the whole batch.
                output_md_path.parent.mkdir(parents=True, exist_ok=True)
                markdown_content = converter.process_file(pdf_file)
                output_md_path.write_text(markdown_content, encoding="utf-8")
            except Exception as e:  # batch boundary: report and continue
                failures += 1
                console.print(
                    f"\n[bold red]Failed to process {_escape_markup(pdf_file.name)}:"
                    f"[/bold red] {_escape_markup(e)}"
                )
            progress.advance(task)

    location = _escape_markup(output_dir)
    if failures:
        # Signal partial failure through the exit status so scripts can detect it.
        console.print(
            f"[bold yellow]Conversion finished with {failures} failed file(s)."
            f"[/bold yellow] Output directory: {location}"
        )
        raise typer.Exit(code=1)

    console.print(
        f"[bold green]Conversion complete.[/bold green] Output directory: {location}"
    )


def _convert_single_file(
    converter: MarkdownConverter,
    console: Console,
    input_file: Path,
    output_path: Path,
) -> None:
    """
    Convert one file to Markdown and write it to ``output_path``.

    If ``output_path`` is an existing directory, the Markdown file is created
    inside it using the input file's name with a ``.md`` suffix.

    Args:
        converter: Converter used to turn the file into Markdown text.
        console: Rich console used for all user-facing output.
        input_file: The file to convert.
        output_path: Target file, or an existing directory to write into.

    Raises:
        typer.Exit: With code 1 if the conversion or the write fails.
    """
    console.print(
        f"[bold green]Processing file:[/bold green] {_escape_markup(input_file.name)}"
    )

    if output_path.is_dir():
        final_output_path = output_path / input_file.with_suffix(".md").name
    else:
        final_output_path = output_path

    try:
        final_output_path.parent.mkdir(parents=True, exist_ok=True)
        markdown_content = converter.process_file(input_file)
        final_output_path.write_text(markdown_content, encoding="utf-8")
    except Exception as e:  # CLI boundary: show the error instead of a traceback
        console.print(
            f"[bold red]Error processing file:[/bold red] {_escape_markup(e)}"
        )
        raise typer.Exit(code=1) from e

    console.print(
        "[bold green]Successfully converted file to:[/bold green] "
        f"{_escape_markup(final_output_path)}"
    )


@app.command()
def main(
    input_path: Annotated[
        Path,
        typer.Argument(
            help="Path to the input PDF file or directory.",
            exists=True,
            file_okay=True,
            dir_okay=True,
            readable=True,
            resolve_path=True,
        ),
    ],
    output_path: Annotated[
        Path,
        typer.Argument(
            help="Path for the output Markdown file or directory.",
            file_okay=True,
            dir_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ],
) -> None:
    """
    Converts a PDF file or a directory of PDF files into Markdown.
    """
    # NOTE: the docstring above doubles as the command's help text, so
    # implementation notes are kept in comments instead.
    # Exits with status 1 when the conversion fails, when any file of a
    # directory run fails, or when the input is neither a file nor a directory.
    console = Console()
    converter = MarkdownConverter()

    if input_path.is_dir():
        _convert_directory(converter, console, input_path, output_path)
    elif input_path.is_file():
        _convert_single_file(converter, console, input_path, output_path)
    else:
        # The path exists (Typer checked that) but is e.g. a FIFO or a socket.
        console.print(
            f"[bold red]Error:[/bold red] Input path '{_escape_markup(input_path)}' "
            "is neither a file nor a directory."
        )
        raise typer.Exit(code=1)
|
||||
|
||||
|
||||
# Run the Typer CLI when this module is executed directly as a script.
if __name__ == "__main__":
    app()
|
||||
Reference in New Issue
Block a user