Reference

Complete API Reference

Comprehensive API reference for all m1f modules and functions

Complete API Reference

This comprehensive reference documents all public APIs, classes, and functions available in the m1f tool suite (v3.4.0).

Table of Contents

m1f Module

The main module for combining multiple files into a single output file.

Module Information

import m1f

# Version information
m1f.__version__        # '3.4.0'
m1f.__version_info__   # (3, 4, 0)
m1f.__author__        # 'Franz und Franz (https://franz.agency)'
m1f.__project__       # 'https://m1f.dev'

Core Classes

FileCombiner

The main orchestrator for file combination operations.

from m1f.core import FileCombiner, ProcessingResult
from m1f.config import Config
from m1f.logging import LoggerManager

class FileCombiner:
    """Main class that orchestrates the file combination process.

    Construct it with a ``Config`` and a ``LoggerManager``, then await
    ``run()`` to obtain a ``ProcessingResult``.
    """

    def __init__(self, config: Config, logger_manager: LoggerManager) -> None:
        """
        Initialize the FileCombiner.

        Args:
            config: Configuration object with all settings
            logger_manager: Logger manager for handling logs
        """

    async def run(self) -> ProcessingResult:
        """
        Run the file combination process.

        This is a coroutine: it must be awaited (or driven with
        ``asyncio.run``) for the work to happen.

        Returns:
            ProcessingResult with statistics and output paths
        """

ProcessingResult

Data class containing the results of file processing.

@dataclass
class ProcessingResult:
    """Result of the file processing operation.

    Plain (non-frozen) dataclass; every field is required because none
    declares a default.
    """

    files_processed: int           # Number of files successfully processed
    total_files: int              # Total number of files found
    execution_time: str           # Formatted execution time (a string, not a number)
    output_file: Optional[Path]   # Path to the generated output file, or None
    archive_file: Optional[Path]  # Path to the archive file (if created), else None
    token_count: Optional[int]    # Estimated token count (if calculated), else None
    flagged_files: List[str]      # Files flagged by security scan

Configuration Classes

Config

The main configuration class that combines all settings.

from m1f.config import Config

@dataclass(frozen=True)
class Config:
    """Main configuration class that combines all settings.

    The dataclass is frozen, so attributes cannot be reassigned after
    construction. No field declares a default, so all of them must be
    supplied when constructing a ``Config`` directly.
    """

    source_directories: List[Path]    # Directories to process
    input_file: Optional[Path]       # File with list of paths
    input_include_files: List[Path]  # Additional include files
    output: OutputConfig            # Output configuration
    filter: FilterConfig            # File filtering rules
    encoding: EncodingConfig        # Encoding settings
    security: SecurityConfig        # Security settings
    archive: ArchiveConfig          # Archive settings
    logging: LoggingConfig          # Logging configuration
    preset: PresetConfig            # Preset configuration

    @classmethod
    def from_args(cls, args: argparse.Namespace) -> "Config":
        """Create configuration from parsed arguments.

        Args:
            args: Namespace produced by ``argparse``

        Returns:
            A new ``Config`` instance
        """

OutputConfig

Configuration for output file generation.

@dataclass(frozen=True)
class OutputConfig:
    """Configuration for output settings.

    Frozen dataclass. Only ``output_file`` is required; every other
    field has the default shown.
    """

    output_file: Path                              # Output file path (required)
    add_timestamp: bool = False                    # Add timestamp to filename
    filename_mtime_hash: bool = False              # Add mtime hash
    force_overwrite: bool = False                  # Force overwrite existing
    minimal_output: bool = False                   # Skip auxiliary files
    skip_output_file: bool = False                 # Skip main output
    separator_style: SeparatorStyle = SeparatorStyle.DETAILED  # Separator style between files
    line_ending: LineEnding = LineEnding.LF        # Line ending style (default LF)
    parallel: bool = True                          # Enable parallel processing
    enable_content_deduplication: bool = True      # Enable deduplication

FilterConfig

Configuration for file filtering and selection.

@dataclass(frozen=True)
class FilterConfig:
    """Configuration for file filtering.

    Frozen dataclass. As declared here, the first seven fields have no
    default and are therefore required; the remaining fields are optional.
    """

    exclude_paths: Set[str]                        # Paths to exclude
    exclude_patterns: List[str]                    # Gitignore patterns
    # A single string or a list of strings, or None.
    # NOTE(review): presumably path(s) of file(s) listing paths to exclude - confirm.
    exclude_paths_file: Optional[Union[str, List[str]]]
    # A single string or a list of strings, or None.
    # NOTE(review): presumably path(s) of file(s) listing paths to include - confirm.
    include_paths_file: Optional[Union[str, List[str]]]
    include_patterns: List[str]                    # Include patterns
    include_extensions: Set[str]                   # File extensions to include
    exclude_extensions: Set[str]                   # File extensions to exclude
    docs_only: bool = False                        # Only documentation files
    include_dot_paths: bool = False                # Include hidden files
    include_binary_files: bool = False             # Include binary files
    include_symlinks: bool = False                 # Follow symlinks
    no_default_excludes: bool = False              # Disable default excludes
    max_file_size: Optional[int] = None            # Max file size in bytes (None = unset)
    remove_scraped_metadata: bool = False          # Remove HTML2MD metadata

Enumerations

from m1f.config import SeparatorStyle, LineEnding, ArchiveType, SecurityCheckMode

class SeparatorStyle(Enum):
    """Separator styles between files.

    Each member's value is the style name as a string; note that
    ``NONE`` carries the string ``"None"``, not the ``None`` object.
    """
    STANDARD = "Standard"              # Simple file path separator
    DETAILED = "Detailed"              # Full metadata (default)
    MARKDOWN = "Markdown"              # Markdown formatted
    MACHINE_READABLE = "MachineReadable"  # JSON metadata blocks
    NONE = "None"                      # No separators

class LineEnding(Enum):
    """Line ending styles.

    Each member's value is the literal line-terminator string.
    """
    LF = "\n"                          # Unix style (default)
    CRLF = "\r\n"                      # Windows style

class ArchiveType(Enum):
    """Archive file formats.

    Each member's value is the format name as a lowercase string.
    """
    ZIP = "zip"                        # ZIP archive
    TAR_GZ = "tar.gz"                  # Gzipped tar archive

class SecurityCheckMode(Enum):
    """Security check behaviors.

    Selects what happens when the security scan flags a file.
    """
    ABORT = "abort"                    # Stop on security issues
    SKIP = "skip"                      # Skip flagged files
    WARN = "warn"                      # Include with warning

File Processing

FileProcessor

Handles file discovery, filtering, and reading.

from m1f.file_processor import FileProcessor

class FileProcessor:
    """Handles file discovery and processing.

    ``gather_files`` and ``read_file_content`` are coroutines and must
    be awaited.
    """

    def __init__(self, config: Config, logger_manager: LoggerManager) -> None:
        """Initialize with configuration and logger.

        Args:
            config: Configuration object
            logger_manager: Logger manager
        """

    async def gather_files(self) -> List[Tuple[Path, str]]:
        """
        Gather all files to process based on configuration.

        Returns:
            List of (absolute_path, relative_path) tuples
        """

    async def read_file_content(self, file_path: Path) -> Tuple[str, EncodingInfo]:
        """
        Read file content with encoding detection.

        Args:
            file_path: Path to the file

        Returns:
            Tuple of (content, encoding_info)
        """

Output and Formatting

OutputWriter

Handles writing the combined output file.

from m1f.output_writer import OutputWriter

class OutputWriter:
    """Handles output file generation."""

    async def write_output(
        self,
        output_path: Path,
        processed_files: List[ProcessedFile],
        deduplicated_files: Optional[List[Tuple[Path, str]]] = None
    ) -> int:
        """
        Write the combined output file.

        Coroutine; must be awaited.

        Args:
            output_path: Path for output file
            processed_files: List of processed files with content
            deduplicated_files: Files skipped due to duplication, as
                (path, str) tuples; optional, defaults to None

        Returns:
            Estimated token count
        """

SeparatorGenerator

Generates separators between files based on style.

from m1f.separator_generator import SeparatorGenerator

class SeparatorGenerator:
    """Generates file separators based on style."""

    def generate(
        self,
        file_path: Path,
        relative_path: str,
        metadata: Dict[str, Any]
    ) -> str:
        """
        Generate separator for a file.

        Unlike most of the m1f API documented here, this is a regular
        (synchronous) method.

        Args:
            file_path: Absolute file path
            relative_path: Relative path for display
            metadata: File metadata (size, mtime, checksum, etc.)

        Returns:
            Formatted separator string
        """

Security and Encoding

SecurityScanner

Scans files for sensitive information.

from m1f.security_scanner import SecurityScanner

class SecurityScanner:
    """Handles security scanning of files."""

    async def scan_files(
        self,
        files: List[Tuple[Path, str]]
    ) -> Tuple[List[Tuple[Path, str]], List[str]]:
        """
        Scan files for sensitive information.

        Coroutine; must be awaited.

        Args:
            files: List of (absolute_path, relative_path) tuples

        Returns:
            Tuple of (safe_files, flagged_files): safe_files has the same
            tuple shape as the input list, flagged_files is a list of
            strings
        """

EncodingHandler

Handles file encoding detection and conversion.

from m1f.encoding_handler import EncodingHandler, EncodingInfo

@dataclass
class EncodingInfo:
    """Information about file encoding.

    All three fields are required (none declares a default).
    """
    encoding: str              # Detected encoding
    confidence: float          # Detection confidence (0-1)
    bom: Optional[str]         # BOM type if present, otherwise None

class EncodingHandler:
    """Handles encoding detection and conversion."""

    def detect_encoding(self, file_path: Path) -> EncodingInfo:
        """Detect file encoding with confidence.

        Synchronous method.

        Args:
            file_path: Path to the file to inspect

        Returns:
            EncodingInfo describing the detected encoding
        """

    async def read_with_encoding(
        self, 
        file_path: Path,
        target_encoding: Optional[str] = None
    ) -> Tuple[str, EncodingInfo]:
        """Read file with automatic encoding detection.

        Coroutine; must be awaited.

        Args:
            file_path: Path to the file to read
            target_encoding: Optional encoding name; defaults to None.
                NOTE(review): presumably the encoding to convert the
                content to - confirm against the implementation.

        Returns:
            Tuple of (content, EncodingInfo)
        """

Preset System

PresetManager

Manages file processing presets.

from m1f.presets import PresetManager, ProcessingAction, FilePreset

class ProcessingAction(Enum):
    """Available processing actions.

    Each member's value is the action name as a lowercase string.
    """
    MINIFY = "minify"                      # Minify code
    STRIP_TAGS = "strip_tags"              # Remove HTML tags
    STRIP_COMMENTS = "strip_comments"      # Remove comments
    COMPRESS_WHITESPACE = "compress_whitespace"  # Compress whitespace
    REMOVE_EMPTY_LINES = "remove_empty_lines"    # Remove empty lines
    CUSTOM = "custom"                      # Custom processor

@dataclass
class FilePreset:
    """Preset configuration for file types.

    As declared here no field has a default, so all are required.
    """
    extensions: List[str]                  # File extensions
    actions: List[ProcessingAction]        # Processing actions
    strip_tags_config: Optional[Dict]      # Tag stripping config, or None
    custom_processors: List[str]           # Custom processors
    max_file_size: Optional[int]          # Override max size, or None
    security_check: Optional[str]         # Override security, or None

class PresetManager:
    """Manages preset configurations."""

    def load_presets(self, preset_files: List[Path]) -> None:
        """Load preset configurations from files.

        Args:
            preset_files: Paths of the preset files to load
        """

    def get_preset_for_file(self, file_path: Path) -> Optional[FilePreset]:
        """Get applicable preset for a file.

        Args:
            file_path: Path of the file to look up

        Returns:
            A FilePreset, or None (the return type is Optional)
        """

    def process_content(
        self,
        content: str,
        file_path: Path,
        preset: FilePreset
    ) -> str:
        """Apply preset processing to content.

        Args:
            content: Text to process
            file_path: Path of the file the content came from
            preset: Preset to apply

        Returns:
            The processed content
        """

Utilities

Common utility functions used throughout m1f.

from m1f.utils import (
    format_duration,
    format_file_size,
    calculate_checksum,
    is_binary_file,
    parse_file_size,
    validate_path_traversal
)

def format_duration(seconds: float) -> str:
    """Format duration in human-readable format.

    Args:
        seconds: Duration in seconds

    Returns:
        The formatted duration string
    """

def format_file_size(size_bytes: int) -> str:
    """Format file size in human-readable format.

    Args:
        size_bytes: Size in bytes

    Returns:
        The formatted size string
    """

def calculate_checksum(content: str) -> str:
    """Calculate SHA256 checksum of content.

    Args:
        content: Text to hash

    Returns:
        The checksum as a string.
        NOTE(review): presumably a hex digest - confirm.
    """

def is_binary_file(file_path: Path) -> bool:
    """Check if file is binary based on content.

    Args:
        file_path: Path of the file to inspect

    Returns:
        True if the file is considered binary, False otherwise
    """

def parse_file_size(size_str: str) -> int:
    """
    Parse file size string to bytes.

    The examples below use binary multiples (1 KB = 1024 bytes) and
    show that fractional values are accepted.

    Args:
        size_str: Size with a unit suffix, e.g. '50KB' or '1.5MB'

    Returns:
        Size in bytes

    Examples:
        parse_file_size('50KB') -> 51200
        parse_file_size('1.5MB') -> 1572864
    """

def validate_path_traversal(path: Path, base_path: Optional[Path] = None) -> Path:
    """Validate path doesn't escape base directory.

    Args:
        path: Path to validate
        base_path: Base directory to validate against; optional.
            NOTE(review): the base used when this is None is not
            stated here - confirm against the implementation.

    Returns:
        A Path.
        NOTE(review): presumably the validated (possibly resolved)
        path - confirm.
    """

Exceptions

Custom exceptions for error handling.

from m1f.exceptions import (
    M1FError,
    FileNotFoundError,
    PermissionError,
    EncodingError,
    ConfigurationError,
    ValidationError,
    SecurityError,
    ArchiveError
)

class M1FError(Exception):
    """Base exception for all m1f errors.

    Every other exception in this section derives from it, so
    ``except M1FError`` catches them all.
    """

class FileNotFoundError(M1FError):
    """Raised when a required file is not found.

    NOTE: this class has the same name as Python's built-in
    ``FileNotFoundError`` but derives from ``M1FError``, not ``OSError``.
    Importing it shadows the builtin in the importing module, and an
    ``except OSError`` clause will not catch it.
    """

class PermissionError(M1FError):
    """Raised when lacking permissions to access a file.

    NOTE: this class has the same name as Python's built-in
    ``PermissionError`` but derives from ``M1FError``, not ``OSError``.
    Importing it shadows the builtin in the importing module, and an
    ``except OSError`` clause will not catch it.
    """

class EncodingError(M1FError):
    """Raised when encoding/decoding fails.

    Subclass of ``M1FError``.
    """

class ConfigurationError(M1FError):
    """Raised when configuration is invalid.

    Subclass of ``M1FError``.
    """

class ValidationError(M1FError):
    """Raised when validation fails.

    Subclass of ``M1FError``.
    """

class SecurityError(M1FError):
    """Raised when security checks fail.

    Subclass of ``M1FError``.
    """

class ArchiveError(M1FError):
    """Raised when archive operations fail.

    Subclass of ``M1FError``.
    """

s1f Module

Module for splitting combined files back into individual files.

S1F Core Classes

FileSplitter

Main class for extracting files from combined output.

from s1f.core import FileSplitter
from s1f.config import Config

class FileSplitter:
    """Splits combined files into individual files."""

    def __init__(self, config: Config) -> None:
        """Initialize with configuration.

        Args:
            config: s1f configuration object
        """

    async def extract_files(self) -> ExtractionResult:
        """
        Extract files from combined input.

        Coroutine; must be awaited.

        Returns:
            ExtractionResult with extraction statistics
        """

Models and Parsers

Data Models

from s1f.models import FileMetadata, ExtractedFile, ExtractionResult

@dataclass
class FileMetadata:
    """Metadata for an extracted file.

    All fields are required (none declares a default).
    """
    original_path: str                # Original file path
    size_bytes: int                   # File size in bytes
    timestamp: Optional[datetime]     # Modification time, or None
    checksum: Optional[str]          # SHA256 checksum, or None
    encoding: Optional[str]          # Original encoding, or None
    separator_style: str             # Detected style (as a string)

@dataclass
class ExtractedFile:
    """Represents an extracted file.

    All fields are required (none declares a default).
    """
    metadata: FileMetadata           # File metadata
    content: str                     # File content
    line_number: int                # Starting line in input

@dataclass
class ExtractionResult:
    """Result of file extraction.

    All fields are required (none declares a default).
    """
    total_files: int                # Total files extracted
    successful: int                 # Successfully written
    failed: int                     # Failed extractions
    separator_style: str            # Detected separator style
    execution_time: float           # Time taken (numeric; presumably seconds - confirm)

Separator Parsers

from s1f.parsers import CombinedFileParser, SeparatorParser

class CombinedFileParser:
    """Main parser that detects and uses appropriate separator parser."""

    def detect_separator_style(self, content: str) -> str:
        """Detect the separator style used in the file.

        Args:
            content: Full text of the combined file

        Returns:
            The detected style, as a string
        """

    async def parse(self, content: str) -> List[ExtractedFile]:
        """Parse content and extract files.

        Coroutine; must be awaited.

        Args:
            content: Full text of the combined file

        Returns:
            List of ExtractedFile objects
        """

# Concrete SeparatorParser subclasses, one per supported separator style
class StandardParser(SeparatorParser):
    """Parser for Standard separator style.

    Subclass of ``SeparatorParser``.
    """

class DetailedParser(SeparatorParser):
    """Parser for Detailed separator style.

    Subclass of ``SeparatorParser``.
    """

class MarkdownParser(SeparatorParser):
    """Parser for Markdown separator style.

    Subclass of ``SeparatorParser``.
    """

class MachineReadableParser(SeparatorParser):
    """Parser for MachineReadable separator style.

    Subclass of ``SeparatorParser``.
    """

S1F Configuration

from s1f.config import Config

@dataclass
class Config:
    """Configuration for s1f operations.

    Only ``input_file`` and ``output_dir`` are required; the other
    fields have the defaults shown. This is ``s1f.config.Config``, a
    different class from ``m1f.config.Config``.
    """

    input_file: Path                         # Input combined file (required)
    output_dir: Path                         # Output directory (required)
    preserve_structure: bool = True          # Maintain directory structure
    overwrite: bool = False                 # Overwrite existing files
    dry_run: bool = False                   # Preview without writing
    verify_checksums: bool = True           # Verify file checksums
    target_encoding: Optional[str] = None    # Target encoding for output
    verbose: bool = False                   # Verbose output

html2md Module

Module for converting HTML to Markdown.

Converter API

Main API for HTML to Markdown conversion.

from html2md.api import Html2mdConverter, convert_file, convert_html

class Html2mdConverter:
    """Main converter class for HTML to Markdown.

    All methods documented here are regular (synchronous) methods.
    """

    def __init__(self, config: Optional[Config] = None) -> None:
        """Initialize with optional configuration.

        Args:
            config: html2md configuration; optional, defaults to None
        """

    def convert(
        self,
        html_content: str,
        url: Optional[str] = None,
        title: Optional[str] = None
    ) -> str:
        """
        Convert HTML content to Markdown.

        Args:
            html_content: HTML content to convert
            url: Optional source URL for metadata
            title: Optional title override

        Returns:
            Markdown formatted content
        """

    def convert_file(self, file_path: Path) -> Path:
        """Convert HTML file to Markdown file.

        Args:
            file_path: Path of the HTML file

        Returns:
            A Path.
            NOTE(review): presumably the written Markdown file - confirm.
        """

    def convert_directory(
        self,
        source_dir: Path,
        dest_dir: Path,
        recursive: bool = True
    ) -> List[Path]:
        """Convert all HTML files in directory.

        Args:
            source_dir: Directory containing the HTML files
            dest_dir: Destination directory
            recursive: Defaults to True.
                NOTE(review): presumably controls descent into
                subdirectories - confirm.

        Returns:
            List of Paths.
            NOTE(review): presumably the generated Markdown files - confirm.
        """

# Convenience functions
def convert_file(file_path: Union[str, Path], **kwargs) -> Path:
    """Convert single HTML file to Markdown.

    Args:
        file_path: HTML file to convert, as a string or Path
        **kwargs: Additional keyword arguments.
            NOTE(review): the accepted keywords are not listed here -
            confirm against the implementation.

    Returns:
        A Path.
        NOTE(review): presumably the generated Markdown file - confirm.
    """

def convert_html(html_content: str, **kwargs) -> str:
    """Convert HTML string to Markdown.

    Args:
        html_content: HTML markup to convert
        **kwargs: Additional keyword arguments.
            NOTE(review): the accepted keywords are not listed here -
            confirm against the implementation.

    Returns:
        The Markdown text
    """

def convert_url(url: str, destination_dir: Union[str, Path] = ".", **kwargs) -> Path:
    """Download and convert URL to Markdown.

    Args:
        url: URL of the page to download
        destination_dir: Destination directory, as a string or Path;
            defaults to the current directory (".")
        **kwargs: Additional keyword arguments.
            NOTE(review): the accepted keywords are not listed here -
            confirm against the implementation.

    Returns:
        A Path.
        NOTE(review): presumably the generated Markdown file - confirm.
    """

HTML2MD Configuration

from html2md.config.models import Config, ConversionOptions, ExtractorConfig

@dataclass
class ConversionOptions:
    """Options for HTML to Markdown conversion.

    Every field has a default, so ``ConversionOptions()`` with no
    arguments is valid.
    """

    include_meta: bool = True              # Include metadata
    preserve_links: bool = True            # Keep hyperlinks
    code_language: str = ""                # Default code language (empty by default)
    heading_style: str = "atx"             # ATX (#) or Setext
    bullet_style: str = "-"                # Bullet character
    emphasis_style: str = "*"              # Emphasis character
    strong_style: str = "**"               # Strong emphasis
    code_block_style: str = "fenced"      # Fenced or indented
    fence_style: str = "```"               # Code fence style

@dataclass
class ExtractorConfig:
    """Configuration for content extraction.

    As declared here no field has a default, so all four are required.
    """

    content_selector: Optional[str]        # CSS selector for content, or None
    title_selector: Optional[str]          # CSS selector for title, or None
    remove_selectors: List[str]           # Elements to remove
    attribute_selectors: Dict[str, str]    # Extract attributes

@dataclass
class Config:
    """Main configuration for html2md.

    As declared here ``conversion`` and ``extractor`` have no default
    and are required; the remaining fields are optional. This is
    ``html2md.config.models.Config``, a different class from the m1f
    and s1f ``Config`` classes.
    """

    conversion: ConversionOptions          # Conversion options (required)
    extractor: ExtractorConfig            # Extraction config (required)
    output_dir: Path = Path(".")          # Output directory (default: current directory)
    preserve_structure: bool = True        # Keep directory structure
    add_metadata: bool = True             # Add source metadata

Extractors and Processors

Content Extractors

from html2md.extractors import BaseExtractor, DefaultExtractor

class BaseExtractor:
    """Base class for content extractors.

    Each method receives the parsed document as a ``BeautifulSoup``
    object.
    """

    def extract_content(self, soup: BeautifulSoup) -> str:
        """Extract main content from HTML.

        Args:
            soup: Parsed HTML document

        Returns:
            The extracted content as a string
        """

    def extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract page title.

        Args:
            soup: Parsed HTML document

        Returns:
            The title, or None (the return type is Optional)
        """

    def extract_metadata(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract metadata from HTML.

        Args:
            soup: Parsed HTML document

        Returns:
            Dictionary of metadata keyed by string
        """

class DefaultExtractor(BaseExtractor):
    """Default content extractor with heuristics.

    Subclass of ``BaseExtractor``.
    """

    def __init__(self, config: Optional[ExtractorConfig] = None) -> None:
        """Initialize with optional configuration.

        Args:
            config: Extractor configuration; optional, defaults to None
        """

HTML Preprocessors

from html2md.preprocessors import PreprocessingConfig, preprocess_html

@dataclass
class PreprocessingConfig:
    """Configuration for HTML preprocessing.

    Every field has a default, so ``PreprocessingConfig()`` with no
    arguments is valid. All steps are enabled by default except
    ``unwrap_divs``.
    """

    fix_encoding: bool = True              # Fix encoding issues
    clean_whitespace: bool = True          # Normalize whitespace
    remove_scripts: bool = True            # Remove script tags
    remove_styles: bool = True             # Remove style tags
    remove_comments: bool = True           # Remove HTML comments
    unwrap_divs: bool = False             # Unwrap unnecessary divs (off by default)
    fix_lists: bool = True                # Fix malformed lists

def preprocess_html(html_content: str, config: PreprocessingConfig) -> str:
    """Preprocess HTML before conversion.

    Args:
        html_content: HTML markup to preprocess
        config: Preprocessing configuration (required; no default)

    Returns:
        The preprocessed HTML string
    """

Usage Examples

Basic m1f Usage

import asyncio
from pathlib import Path
from m1f.config import Config, OutputConfig, FilterConfig, SeparatorStyle
from m1f.core import FileCombiner
from m1f.logging import setup_logging
from m1f.utils import parse_file_size

async def combine_files() -> None:
    """Example of programmatic m1f usage.

    Builds a Config, sets up logging, runs a FileCombiner and prints
    a short summary of the ProcessingResult.
    """

    # Create configuration.
    # NOTE: this is an abbreviated example - the remaining Config sections
    # (and any FilterConfig fields without defaults) must also be supplied.
    config = Config(
        source_directories=[Path("./src")],
        input_file=None,
        input_include_files=[],
        output=OutputConfig(
            output_file=Path("combined.txt"),
            separator_style=SeparatorStyle.MARKDOWN
        ),
        filter=FilterConfig(
            include_extensions={".py", ".md"},
            max_file_size=parse_file_size("100KB")
        ),
        # ... other config sections
    )

    # Setup logging
    logger_manager = setup_logging(config)

    # Create and run combiner; run() is a coroutine, so it is awaited
    combiner = FileCombiner(config, logger_manager)
    result = await combiner.run()

    print(f"Processed {result.files_processed} files")
    print(f"Output: {result.output_file}")

# Run the example
asyncio.run(combine_files())

Basic s1f Usage

import asyncio
from pathlib import Path
from s1f.core import FileSplitter
from s1f.config import Config

async def split_files():
    """Demonstrate driving s1f from Python code.

    Describes an extraction job, runs it, and prints how many files
    were written successfully.
    """

    # Describe the extraction job
    settings = Config(
        input_file=Path("combined.txt"),
        output_dir=Path("./extracted"),
        preserve_structure=True,
        verify_checksums=True
    )

    # Build the splitter and await the extraction in a single step
    result = await FileSplitter(settings).extract_files()

    print(f"Extracted {result.successful} files")

# Run the example
asyncio.run(split_files())

Basic html2md Usage

from html2md import convert_file, convert_html, Html2mdConverter
from html2md.config.models import Config, ConversionOptions

# Sample input shared by the string-based conversions below
html_content = "<h1>Hello</h1><p>World</p>"

# Simple conversion
markdown = convert_html(html_content)

# File conversion
output_path = convert_file("page.html")

# Advanced usage with configuration
config = Config(
    conversion=ConversionOptions(
        heading_style="atx",
        code_block_style="fenced",
        include_meta=True
    )
)

# Reuse one converter instance; the URL is passed as source metadata
converter = Html2mdConverter(config)
markdown = converter.convert(html_content, url="https://example.com")

Error Handling

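All modules follow the same error-handling pattern: each package defines its own base exception (M1FError, S1FError, Html2mdError). Catch the most specific exceptions first and the base class last: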

from m1f.exceptions import M1FError
from s1f.exceptions import S1FError
from html2md.exceptions import Html2mdError

try:
    # m1f operations
    result = await combiner.run()
except FileNotFoundError as e:
    print(f"File not found: {e}")
except SecurityError as e:
    print(f"Security issue: {e}")
except M1FError as e:
    print(f"M1F error: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

Threading and Async

All modules support asynchronous operations for better performance:

  • m1f uses async I/O for parallel file processing
  • s1f uses async for concurrent file extraction
  • html2md supports async for batch conversions

For example:
# Enable parallel processing in m1f
# (fragment: the remaining Config fields are elided by the "# ..." below)
config = Config(
    output=OutputConfig(
        output_file=Path("output.txt"),
        parallel=True  # Default is True
    ),
    # ...
)

# Async batch operations
async def process_many_files():
    """Process every path in ``file_paths`` concurrently.

    Relies on ``combiner`` and ``file_paths`` being defined in the
    enclosing scope.
    NOTE(review): ``combiner.process_file`` is not part of the
    FileCombiner API listed in this reference - confirm it exists.

    Returns:
        The list produced by ``asyncio.gather``, one result per path,
        in the same order as ``file_paths``
    """
    # Create all awaitables first so gather can run them concurrently
    tasks = [combiner.process_file(file_path) for file_path in file_paths]
    return await asyncio.gather(*tasks)

See Also