Reference
Complete API Reference
Comprehensive API reference for all m1f modules and functions
Complete API Reference
This comprehensive reference documents all public APIs, classes, and functions available in the m1f tool suite (v3.4.0).
Table of Contents
m1f Module
The main module for combining multiple files into a single output file.
Module Information
import m1f
# Version information
m1f.__version__ # '3.4.0'
m1f.__version_info__ # (3, 4, 0)
m1f.__author__ # 'Franz und Franz (https://franz.agency)'
m1f.__project__ # 'https://m1f.dev'
Core Classes
FileCombiner
The main orchestrator for file combination operations.
from m1f.core import FileCombiner, ProcessingResult
from m1f.config import Config
from m1f.logging import LoggerManager
class FileCombiner:
"""Main class that orchestrates the file combination process."""
def __init__(self, config: Config, logger_manager: LoggerManager):
"""
Initialize the FileCombiner.
Args:
config: Configuration object with all settings
logger_manager: Logger manager for handling logs
"""
async def run(self) -> ProcessingResult:
"""
Run the file combination process.
Returns:
ProcessingResult with statistics and output paths
"""
ProcessingResult
Data class containing the results of file processing.
@dataclass
class ProcessingResult:
"""Result of the file processing operation."""
files_processed: int # Number of files successfully processed
total_files: int # Total number of files found
execution_time: str # Formatted execution time
output_file: Optional[Path] # Path to the generated output file
archive_file: Optional[Path] # Path to the archive file (if created)
token_count: Optional[int] # Estimated token count (if calculated)
flagged_files: List[str] # Files flagged by security scan
Configuration Classes
Config
The main configuration class that combines all settings.
from m1f.config import Config
@dataclass(frozen=True)
class Config:
"""Main configuration class that combines all settings."""
source_directories: List[Path] # Directories to process
input_file: Optional[Path] # File with list of paths
input_include_files: List[Path] # Additional include files
output: OutputConfig # Output configuration
filter: FilterConfig # File filtering rules
encoding: EncodingConfig # Encoding settings
security: SecurityConfig # Security settings
archive: ArchiveConfig # Archive settings
logging: LoggingConfig # Logging configuration
preset: PresetConfig # Preset configuration
@classmethod
def from_args(cls, args: argparse.Namespace) -> Config:
"""Create configuration from parsed arguments."""
OutputConfig
Configuration for output file generation.
@dataclass(frozen=True)
class OutputConfig:
"""Configuration for output settings."""
output_file: Path # Output file path
add_timestamp: bool = False # Add timestamp to filename
filename_mtime_hash: bool = False # Add mtime hash
force_overwrite: bool = False # Force overwrite existing
minimal_output: bool = False # Skip auxiliary files
skip_output_file: bool = False # Skip main output
separator_style: SeparatorStyle = SeparatorStyle.DETAILED
line_ending: LineEnding = LineEnding.LF
parallel: bool = True # Enable parallel processing
enable_content_deduplication: bool = True # Enable deduplication
FilterConfig
Configuration for file filtering and selection.
@dataclass(frozen=True)
class FilterConfig:
"""Configuration for file filtering."""
exclude_paths: Set[str] # Paths to exclude
exclude_patterns: List[str] # Gitignore patterns
exclude_paths_file: Optional[Union[str, List[str]]]
include_paths_file: Optional[Union[str, List[str]]]
include_patterns: List[str] # Include patterns
include_extensions: Set[str] # File extensions to include
exclude_extensions: Set[str] # File extensions to exclude
docs_only: bool = False # Only documentation files
include_dot_paths: bool = False # Include hidden files
include_binary_files: bool = False # Include binary files
include_symlinks: bool = False # Follow symlinks
no_default_excludes: bool = False # Disable default excludes
max_file_size: Optional[int] = None # Max file size in bytes
remove_scraped_metadata: bool = False # Remove HTML2MD metadata
Enumerations
from m1f.config import SeparatorStyle, LineEnding, ArchiveType, SecurityCheckMode
class SeparatorStyle(Enum):
"""Separator styles between files."""
STANDARD = "Standard" # Simple file path separator
DETAILED = "Detailed" # Full metadata (default)
MARKDOWN = "Markdown" # Markdown formatted
MACHINE_READABLE = "MachineReadable" # JSON metadata blocks
NONE = "None" # No separators
class LineEnding(Enum):
"""Line ending styles."""
LF = "\n" # Unix style (default)
CRLF = "\r\n" # Windows style
class ArchiveType(Enum):
"""Archive file formats."""
ZIP = "zip" # ZIP archive
TAR_GZ = "tar.gz" # Gzipped tar archive
class SecurityCheckMode(Enum):
"""Security check behaviors."""
ABORT = "abort" # Stop on security issues
SKIP = "skip" # Skip flagged files
WARN = "warn" # Include with warning
File Processing
FileProcessor
Handles file discovery, filtering, and reading.
from m1f.file_processor import FileProcessor
class FileProcessor:
"""Handles file discovery and processing."""
def __init__(self, config: Config, logger_manager: LoggerManager):
"""Initialize with configuration and logger."""
async def gather_files(self) -> List[Tuple[Path, str]]:
"""
Gather all files to process based on configuration.
Returns:
List of (absolute_path, relative_path) tuples
"""
async def read_file_content(self, file_path: Path) -> Tuple[str, EncodingInfo]:
"""
Read file content with encoding detection.
Args:
file_path: Path to the file
Returns:
Tuple of (content, encoding_info)
"""
Output and Formatting
OutputWriter
Handles writing the combined output file.
from m1f.output_writer import OutputWriter
class OutputWriter:
"""Handles output file generation."""
async def write_output(
self,
output_path: Path,
processed_files: List[ProcessedFile],
deduplicated_files: Optional[List[Tuple[Path, str]]] = None
) -> int:
"""
Write the combined output file.
Args:
output_path: Path for output file
processed_files: List of processed files with content
deduplicated_files: Files skipped due to duplication
Returns:
Estimated token count
"""
SeparatorGenerator
Generates separators between files based on style.
from m1f.separator_generator import SeparatorGenerator
class SeparatorGenerator:
"""Generates file separators based on style."""
def generate(
self,
file_path: Path,
relative_path: str,
metadata: Dict[str, Any]
) -> str:
"""
Generate separator for a file.
Args:
file_path: Absolute file path
relative_path: Relative path for display
metadata: File metadata (size, mtime, checksum, etc.)
Returns:
Formatted separator string
"""
Security and Encoding
SecurityScanner
Scans files for sensitive information.
from m1f.security_scanner import SecurityScanner
class SecurityScanner:
"""Handles security scanning of files."""
async def scan_files(
self,
files: List[Tuple[Path, str]]
) -> Tuple[List[Tuple[Path, str]], List[str]]:
"""
Scan files for sensitive information.
Args:
files: List of (absolute_path, relative_path) tuples
Returns:
Tuple of (safe_files, flagged_files)
"""
EncodingHandler
Handles file encoding detection and conversion.
from m1f.encoding_handler import EncodingHandler, EncodingInfo
@dataclass
class EncodingInfo:
"""Information about file encoding."""
encoding: str # Detected encoding
confidence: float # Detection confidence (0-1)
bom: Optional[str] # BOM type if present
class EncodingHandler:
"""Handles encoding detection and conversion."""
def detect_encoding(self, file_path: Path) -> EncodingInfo:
"""Detect file encoding with confidence."""
async def read_with_encoding(
self,
file_path: Path,
target_encoding: Optional[str] = None
) -> Tuple[str, EncodingInfo]:
"""Read file with automatic encoding detection."""
Preset System
PresetManager
Manages file processing presets.
from m1f.presets import PresetManager, ProcessingAction, FilePreset
class ProcessingAction(Enum):
"""Available processing actions."""
MINIFY = "minify" # Minify code
STRIP_TAGS = "strip_tags" # Remove HTML tags
STRIP_COMMENTS = "strip_comments" # Remove comments
COMPRESS_WHITESPACE = "compress_whitespace"
REMOVE_EMPTY_LINES = "remove_empty_lines"
CUSTOM = "custom" # Custom processor
@dataclass
class FilePreset:
"""Preset configuration for file types."""
extensions: List[str] # File extensions
actions: List[ProcessingAction] # Processing actions
strip_tags_config: Optional[Dict] # Tag stripping config
custom_processors: List[str] # Custom processors
max_file_size: Optional[int] # Override max size
security_check: Optional[str] # Override security
class PresetManager:
"""Manages preset configurations."""
def load_presets(self, preset_files: List[Path]) -> None:
"""Load preset configurations from files."""
def get_preset_for_file(self, file_path: Path) -> Optional[FilePreset]:
"""Get applicable preset for a file."""
def process_content(
self,
content: str,
file_path: Path,
preset: FilePreset
) -> str:
"""Apply preset processing to content."""
Utilities
Common utility functions used throughout m1f.
from m1f.utils import (
format_duration,
format_file_size,
calculate_checksum,
is_binary_file,
parse_file_size,
validate_path_traversal
)
def format_duration(seconds: float) -> str:
"""Format duration in human-readable format."""
def format_file_size(size_bytes: int) -> str:
"""Format file size in human-readable format."""
def calculate_checksum(content: str) -> str:
"""Calculate SHA256 checksum of content."""
def is_binary_file(file_path: Path) -> bool:
"""Check if file is binary based on content."""
def parse_file_size(size_str: str) -> int:
"""
Parse file size string to bytes.
Examples:
parse_file_size('50KB') -> 51200
parse_file_size('1.5MB') -> 1572864
"""
def validate_path_traversal(path: Path, base_path: Optional[Path] = None) -> Path:
"""Validate path doesn't escape base directory."""
Exceptions
Custom exceptions for error handling.
from m1f.exceptions import (
M1FError,
FileNotFoundError,
PermissionError,
EncodingError,
ConfigurationError,
ValidationError,
SecurityError,
ArchiveError
)
class M1FError(Exception):
"""Base exception for all m1f errors."""
class FileNotFoundError(M1FError):
"""Raised when a required file is not found."""
class PermissionError(M1FError):
"""Raised when lacking permissions to access a file."""
class EncodingError(M1FError):
"""Raised when encoding/decoding fails."""
class ConfigurationError(M1FError):
"""Raised when configuration is invalid."""
class ValidationError(M1FError):
"""Raised when validation fails."""
class SecurityError(M1FError):
"""Raised when security checks fail."""
class ArchiveError(M1FError):
"""Raised when archive operations fail."""
s1f Module
Module for splitting combined files back into individual files.
S1F Core Classes
FileSplitter
Main class for extracting files from combined output.
from s1f.core import FileSplitter
from s1f.config import Config
class FileSplitter:
"""Splits combined files into individual files."""
def __init__(self, config: Config):
"""Initialize with configuration."""
async def extract_files(self) -> ExtractionResult:
"""
Extract files from combined input.
Returns:
ExtractionResult with extraction statistics
"""
Models and Parsers
Data Models
from s1f.models import FileMetadata, ExtractedFile, ExtractionResult
@dataclass
class FileMetadata:
"""Metadata for an extracted file."""
original_path: str # Original file path
size_bytes: int # File size
timestamp: Optional[datetime] # Modification time
checksum: Optional[str] # SHA256 checksum
encoding: Optional[str] # Original encoding
separator_style: str # Detected style
@dataclass
class ExtractedFile:
"""Represents an extracted file."""
metadata: FileMetadata # File metadata
content: str # File content
line_number: int # Starting line in input
@dataclass
class ExtractionResult:
"""Result of file extraction."""
total_files: int # Total files extracted
successful: int # Successfully written
failed: int # Failed extractions
separator_style: str # Detected separator style
execution_time: float # Time taken
Separator Parsers
from s1f.parsers import CombinedFileParser, SeparatorParser
class CombinedFileParser:
"""Main parser that detects and uses appropriate separator parser."""
def detect_separator_style(self, content: str) -> str:
"""Detect the separator style used in the file."""
async def parse(self, content: str) -> List[ExtractedFile]:
"""Parse content and extract files."""
# Available parsers for each separator style
class StandardParser(SeparatorParser):
"""Parser for Standard separator style."""
class DetailedParser(SeparatorParser):
"""Parser for Detailed separator style."""
class MarkdownParser(SeparatorParser):
"""Parser for Markdown separator style."""
class MachineReadableParser(SeparatorParser):
"""Parser for MachineReadable separator style."""
S1F Configuration
from s1f.config import Config
@dataclass
class Config:
"""Configuration for s1f operations."""
input_file: Path # Input combined file
output_dir: Path # Output directory
preserve_structure: bool = True # Maintain directory structure
overwrite: bool = False # Overwrite existing files
dry_run: bool = False # Preview without writing
verify_checksums: bool = True # Verify file checksums
target_encoding: Optional[str] = None # Target encoding for output
verbose: bool = False # Verbose output
html2md Module
Module for converting HTML to Markdown.
Converter API
Main API for HTML to Markdown conversion.
from html2md.api import Html2mdConverter, convert_file, convert_html
class Html2mdConverter:
"""Main converter class for HTML to Markdown."""
def __init__(self, config: Optional[Config] = None):
"""Initialize with optional configuration."""
def convert(
self,
html_content: str,
url: Optional[str] = None,
title: Optional[str] = None
) -> str:
"""
Convert HTML content to Markdown.
Args:
html_content: HTML content to convert
url: Optional source URL for metadata
title: Optional title override
Returns:
Markdown formatted content
"""
def convert_file(self, file_path: Path) -> Path:
"""Convert HTML file to Markdown file."""
def convert_directory(
self,
source_dir: Path,
dest_dir: Path,
recursive: bool = True
) -> List[Path]:
"""Convert all HTML files in directory."""
# Convenience functions
def convert_file(file_path: Union[str, Path], **kwargs) -> Path:
"""Convert single HTML file to Markdown."""
def convert_html(html_content: str, **kwargs) -> str:
"""Convert HTML string to Markdown."""
def convert_url(url: str, destination_dir: Union[str, Path] = ".", **kwargs) -> Path:
"""Download and convert URL to Markdown."""
HTML2MD Configuration
from html2md.config.models import Config, ConversionOptions, ExtractorConfig
@dataclass
class ConversionOptions:
"""Options for HTML to Markdown conversion."""
include_meta: bool = True # Include metadata
preserve_links: bool = True # Keep hyperlinks
code_language: str = "" # Default code language
heading_style: str = "atx" # ATX (#) or Setext
bullet_style: str = "-" # Bullet character
emphasis_style: str = "*" # Emphasis character
strong_style: str = "**" # Strong emphasis
code_block_style: str = "fenced" # Fenced or indented
fence_style: str = "```" # Code fence style
@dataclass
class ExtractorConfig:
"""Configuration for content extraction."""
content_selector: Optional[str] # CSS selector for content
title_selector: Optional[str] # CSS selector for title
remove_selectors: List[str] # Elements to remove
attribute_selectors: Dict[str, str] # Extract attributes
@dataclass
class Config:
"""Main configuration for html2md."""
conversion: ConversionOptions # Conversion options
extractor: ExtractorConfig # Extraction config
output_dir: Path = Path(".") # Output directory
preserve_structure: bool = True # Keep directory structure
add_metadata: bool = True # Add source metadata
Extractors and Processors
Content Extractors
from html2md.extractors import BaseExtractor, DefaultExtractor
class BaseExtractor:
"""Base class for content extractors."""
def extract_content(self, soup: BeautifulSoup) -> str:
"""Extract main content from HTML."""
def extract_title(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract page title."""
def extract_metadata(self, soup: BeautifulSoup) -> Dict[str, Any]:
"""Extract metadata from HTML."""
class DefaultExtractor(BaseExtractor):
"""Default content extractor with heuristics."""
def __init__(self, config: Optional[ExtractorConfig] = None):
"""Initialize with optional configuration."""
HTML Preprocessors
from html2md.preprocessors import PreprocessingConfig, preprocess_html
@dataclass
class PreprocessingConfig:
"""Configuration for HTML preprocessing."""
fix_encoding: bool = True # Fix encoding issues
clean_whitespace: bool = True # Normalize whitespace
remove_scripts: bool = True # Remove script tags
remove_styles: bool = True # Remove style tags
remove_comments: bool = True # Remove HTML comments
unwrap_divs: bool = False # Unwrap unnecessary divs
fix_lists: bool = True # Fix malformed lists
def preprocess_html(html_content: str, config: PreprocessingConfig) -> str:
"""Preprocess HTML before conversion."""
Usage Examples
Basic m1f Usage
import asyncio
from pathlib import Path
from m1f.config import Config, OutputConfig, FilterConfig, SeparatorStyle
from m1f.core import FileCombiner
from m1f.logging import setup_logging
from m1f.utils import parse_file_size
async def combine_files():
"""Example of programmatic m1f usage."""
# Create configuration
config = Config(
source_directories=[Path("./src")],
input_file=None,
input_include_files=[],
output=OutputConfig(
output_file=Path("combined.txt"),
separator_style=SeparatorStyle.MARKDOWN
),
filter=FilterConfig(
include_extensions={".py", ".md"},
max_file_size=parse_file_size("100KB")
),
# ... other config sections
)
# Setup logging
logger_manager = setup_logging(config)
# Create and run combiner
combiner = FileCombiner(config, logger_manager)
result = await combiner.run()
print(f"Processed {result.files_processed} files")
print(f"Output: {result.output_file}")
# Run the example
asyncio.run(combine_files())
Basic s1f Usage
import asyncio
from pathlib import Path
from s1f.core import FileSplitter
from s1f.config import Config
async def split_files():
"""Example of programmatic s1f usage."""
# Create configuration
config = Config(
input_file=Path("combined.txt"),
output_dir=Path("./extracted"),
preserve_structure=True,
verify_checksums=True
)
# Create and run splitter
splitter = FileSplitter(config)
result = await splitter.extract_files()
print(f"Extracted {result.successful} files")
# Run the example
asyncio.run(split_files())
Basic html2md Usage
from html2md.api import Html2mdConverter, convert_file, convert_html
from html2md.config.models import Config, ConversionOptions
# Simple conversion
markdown = convert_html("<h1>Hello</h1><p>World</p>")
# File conversion
output_path = convert_file("page.html")
# Advanced usage with configuration
config = Config(
conversion=ConversionOptions(
heading_style="atx",
code_block_style="fenced",
include_meta=True
)
)
converter = Html2mdConverter(config)
markdown = converter.convert(html_content, url="https://example.com")
Error Handling
All modules use consistent error handling patterns:
from m1f.exceptions import M1FError, FileNotFoundError, SecurityError
from s1f.exceptions import S1FError
from html2md.exceptions import Html2mdError
try:
# m1f operations
result = await combiner.run()
except FileNotFoundError as e:
print(f"File not found: {e}")
except SecurityError as e:
print(f"Security issue: {e}")
except M1FError as e:
print(f"M1F error: {e}")
except Exception as e:
print(f"Unexpected error: {e}")
Threading and Async
All modules support asynchronous operations for better performance:
- m1f uses async I/O for parallel file processing
- s1f uses async for concurrent file extraction
- html2md supports async for batch conversions
# Enable parallel processing in m1f
config = Config(
output=OutputConfig(
output_file=Path("output.txt"),
parallel=True # Default is True
),
# ...
)
# Async batch operations
async def process_many_files():
tasks = []
for file_path in file_paths:
task = combiner.process_file(file_path)
tasks.append(task)
results = await asyncio.gather(*tasks)
return results
See Also
- Python API Guide - Practical guide for using the Python API
- CLI Reference - Complete command-line reference
- Configuration Guide - Detailed configuration options
- Preset System - File processing presets
- Previous
- Claude Integration