diff --git a/configs/config.yaml b/configs/config.yaml
index b105ac75..b2a90131 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -11,6 +11,7 @@ paths:
     generated: "data/generated" # Stage 2: Where generated QA pairs are saved (create output)
     curated: "data/curated" # Stage 3: Where curated QA pairs are saved (curate output)
     final: "data/final" # Stage 4: Where final training formats are saved (save-as output)
+    translated: "data/translated" # Where translated files are saved (translate output)
 
 # LLM Provider configuration
 llm:
@@ -173,3 +174,27 @@ prompts:
 
     Original conversations:
     {conversations}
+
+  # Translation prompt
+  translation: |
+    The following prompt may have {source_lang} syntax, variable types, and {source_lang}-specific behavior mentioned. Translate the question to the appropriate {target_lang} syntax and replace anything mentioning {source_lang} with {target_lang}.
+
+    Preserve the meaning and intent of the original problem statement while adapting:
+    - Function signatures and type annotations to {target_lang} conventions
+    - Variable types (e.g., {source_lang} types to {target_lang} types)
+    - Language-specific syntax and idioms
+    - Any mentions of {source_lang} should be replaced with {target_lang}
+
+    Guidelines:
+    - Use simple, standard {target_lang} types and features
+    - Avoid advanced {target_lang} features unless necessary
+    - Maintain the logical structure and problem requirements
+    - Keep all mathematical formulas and logic unchanged
+
+    Only return the translated question/prompt and do not answer the question or provide code solutions.
+
+    {source_lang} Prompt:
+
+    {prompt}
+
+    {target_lang} Prompt:
diff --git a/synthetic_data_kit/cli.py b/synthetic_data_kit/cli.py
index e88dbb59..1aa3a36e 100644
--- a/synthetic_data_kit/cli.py
+++ b/synthetic_data_kit/cli.py
@@ -748,6 +748,166 @@ def save_as(
         return 1
 
 
+@app.command()
+def translate(
+    input: str = typer.Argument(..., help="File or directory to translate"),
+    source_lang: str = typer.Option(
+        "Rust", "--source-lang", "-s", help="Source language (e.g., Rust, Python)"
+    ),
+    target_lang: str = typer.Option(
+        "R", "--target-lang", "-t", help="Target language (e.g., R, Python)"
+    ),
+    output_dir: Optional[Path] = typer.Option(
+        None, "--output-dir", "-o", help="Where to save the translated output"
+    ),
+    api_base: Optional[str] = typer.Option(
+        None, "--api-base", help="VLLM API base URL"
+    ),
+    model: Optional[str] = typer.Option(
+        None, "--model", "-m", help="Model to use"
+    ),
+    verbose: bool = typer.Option(
+        False, "--verbose", "-v", help="Show detailed output"
+    ),
+    preview: bool = typer.Option(
+        False, "--preview", help="Preview files to be processed without actually processing them"
+    ),
+):
+    """
+    Translate code/prompts from one language to another using LLM.
+
+    Can process:
+    - Single file: synthetic-data-kit translate problems.json --source-lang Python --target-lang Rust
+    - Directory: synthetic-data-kit translate ./json_files/ --source-lang Python --target-lang Rust
+
+    The default translation is from Rust to R, using the specialized template
+    that avoids complex type annotations and advanced metaprogramming.
+
+    Only JSON files are supported. The file should contain an array of JSON objects
+    (or a single object), each with a string "prompt" field; the translation is
+    saved alongside it as "prompt_translated".
+
+    Directory preview (--preview) only scans the file system, so it is handled
+    before the vLLM server check and works without a running server.
+    """
+    from synthetic_data_kit.core.translate import process_file
+    from synthetic_data_kit.utils.directory_processor import (
+        TRANSLATE_EXTENSIONS,
+        get_directory_stats,
+        is_directory,
+        process_directory_translate,
+    )
+
+    # Resolve provider, endpoint and model; CLI flags take precedence over config
+    provider = get_llm_provider(ctx.config)
+    console.print(f"🔗 Using {provider} provider", style="green")
+
+    if provider == "api-endpoint":
+        llm_config = get_openai_config(ctx.config)
+    else:
+        llm_config = get_vllm_config(ctx.config)
+    api_base = api_base or llm_config.get("api_base")
+    model = model or llm_config.get("model")
+
+    # Get output directory from args, then config
+    if output_dir is None:
+        output_dir = get_path_config(ctx.config, "output", "translated")
+
+    try:
+        input_is_directory = is_directory(input)
+
+        # Preview mode - show files without processing. It never calls the LLM,
+        # so it runs before the server availability check below.
+        if input_is_directory and preview:
+            console.print(f"Preview: scanning directory [bold]{input}[/bold] for translation", style="blue")
+            stats = get_directory_stats(input, TRANSLATE_EXTENSIONS)
+
+            if "error" in stats:
+                console.print(f"❌ {stats['error']}", style="red")
+                return 1
+
+            console.print(f"\n📁 Directory: {input}")
+            console.print(f"📄 Total files: {stats['total_files']}")
+            console.print(f"✅ Supported files: {stats['supported_files']}")
+            console.print(f"❌ Unsupported files: {stats['unsupported_files']}")
+
+            if stats['supported_files'] > 0:
+                console.print(f"\n📋 Files that would be translated ({source_lang} -> {target_lang}):")
+                for ext, count in stats['by_extension'].items():
+                    console.print(f" {ext}: {count} file(s)")
+
+                console.print("\n📝 File list:")
+                for filename in stats['file_list']:
+                    console.print(f" • {filename}")
+
+                console.print("\n💡 To process these files, run:")
+                console.print(f" synthetic-data-kit translate {input} --source-lang {source_lang} --target-lang {target_lang} --output-dir {output_dir}", style="bold blue")
+            else:
+                console.print("\n⚠️ No supported files found for translation.", style="yellow")
+                console.print(f" Looking for: {', '.join(TRANSLATE_EXTENSIONS)}", style="yellow")
+
+            return 0
+
+        # Only the vLLM provider needs a reachable server; api-endpoint is not checked
+        if provider != "api-endpoint":
+            try:
+                response = requests.get(f"{api_base}/models", timeout=2)
+                server_available = response.status_code == 200
+            except requests.exceptions.RequestException:
+                server_available = False
+
+            if not server_available:
+                console.print(f"❌ Error: VLLM server not available at {api_base}", style="red")
+                console.print("Please start the VLLM server with:", style="yellow")
+                console.print(f"vllm serve {model}", style="bold blue")
+                return 1
+
+        if input_is_directory:
+            console.print(f"Processing directory: [bold]{input}[/bold] for translation ({source_lang} -> {target_lang})", style="blue")
+            results = process_directory_translate(
+                directory=input,
+                output_dir=output_dir,
+                config_path=ctx.config_path,
+                api_base=api_base,
+                model=model,
+                source_lang=source_lang,
+                target_lang=target_lang,
+                verbose=verbose,
+                provider=provider
+            )
+
+            # Return appropriate exit code
+            if results["failed"] > 0:
+                console.print(f"⚠️ Completed with {results['failed']} errors", style="yellow")
+                return 1
+
+            console.print("✅ All files translated successfully!", style="green")
+            return 0
+
+        # Process single file
+        if preview:
+            console.print("Preview mode is only available for directories. Processing single file...", style="yellow")
+
+        with console.status(f"Translating {input} from {source_lang} to {target_lang}..."):
+            output_path = process_file(
+                input,
+                output_dir,
+                source_lang,
+                target_lang,
+                ctx.config_path,
+                api_base,
+                model,
+                provider,
+                verbose
+            )
+        console.print(f"✅ Translation saved to [bold]{output_path}[/bold]", style="green")
+        return 0
+
+    except Exception as e:
+        console.print(f"❌ Error: {e}", style="red")
+        return 1
+
+
 @app.command("server")
 def server(
     host: str = typer.Option(
diff --git a/synthetic_data_kit/config.yaml b/synthetic_data_kit/config.yaml
index 2690697d..6d28057a 100644
--- a/synthetic_data_kit/config.yaml
+++ b/synthetic_data_kit/config.yaml
@@ -11,6 +11,7 @@ paths:
     generated: "data/generated" # Stage 2: Where generated QA pairs are saved (create output)
     curated: "data/curated" # Stage 3: Where curated QA pairs are saved (curate output)
     final: "data/final" # Stage 4: Where final training formats are saved (save-as output)
+    translated: "data/translated" # Where translated files are saved (translate output)
 
 # LLM Provider configuration
 llm:
@@ -186,3 +187,27 @@ prompts:
 
     Original conversations:
     {conversations}
+
+  # Translation prompt
+  translation: |
+    The following prompt may have {source_lang} syntax, variable types, and {source_lang}-specific behavior mentioned. Translate the question to the appropriate {target_lang} syntax and replace anything mentioning {source_lang} with {target_lang}.
+
+    Preserve the meaning and intent of the original problem statement while adapting:
+    - Function signatures and type annotations to {target_lang} conventions
+    - Variable types (e.g., {source_lang} types to {target_lang} types)
+    - Language-specific syntax and idioms
+    - Any mentions of {source_lang} should be replaced with {target_lang}
+
+    Guidelines:
+    - Use simple, standard {target_lang} types and features
+    - Avoid advanced {target_lang} features unless necessary
+    - Maintain the logical structure and problem requirements
+    - Keep all mathematical formulas and logic unchanged
+
+    Only return the translated question/prompt and do not answer the question or provide code solutions.
+
+    {source_lang} Prompt:
+
+    {prompt}
+
+    {target_lang} Prompt:
diff --git a/synthetic_data_kit/core/translate.py b/synthetic_data_kit/core/translate.py
new file mode 100644
index 00000000..20ad0271
--- /dev/null
+++ b/synthetic_data_kit/core/translate.py
@@ -0,0 +1,224 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+# Translation functionality for code/prompts
+
+import os
+import json
+from pathlib import Path
+from typing import Optional, Dict, Any
+
+from synthetic_data_kit.models.llm_client import LLMClient
+from synthetic_data_kit.utils.config import get_prompt
+
+
+def read_json_file(file_path: str) -> list:
+    """Read JSON file and return list of JSON objects"""
+    with open(file_path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+
+    # Handle both array of objects and single object
+    if isinstance(data, list):
+        return data
+    elif isinstance(data, dict):
+        return [data]
+    else:
+        raise ValueError(f"JSON file {file_path} must contain an array of objects or a single object")
+
+
+def write_json_file(file_path: str, items: list):
+    """Write list of JSON objects to JSON file"""
+    with open(file_path, 'w', encoding='utf-8') as f:
+        json.dump(items, f, ensure_ascii=False, indent=2)
+
+
+def get_translation_prompt(source_lang: str, target_lang: str, prompt: str, config: Optional[Dict[str, Any]] = None) -> str:
+    """Generate a translation prompt based on source and target languages
+
+    Args:
+        source_lang: Source language (e.g., "Rust", "Python")
+        target_lang: Target language (e.g., "R", "Python")
+        prompt: The code/prompt to translate
+        config: Configuration dictionary (if None, will load default config)
+
+    Returns:
+        Formatted translation prompt
+    """
+    # Load config if not provided
+    if config is None:
+        from synthetic_data_kit.utils.config import load_config
+        config = load_config()
+
+    # Get translation prompt template from config
+    try:
+        template = get_prompt(config, "translation")
+    except ValueError:
+        # Fallback to generic template if not found in config
+        template = """Translate the following {source_lang} code/prompt to {target_lang}.
+
+Preserve the meaning and intent of the original code/prompt while adapting it to {target_lang} syntax and conventions.
+
+{source_lang} Code/Prompt:
+
+{prompt}
+
+{target_lang} Code/Prompt:"""
+
+    return template.format(
+        source_lang=source_lang,
+        target_lang=target_lang,
+        prompt=prompt
+    )
+
+
+def translate_content(
+    content: str,
+    source_lang: str,
+    target_lang: str,
+    client: Optional[LLMClient] = None,
+    config_path: Optional[Path] = None,
+    api_base: Optional[str] = None,
+    model: Optional[str] = None,
+    provider: Optional[str] = None,
+    verbose: bool = False
+) -> str:
+    """Translate content from source language to target language
+
+    Args:
+        content: The content to translate
+        source_lang: Source language name
+        target_lang: Target language name
+        client: LLM client instance (if None, will create one)
+        config_path: Path to configuration file
+        api_base: API base URL
+        model: Model name
+        provider: LLM provider ('vllm' or 'api-endpoint')
+        verbose: Whether to print verbose output
+
+    Returns:
+        Translated content
+    """
+    # Create LLM client if not provided
+    if client is None:
+        client = LLMClient(
+            config_path=config_path,
+            provider=provider,
+            api_base=api_base,
+            model_name=model
+        )
+
+    # Generate translation prompt using config
+    translation_prompt = get_translation_prompt(source_lang, target_lang, content, client.config)
+
+    # Create messages for LLM
+    messages = [
+        {"role": "user", "content": translation_prompt}
+    ]
+
+    if verbose:
+        print(f"Translating from {source_lang} to {target_lang}...")
+        print(f"Content length: {len(content)} characters")
+
+    # Get translation from LLM
+    translated = client.chat_completion(messages)
+
+    if verbose:
+        print(f"Translation completed. Output length: {len(translated)} characters")
+
+    return translated.strip()
+
+
+def process_file(
+    file_path: str,
+    output_dir: str,
+    source_lang: str,
+    target_lang: str,
+    config_path: Optional[Path] = None,
+    api_base: Optional[str] = None,
+    model: Optional[str] = None,
+    provider: Optional[str] = None,
+    verbose: bool = False,
+) -> str:
+    """Process a file to translate its content
+
+    Args:
+        file_path: Path to the JSON file to translate
+        output_dir: Directory to save translated content
+        source_lang: Source language name
+        target_lang: Target language name
+        config_path: Path to configuration file
+        api_base: API base URL
+        model: Model name
+        provider: LLM provider ('vllm' or 'api-endpoint')
+        verbose: Whether to print verbose output
+
+    Returns:
+        Path to the output file
+
+    Raises:
+        ValueError: If the file is not JSON or an item lacks a string 'prompt' field
+    """
+    # Only support JSON files (extension check is case-insensitive)
+    if not str(file_path).lower().endswith('.json'):
+        raise ValueError(f"Only JSON files are supported. Got: {file_path}")
+
+    # Read JSON file
+    items = read_json_file(file_path)
+
+    if verbose:
+        print(f"Found {len(items)} items in JSON file")
+
+    # Validate every item up front so a malformed entry fails fast,
+    # before any LLM call is made or any output is created
+    for i, item in enumerate(items):
+        if not isinstance(item, dict):
+            raise ValueError(f"JSON item {i+1} must be an object. Got: {type(item)}")
+        # Only look for 'prompt' field
+        if 'prompt' not in item:
+            raise ValueError(f"Could not find 'prompt' field in JSON item {i+1}. Available fields: {list(item.keys())}")
+        if not isinstance(item['prompt'], str):
+            raise ValueError(f"'prompt' field in JSON item {i+1} must be a string. Got: {type(item['prompt'])}")
+
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Initialize LLM client once and reuse it for every item
+    client = LLMClient(
+        config_path=config_path,
+        provider=provider,
+        api_base=api_base,
+        model_name=model
+    )
+
+    # Translate each item
+    translated_items = []
+    for i, item in enumerate(items):
+        if verbose:
+            print(f"Translating item {i+1}/{len(items)}...")
+
+        translated_prompt = translate_content(
+            content=item['prompt'],
+            source_lang=source_lang,
+            target_lang=target_lang,
+            client=client,
+            verbose=verbose
+        )
+
+        # Keep original prompt and add translated prompt
+        translated_item = item.copy()
+        translated_item['prompt_translated'] = translated_prompt
+        translated_items.append(translated_item)
+
+    # Write translated JSON file
+    base_name = os.path.splitext(os.path.basename(file_path))[0]
+    output_filename = f"{base_name}_translated_{target_lang}.json"
+    output_path = os.path.join(output_dir, output_filename)
+    write_json_file(output_path, translated_items)
+
+    if verbose:
+        print(f"Translated {len(translated_items)} items saved to: {output_path}")
+
+    return output_path
+
diff --git a/synthetic_data_kit/utils/directory_processor.py b/synthetic_data_kit/utils/directory_processor.py
index e0a72534..071d6f23 100644
--- a/synthetic_data_kit/utils/directory_processor.py
+++ b/synthetic_data_kit/utils/directory_processor.py
@@ -18,6 +18,7 @@
 CREATE_EXTENSIONS = ['.txt', '.lance']
 CURATE_EXTENSIONS = ['.json']
 SAVE_AS_EXTENSIONS = ['.json']
+TRANSLATE_EXTENSIONS = ['.json']
 
 def is_directory(path: str) -> bool:
     """Check if path is a directory"""
@@ -621,4 +622,135 @@ def process_directory_save_as(
     console.print(f"Failed: {results['failed']}", style="red" if results['failed'] > 0 else "green")
     console.print("="*50, style="bold")
 
-    return results
\ No newline at end of file
+    return results
+
+def process_directory_translate(
+    directory: str,
+    output_dir: Optional[str] = None,
+    config_path: Optional[str] = None,
+    api_base: Optional[str] = None,
+    model: Optional[str] = None,
+    source_lang: str = "Rust",
+    target_lang: str = "R",
+    verbose: bool = False,
+    provider: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Process all supported files in directory for translation
+
+    Args:
+        directory: Directory containing files to translate
+        output_dir: Directory to save translated content (defaults to "data/translated")
+        config_path: Path to configuration file
+        api_base: API base URL
+        model: Model to use
+        source_lang: Source language name
+        target_lang: Target language name
+        verbose: Show detailed progress
+        provider: LLM provider to use
+
+    Returns:
+        Dictionary with processing results
+    """
+    from synthetic_data_kit.core.translate import process_file
+
+    # process_file calls os.makedirs(output_dir), which rejects None, so use the default location
+    if output_dir is None:
+        output_dir = os.path.join("data", "translated")
+
+    # Get all supported files
+    supported_files = get_supported_files(directory, TRANSLATE_EXTENSIONS)
+
+    if not supported_files:
+        console.print(f"No supported files found in {directory}", style="yellow")
+        console.print(f"Looking for files with extensions: {', '.join(TRANSLATE_EXTENSIONS)}", style="yellow")
+        return {
+            "total_files": 0,
+            "successful": 0,
+            "failed": 0,
+            "results": [],
+            "errors": []
+        }
+
+    console.print(f"Found {len(supported_files)} files to translate from {source_lang} to {target_lang}", style="blue")
+
+    # Initialize results tracking
+    results = {
+        "total_files": len(supported_files),
+        "successful": 0,
+        "failed": 0,
+        "results": [],
+        "errors": []
+    }
+
+    # Process files with progress bar
+    with Progress(
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+        TextColumn("({task.completed}/{task.total})"),
+        TimeElapsedColumn(),
+        console=console,
+        disable=not verbose
+    ) as progress:
+
+        task = progress.add_task(f"Translating {source_lang} -> {target_lang}", total=len(supported_files))
+
+        for file_path in supported_files:
+            filename = os.path.basename(file_path)
+
+            try:
+                # Process individual file
+                output_path = process_file(
+                    file_path,
+                    output_dir,
+                    source_lang,
+                    target_lang,
+                    config_path,
+                    api_base,
+                    model,
+                    provider,
+                    verbose
+                )
+
+                # Record success
+                results["successful"] += 1
+                results["results"].append({
+                    "input_file": file_path,
+                    "output_file": output_path,
+                    "source_lang": source_lang,
+                    "target_lang": target_lang,
+                    "status": "success"
+                })
+
+                if verbose:
+                    console.print(f"✓ Translated {filename} -> {os.path.basename(output_path)}", style="green")
+                else:
+                    console.print(f"✓ {filename}", style="green")
+
+            except Exception as e:
+                # Record failure
+                results["failed"] += 1
+                results["errors"].append({
+                    "input_file": file_path,
+                    "error": str(e),
+                    "source_lang": source_lang,
+                    "target_lang": target_lang,
+                    "status": "failed"
+                })
+
+                if verbose:
+                    console.print(f"✗ Failed to translate {filename}: {e}", style="red")
+                else:
+                    console.print(f"✗ {filename}: {e}", style="red")
+
+            progress.update(task, advance=1)
+
+    # Show summary
+    console.print("\n" + "="*50, style="bold")
+    console.print(f"Translation Summary ({source_lang} -> {target_lang}):", style="bold blue")
+    console.print(f"Total files: {results['total_files']}")
+    console.print(f"Successful: {results['successful']}", style="green")
+    console.print(f"Failed: {results['failed']}", style="red" if results['failed'] > 0 else "green")
+    console.print("="*50, style="bold")
+
+    return results