Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions configs/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ generation:
overlap: 200 # Overlap between chunks to maintain context
max_tokens: 4096 # Maximum tokens in LLM responses
num_pairs: 25 # Default number of QA pairs to generate (per document)
num_pairs_per_chunk: null # Alternative: number of QA pairs per chunk (null = use num_pairs instead)
num_cot_examples: 5 # Default number of Chain of Thought examples to generate
num_cot_enhance_examples: null # Maximum number of conversations to enhance (null = enhance all)
batch_size: 32 # Number of requests to batch together (for create)
Expand Down
15 changes: 11 additions & 4 deletions synthetic_data_kit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,10 @@ def create(
None, "--model", "-m", help="Model to use"
),
num_pairs: Optional[int] = typer.Option(
None, "--num-pairs", "-n", help="Target number of QA pairs or CoT examples to generate"
None, "--num-pairs", "-n", help="Target number of QA pairs or CoT examples to generate (total per document)"
),
num_pairs_per_chunk: Optional[int] = typer.Option(
None, "--num-pairs-per-chunk", help="Number of QA pairs to generate per chunk (scales with document size, takes precedence over --num-pairs)"
),
chunk_size: Optional[int] = typer.Option(
None, "--chunk-size", help="Size of text chunks for processing large documents (default: 4000)"
Expand All @@ -313,9 +316,11 @@ def create(
- Directory: synthetic-data-kit create ./processed-text/ --type qa

Content types:
- qa: Generate question-answer pairs from .txt files (use --num-pairs to specify how many)
- qa: Generate question-answer pairs from .txt files
Use --num-pairs for total pairs per document OR --num-pairs-per-chunk to scale with document size
- summary: Generate summaries from .txt files
- cot: Generate Chain of Thought reasoning examples from .txt files (use --num-pairs to specify how many)
- cot: Generate Chain of Thought reasoning examples from .txt files
Use --num-pairs for total examples OR --num-pairs-per-chunk to scale with document size
- multimodal-qa: Generate question-answer pairs from .lance files (use --num-pairs to specify how many)
- cot-enhance: Enhance existing conversations with Chain of Thought reasoning from .json files
(use --num-pairs to limit the number of conversations to enhance, default is to enhance all)
Expand Down Expand Up @@ -411,6 +416,7 @@ def create(
model=model,
content_type=content_type,
num_pairs=num_pairs,
num_pairs_per_chunk=num_pairs_per_chunk,
verbose=verbose,
provider=provider,
chunk_size=chunk_size,
Expand Down Expand Up @@ -441,7 +447,8 @@ def create(
verbose,
provider=provider,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap
chunk_overlap=chunk_overlap,
num_pairs_per_chunk=num_pairs_per_chunk
)
if output_path:
console.print(f"✅ Content saved to [bold]{output_path}[/bold]", style="green")
Expand Down
3 changes: 2 additions & 1 deletion synthetic_data_kit/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ generation:
max_tokens: 4096 # Maximum tokens in LLM responses

# Content generation targets
num_pairs: 25 # Default number of QA pairs to generate
num_pairs: 25 # Default number of QA pairs to generate (per document)
num_pairs_per_chunk: null # Alternative: number of QA pairs per chunk (null = use num_pairs instead)
num_cot_examples: 5 # Default number of Chain of Thought examples to generate
num_cot_enhance_examples: null # Maximum number of conversations to enhance (null = enhance all)

Expand Down
16 changes: 12 additions & 4 deletions synthetic_data_kit/core/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def process_file(
chunk_size: Optional[int] = None,
chunk_overlap: Optional[int] = None,
rolling_summary: Optional[bool] = False,
num_pairs_per_chunk: Optional[int] = None,
) -> str:
"""Process a file to generate content

Expand All @@ -48,8 +49,13 @@ def process_file(
api_base: VLLM API base URL
model: Model to use
content_type: Type of content to generate (qa, summary, cot)
num_pairs: Target number of QA pairs to generate
threshold: Quality threshold for filtering (1-10)
num_pairs: Target number of QA pairs to generate (total per document)
num_pairs_per_chunk: Number of QA pairs per chunk (takes precedence over num_pairs)
verbose: Show detailed output
provider: LLM provider to use
chunk_size: Size of text chunks
chunk_overlap: Overlap between chunks
rolling_summary: Use rolling summary for long documents

Returns:
Path to the output file
Expand Down Expand Up @@ -89,15 +95,17 @@ def process_file(
generator = QAGenerator(client, config_path)

# Get num_pairs from args or config
if num_pairs is None:
if num_pairs is None and num_pairs_per_chunk is None:
config = client.config
generation_config = get_generation_config(config)
num_pairs = generation_config.get("num_pairs", 25)
num_pairs_per_chunk = generation_config.get("num_pairs_per_chunk")

# Process document
result = generator.process_documents(
documents,
num_pairs=num_pairs,
num_pairs=num_pairs if num_pairs is not None else 25,
num_pairs_per_chunk=num_pairs_per_chunk,
verbose=verbose,
rolling_summary=rolling_summary
)
Expand Down
63 changes: 49 additions & 14 deletions synthetic_data_kit/generators/qa_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,19 @@ def generate_summary(self,
def generate_qa_pairs(self,
document_text: str,
summary: str,
num_pairs: int = 25) -> List[Dict[str, str]]:
"""Generate QA pairs from the document using batched processing"""
num_pairs: int = 25,
num_pairs_per_chunk: Optional[int] = None) -> List[Dict[str, str]]:
"""Generate QA pairs from the document using batched processing

Args:
document_text: The text to generate QA pairs from
summary: Summary of the document
num_pairs: Total number of QA pairs to generate (used if num_pairs_per_chunk is None)
num_pairs_per_chunk: Number of QA pairs to generate per chunk (takes precedence over num_pairs)

Returns:
List of QA pair dictionaries
"""
verbose = os.environ.get('SDK_VERBOSE', 'false').lower() == 'true'

# Get generation config
Expand All @@ -101,13 +112,25 @@ def generate_qa_pairs(self,
overlap=overlap
)

# Determine generation mode and calculate targets
if num_pairs_per_chunk is not None:
# Per-chunk mode: scale with document size
pairs_per_chunk = num_pairs_per_chunk
total_target = num_pairs_per_chunk * len(chunks)
mode = "per-chunk"
else:
# Total pairs mode: divide across chunks (original behavior)
pairs_per_chunk = max(1, round(num_pairs / len(chunks)))
total_target = num_pairs
mode = "total"

if verbose:
print(f"Generating QA pairs...")
print(f"Document split into {len(chunks)} chunks")
print(f"Mode: {mode} (pairs per chunk: {pairs_per_chunk}, target total: {total_target})")
print(f"Using batch size of {batch_size}")

all_qa_pairs = []
pairs_per_chunk = max(1, round(num_pairs / len(chunks)))

# Get QA generation prompt template
qa_prompt_template = get_prompt(self.config, "qa_generation")
Expand Down Expand Up @@ -151,9 +174,9 @@ def generate_qa_pairs(self,
# Process in batches
for batch_start in range(0, len(chunks), batch_size):
# Check if we've already generated enough pairs
if len(all_qa_pairs) >= num_pairs:
if len(all_qa_pairs) >= total_target:
if verbose:
print(f"Reached target of {num_pairs} pairs. Stopping processing.")
print(f"Reached target of {total_target} pairs. Stopping processing.")
break

batch_end = min(batch_start + batch_size, len(chunks))
Expand All @@ -180,33 +203,33 @@ def generate_qa_pairs(self,
# Process each response in the batch
for j, response in enumerate(batch_responses):
# Check if we've reached the target before processing more
if len(all_qa_pairs) >= num_pairs:
if len(all_qa_pairs) >= total_target:
if verbose:
print(f" Reached target of {num_pairs} pairs. Stopping batch processing.")
print(f" Reached target of {total_target} pairs. Stopping batch processing.")
break

chunk_index = batch_start + j
chunk_pairs = parse_qa_pairs(response)

# Only add pairs up to the target limit
remaining_pairs = num_pairs - len(all_qa_pairs)
remaining_pairs = total_target - len(all_qa_pairs)
if remaining_pairs > 0:
pairs_to_add = chunk_pairs[:remaining_pairs]
all_qa_pairs.extend(pairs_to_add)

if verbose:
print(f" Generated {len(pairs_to_add)} pairs from chunk {chunk_index+1} (total: {len(all_qa_pairs)}/{num_pairs})")
print(f" Generated {len(pairs_to_add)} pairs from chunk {chunk_index+1} (total: {len(all_qa_pairs)}/{total_target})")

# Break if we've reached the target
if len(all_qa_pairs) >= num_pairs:
if len(all_qa_pairs) >= total_target:
break

# Update progress bar if in verbose mode
if progress_ctx and generate_task:
progress_ctx.update(generate_task, advance=current_batch_size)

# Break outer loop if we've reached the target
if len(all_qa_pairs) >= num_pairs:
if len(all_qa_pairs) >= total_target:
break

except Exception as e:
Expand All @@ -227,7 +250,7 @@ def generate_qa_pairs(self,
print("Batch processing complete.")

# Always print summary information, even in non-verbose mode
print(f"Generated {len(all_qa_pairs)} QA pairs total (requested: {num_pairs})")
print(f"Generated {len(all_qa_pairs)} QA pairs total (target: {total_target}, mode: {mode})")
return all_qa_pairs

def rate_qa_pairs(self,
Expand Down Expand Up @@ -321,9 +344,21 @@ def rate_qa_pairs(self,
def process_documents(self,
documents: List[Dict[str, Any]],
num_pairs: int = 25,
num_pairs_per_chunk: Optional[int] = None,
verbose: bool = False,
rolling_summary: Optional[bool] = False) -> Dict[str, Any]:
"""Process a list of documents to generate QA pairs without rating"""
"""Process a list of documents to generate QA pairs without rating

Args:
documents: List of document dictionaries with 'text' field
num_pairs: Total number of QA pairs to generate (used if num_pairs_per_chunk is None)
num_pairs_per_chunk: Number of QA pairs per chunk (takes precedence over num_pairs)
verbose: Whether to show detailed output
rolling_summary: Whether to use rolling summary for long documents

Returns:
Dictionary with summary and qa_pairs
"""
# Set the verbose environment variable
if verbose:
os.environ['SDK_VERBOSE'] = 'true'
Expand All @@ -337,7 +372,7 @@ def process_documents(self,
summary = self.generate_summary(full_text, rolling_summary=rolling_summary)

# Generate QA pairs
qa_pairs = self.generate_qa_pairs(full_text, summary, num_pairs=num_pairs)
qa_pairs = self.generate_qa_pairs(full_text, summary, num_pairs=num_pairs, num_pairs_per_chunk=num_pairs_per_chunk)

all_qa_pairs.extend(qa_pairs)

Expand Down
9 changes: 7 additions & 2 deletions synthetic_data_kit/utils/directory_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ def process_directory_create(
model: Optional[str] = None,
content_type: str = "qa",
num_pairs: Optional[int] = None,
num_pairs_per_chunk: Optional[int] = None,
verbose: bool = False,
provider: Optional[str] = None,
chunk_size: Optional[int] = None,
Expand All @@ -233,9 +234,12 @@ def process_directory_create(
api_base: API base URL
model: Model to use
content_type: Type of content to generate (qa, summary, cot, cot-enhance)
num_pairs: Target number of QA pairs or examples
num_pairs: Target number of QA pairs or examples (total per document)
num_pairs_per_chunk: Number of QA pairs per chunk (takes precedence over num_pairs)
verbose: Show detailed progress
provider: LLM provider to use
chunk_size: Size of text chunks
chunk_overlap: Overlap between chunks

Returns:
Dictionary with processing results
Expand Down Expand Up @@ -310,7 +314,8 @@ def process_directory_create(
verbose,
provider=provider,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap
chunk_overlap=chunk_overlap,
num_pairs_per_chunk=num_pairs_per_chunk
)

# Record success
Expand Down
Loading