-
Notifications
You must be signed in to change notification settings - Fork 33
Expand file tree
/
Copy pathdata_prep.py
More file actions
91 lines (73 loc) · 2.59 KB
/
data_prep.py
File metadata and controls
91 lines (73 loc) · 2.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
'''GSM8K'''
import json
from datasets import load_dataset
# === Step 1: Fetch the GSM8K dataset ("main" configuration) ===
gsm8k = load_dataset("openai/gsm8k", "main")

# === Step 2: Export the training split, answers included, as JSON Lines ===
# Each example becomes one JSON object on its own line. The question text is
# copied into the "id" field, so it serves as the record identifier.
with open("gsm8k_train.jsonl", "w", encoding="utf-8") as train_out:
    for example in gsm8k["train"]:
        record = {
            "id": example["question"],
            "question": example["question"],
            "answer": example["answer"],
        }
        train_out.write(json.dumps(record) + "\n")

print(f"✅ Saved {len(gsm8k['train'])} training examples to gsm8k_train.jsonl")
# === Step 3: Export the test split as JSON Lines ===
# Same record layout as the training export: the question text is copied into
# the "id" field, followed by the question and its answer.
with open("gsm8k_test.jsonl", "w", encoding="utf-8") as test_out:
    for example in gsm8k["test"]:
        record = {
            "id": example["question"],
            "question": example["question"],
            "answer": example["answer"],
        }
        test_out.write(json.dumps(record) + "\n")

print(f"✅ Saved {len(gsm8k['test'])} test examples to gsm8k_test.jsonl")
'''MATH'''
from datasets import load_dataset
import json
from tqdm import tqdm
# Subject names to export; each one is loaded separately further down.
subjects = [
    "algebra",
    "counting_and_probability",
    "geometry",
    "intermediate_algebra",
    "number_theory",
    "prealgebra",
    "precalculus",
]

# Accumulators for the converted records, one list per split.
train_examples, test_examples = [], []
# Walk every subject, load it, and collect its rows into the shared lists.
for subject in subjects:
    print(f"🔵 Loading subject: {subject}")
    dataset = load_dataset("EleutherAI/hendrycks_math", subject)

    # Every record is tagged with the subject it was loaded from.
    # NOTE(review): the tag is stored under the key "level" even though its
    # value is the subject name — confirm what downstream readers expect
    # before renaming it.
    train_examples.extend(
        {
            "id": row["problem"],  # the problem text doubles as the ID
            "problem": row["problem"],
            "solution": row["solution"],
            "level": subject,
        }
        for row in tqdm(
            dataset["train"], desc=f"Processing train split ({subject})"
        )
    )
    test_examples.extend(
        {
            "id": row["problem"],
            "problem": row["problem"],
            "solution": row["solution"],
            "level": subject,
        }
        for row in tqdm(
            dataset["test"], desc=f"Processing test split ({subject})"
        )
    )
# === Write the collected training records as JSON Lines ===
with open("math_train.jsonl", "w", encoding="utf-8") as train_out:
    # One JSON object per line, in the order the records were collected.
    train_out.writelines(
        json.dumps(record) + "\n" for record in train_examples
    )

print(f"✅ Saved {len(train_examples)} training examples to math_train.jsonl")
# === Write the collected test records as JSON Lines ===
with open("math_test.jsonl", "w", encoding="utf-8") as test_out:
    # One JSON object per line, in the order the records were collected.
    test_out.writelines(
        json.dumps(record) + "\n" for record in test_examples
    )

print(f"✅ Saved {len(test_examples)} test examples to math_test.jsonl")