hdbscan_ohlcv/main.py at main · dtau00/hdbscan_ohlcv · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
#!/usr/bin/env python3
"""Main Orchestration Script for HDBSCAN OHLCV Pattern Discovery

This script coordinates the entire hyperparameter tuning process:
1. Setup: logging, backend detection, output directory
2. Load OHLCV data
3. Generate all valid configurations
4. Main loop: for each config
   - Create windows
   - Extract features
   - Normalize features (fit scaler per window_size)
   - Run HDBSCAN
   - Compute metrics
   - Save results
   - Log progress
5. Summary: load metrics, print stats, save report
"""

import logging
import sys
from pathlib import Path
from typing import Dict, Any
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from tqdm import tqdm

# Import project modules
from src.config import Config
from src.gpu_utils import detect_compute_backend
from src.data_loader import OHLCVDataLoader
from src.feature_engineering import FeatureExtractor
from src.clustering import HDBSCANClusterer
from src.metrics import ClusterMetrics
from src.storage import ResultsStorage


def setup_logging() -> logging.Logger:
    """
    Configure logging for the application.

    Returns:
        Logger instance for main module
    """
    # Ensure directories exist
    Config.ensure_dirs()

    # Get log file path with timestamp
    log_file = Config.get_log_file_path("hdbscan_ohlcv_main")

    # Configure logging
    logging.basicConfig(
        level=getattr(logging, Config.LOG_LEVEL),
        format=Config.LOG_FORMAT,
        datefmt=Config.LOG_DATE_FORMAT,
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler(sys.stdout)
        ]
    )

    logger = logging.getLogger(__name__)
    logger.info("="*80)
    logger.info("HDBSCAN OHLCV Pattern Discovery - Main Pipeline")
    logger.info("="*80)
    logger.info(f"Log file: {log_file}")

    return logger


def load_or_generate_data(logger: logging.Logger, n_bars: int = 1000) -> pd.DataFrame:
    """
    Load OHLCV data from file or generate synthetic data for testing.

    Args:
        logger: Logger instance
        n_bars: Number of bars to generate if no data file found

    Returns:
        DataFrame with OHLCV data
    """
    logger.info("Loading OHLCV data...")

    # Check for data files in data directory
    data_files = list(Config.DATA_DIR.glob("*.csv"))

    if data_files:
        # Load the first CSV file found
        data_file = data_files[0]
        logger.info(f"Loading data from {data_file}")
        df = pd.read_csv(data_file)

        # Validate required columns
        required_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
        missing_cols = [col for col in required_cols if col not in df.columns]

        if missing_cols:
            logger.warning(f"Missing columns in data file: {missing_cols}")
            logger.info("Generating synthetic data instead...")
            df = generate_synthetic_ohlcv(n_bars)
        else:
            logger.info(f"Loaded {len(df)} bars from {data_file}")
    else:
        logger.info(f"No data files found in {Config.DATA_DIR}")
        logger.info(f"Generating synthetic OHLCV data with {n_bars} bars...")
        df = generate_synthetic_ohlcv(n_bars)

    return df


def generate_synthetic_ohlcv(n_bars: int = 1000, seed: int = 42) -> pd.DataFrame:
    """
    Generate synthetic OHLCV data for testing.

    Creates realistic-looking price data with proper OHLC relationships.

    Args:
        n_bars: Number of bars to generate
        seed: Random seed for reproducibility

    Returns:
        DataFrame with OHLCV columns
    """
    np.random.seed(seed)

    # Generate base price as random walk
    returns = np.random.randn(n_bars) * 0.02  # 2% volatility
    price = 100 * np.exp(np.cumsum(returns))

    # Generate OHLC values
    close = price
    open_ = close + np.random.randn(n_bars) * 0.5

    # High and Low should respect Open and Close
    oc_max = np.maximum(open_, close)
    oc_min = np.minimum(open_, close)

    high = oc_max + np.abs(np.random.randn(n_bars)) * 0.5
    low = oc_min - np.abs(np.random.randn(n_bars)) * 0.5

    # Volume (random but realistic)
    volume = np.random.exponential(scale=1000000, size=n_bars)

    # Create DataFrame
    df = pd.DataFrame({
        'Open': open_,
        'High': high,
        'Low': low,
        'Close': close,
        'Volume': volume
    })

    return df


def process_single_config(
    config: Dict[str, Any],
    df_ohlcv: pd.DataFrame,
    backend_type: str,
    backend_module: Any,
    storage: ResultsStorage,
    logger: logging.Logger,
    scalers_cache: Dict[int, StandardScaler],
    feature_type: str = 'normalized'
) -> Dict[str, Any]:
    """
    Process a single configuration: window creation, feature extraction,
    clustering, metrics computation, and result storage.

    Args:
        config: Configuration dictionary
        df_ohlcv: OHLCV DataFrame
        backend_type: Compute backend type ('gpu' or 'cpu')
        backend_module: Backend module reference
        storage: ResultsStorage instance
        logger: Logger instance
        scalers_cache: Cache of fitted scalers per window_size

    Returns:
        Dictionary with run results (run_id, metrics, etc.)
    """
    window_size = config['window_size']
    stride = config.get('stride', 1)  # Default to 1 if not specified
    config_id = Config.get_config_id(config)

    logger.info(f"Processing config: {config_id}")
    logger.info(f"  Parameters: window_size={window_size}, stride={stride}, "
                f"min_cluster_size={config['min_cluster_size']}, "
                f"min_samples={config['min_samples']}, "
                f"metric={config['metric']}, "
                f"method={config['cluster_selection_method']}")

    try:
        # Step 1: Create windows
        logger.debug("Creating windows...")
        data_loader = OHLCVDataLoader(df_ohlcv)
        windows = data_loader.create_windows(window_size, stride=stride)
        logger.info(f"  Created {len(windows)} windows of size {window_size} (stride={stride})")

        # Step 2: Extract features
        logger.debug("Extracting features...")
        feature_extractor = FeatureExtractor(feature_type=feature_type, flatten_order='sequential')
        features = feature_extractor.extract_features(windows)
        logger.info(f"  Extracted features: shape={features.shape} (using {feature_type} features)")

        # Step 3: Normalize features (only needed for 'flatten' feature type)
        # Normalized and returns features are already in comparable scales
        if feature_type == 'flatten':
            # Use cached scaler if available for this window_size, otherwise fit new one
            if window_size not in scalers_cache:
                logger.debug(f"Fitting new StandardScaler for window_size={window_size}")
                scaler = StandardScaler()
                features_normalized = scaler.fit_transform(features)
                scalers_cache[window_size] = scaler
            else:
                logger.debug(f"Using cached StandardScaler for window_size={window_size}")
                scaler = scalers_cache[window_size]
                features_normalized = scaler.transform(features)
            logger.info(f"  Standardized features: mean={features_normalized.mean():.4f}, "
                       f"std={features_normalized.std():.4f}")
        else:
            # Skip StandardScaler for normalized/returns - they're already scaled
            features_normalized = features
            logger.info(f"  Features already scaled (type={feature_type}), skipping StandardScaler")

        # Step 4: Run HDBSCAN clustering
        logger.debug("Running HDBSCAN clustering...")
        clusterer_wrapper = HDBSCANClusterer(backend_type, backend_module)
        labels, clusterer = clusterer_wrapper.fit_predict(
            features_normalized,
            min_cluster_size=config['min_cluster_size'],
            min_samples=config['min_samples'],
            metric=config['metric'],
            cluster_selection_method=config['cluster_selection_method']
        )

        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = np.sum(labels == -1)
        logger.info(f"  Clustering complete: {n_clusters} clusters, {n_noise} noise points")

        # Step 5: Compute metrics
        logger.debug("Computing metrics...")
        metrics = ClusterMetrics.compute_metrics(labels, features_normalized)
        logger.info(f"  Metrics: n_clusters={metrics['n_clusters']}, "
                   f"noise_ratio={metrics['noise_ratio']:.3f}")

        if metrics.get('silhouette_score') is not None:
            logger.info(f"  Quality: silhouette={metrics['silhouette_score']:.4f}, "
                       f"davies_bouldin={metrics.get('davies_bouldin_score', 0):.4f}")

        # Step 6: Save results
        logger.debug("Saving results...")
        run_id = storage.save_run_results(
            config=config,
            labels=labels,
            metrics=metrics,
            clusterer=clusterer,
            features=None,  # Don't save features by default to save space
            save_clusterer=True,
            save_features=False
        )

        logger.info(f"  Results saved with run_id={run_id}")

        return {
            'run_id': run_id,
            'config': config,
            'config_id': config_id,
            'metrics': metrics,
            'success': True,
            'error': None
        }

    except Exception as e:
        logger.error(f"  Error processing config {config_id}: {e}", exc_info=True)
        return {
            'run_id': None,
            'config': config,
            'config_id': config_id,
            'metrics': None,
            'success': False,
            'error': str(e)
        }


def print_summary_report(storage: ResultsStorage, logger: logging.Logger) -> None:
    """
    Print and save a summary report of all results.

    Args:
        storage: ResultsStorage instance
        logger: Logger instance
    """
    logger.info("\n" + "="*80)
    logger.info("GENERATING SUMMARY REPORT")
    logger.info("="*80)

    try:
        df = storage.load_metrics_dataframe()

        # Print summary statistics
        logger.info(f"\nTotal runs completed: {len(df)}")
        logger.info(f"\nConfiguration coverage:")
        logger.info(f"  Window sizes: {sorted(df['window_size'].unique())}")
        logger.info(f"  Min cluster sizes: {sorted(df['min_cluster_size'].unique())}")
        logger.info(f"  Min samples: {sorted(df['min_samples'].unique())}")
        logger.info(f"  Metrics: {sorted(df['metric'].unique())}")
        logger.info(f"  Methods: {sorted(df['cluster_selection_method'].unique())}")

        logger.info(f"\nClustering statistics:")
        logger.info(f"  Total clusters found: {df['n_clusters'].sum()}")
        logger.info(f"  Average clusters per run: {df['n_clusters'].mean():.2f} ± {df['n_clusters'].std():.2f}")
        logger.info(f"  Min clusters: {df['n_clusters'].min()}")
        logger.info(f"  Max clusters: {df['n_clusters'].max()}")
        logger.info(f"  Average noise ratio: {df['noise_ratio'].mean()*100:.2f}% ± {df['noise_ratio'].std()*100:.2f}%")

        # Quality metrics (if available)
        if 'silhouette_score' in df.columns and df['silhouette_score'].notna().any():
            logger.info(f"\nQuality metrics (average ± std):")
            logger.info(f"  Silhouette score: {df['silhouette_score'].mean():.4f} ± {df['silhouette_score'].std():.4f}")
            logger.info(f"  Davies-Bouldin score: {df['davies_bouldin_score'].mean():.4f} ± {df['davies_bouldin_score'].std():.4f}")
            logger.info(f"  Calinski-Harabasz score: {df['calinski_harabasz_score'].mean():.1f} ± {df['calinski_harabasz_score'].std():.1f}")

            # Best runs
            logger.info(f"\nTop 3 runs by Silhouette Score:")
            top_runs = df.nlargest(3, 'silhouette_score')[
                ['run_id', 'config_id', 'n_clusters', 'silhouette_score', 'noise_ratio']
            ]
            logger.info(f"\n{top_runs.to_string(index=False)}")

        # Save summary to file
        summary_file = Config.get_results_path('metrics', f'summary_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt')
        with open(summary_file, 'w') as f:
            f.write("HDBSCAN OHLCV Pattern Discovery - Summary Report\n")
            f.write("="*80 + "\n\n")
            f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write(f"Total runs: {len(df)}\n\n")
            f.write(df.describe().to_string())
            f.write("\n\n")
            f.write("All runs:\n")
            f.write(df.to_string())

        logger.info(f"\nSummary report saved to: {summary_file}")

    except FileNotFoundError:
        logger.warning("No metrics file found - no results to summarize")
    except Exception as e:
        logger.error(f"Error generating summary report: {e}", exc_info=True)


def main(use_parallel: bool = True, n_jobs: int = -1):
    """
    Main orchestration function.

    Coordinates the entire hyperparameter tuning pipeline.

    Args:
        use_parallel: Enable parallel execution (default: True)
        n_jobs: Number of parallel jobs (-1 = all cores, default: -1)
    """
    # Step 1: Setup
    logger = setup_logging()

    logger.info("\n" + "="*80)
    logger.info("STEP 1: SETUP")
    logger.info("="*80)

    # Detect compute backend
    logger.info("Detecting compute backend...")
    backend_type, backend_module = detect_compute_backend()
    logger.info(f"Using backend: {backend_type.upper()}")

    # Create output directories
    Config.ensure_dirs()
    logger.info(f"Output directory: {Config.RESULTS_DIR}")

    # Initialize results storage
    storage = ResultsStorage()
    logger.info(f"Next run ID will start at: {storage.run_counter}")

    # Step 2: Load data
    logger.info("\n" + "="*80)
    logger.info("STEP 2: LOAD DATA")
    logger.info("="*80)

    # Start with small dataset for testing (can be increased)
    df_ohlcv = load_or_generate_data(logger, n_bars=1000)
    logger.info(f"Loaded OHLCV data: {len(df_ohlcv)} bars")
    logger.info(f"Data columns: {list(df_ohlcv.columns)}")
    logger.info(f"Data shape: {df_ohlcv.shape}")

    # Step 3: Generate configurations
    logger.info("\n" + "="*80)
    logger.info("STEP 3: GENERATE CONFIGURATIONS")
    logger.info("="*80)

    configs = Config.generate_hdbscan_configs()
    logger.info(f"Generated {len(configs)} valid configurations")
    Config.print_config_summary(configs)

    # Step 4: Main loop - process each configuration
    logger.info("\n" + "="*80)
    logger.info("STEP 4: MAIN PROCESSING LOOP")
    logger.info("="*80)

    # Track results
    results = []
    successful = 0
    failed = 0

    if use_parallel:
        # Parallel execution
        from src.parallel_grid_search import parallel_grid_search, get_optimal_n_jobs

        n_jobs_actual = get_optimal_n_jobs(n_jobs)
        logger.info(f"Using parallel execution with {n_jobs_actual} workers")

        results = parallel_grid_search(
            configs=configs,
            df_ohlcv=df_ohlcv,
            backend_type=backend_type,
            backend_module=backend_module,
            storage=storage,
            process_func=process_single_config,
            n_jobs=n_jobs,
            verbose=10
        )

        # Count successes and failures
        successful = sum(1 for r in results if r['success'])
        failed = len(results) - successful
    else:
        # Sequential execution
        logger.info("Using sequential execution")

        # Cache for StandardScalers (one per window_size)
        scalers_cache = {}

        # Progress bar
        for i, config in enumerate(tqdm(configs, desc="Processing configs", unit="config"), 1):
            logger.info(f"\n{'─'*80}")
            logger.info(f"Processing configuration {i}/{len(configs)}")
            logger.info(f"{'─'*80}")

            result = process_single_config(
                config=config,
                df_ohlcv=df_ohlcv,
                backend_type=backend_type,
                backend_module=backend_module,
                storage=storage,
                logger=logger,
                scalers_cache=scalers_cache
            )

            results.append(result)

            if result['success']:
                successful += 1
            else:
                failed += 1

    # Step 5: Summary
    logger.info("\n" + "="*80)
    logger.info("STEP 5: SUMMARY")
    logger.info("="*80)

    logger.info(f"\nProcessing complete!")
    logger.info(f"  Successful runs: {successful}")
    logger.info(f"  Failed runs: {failed}")
    logger.info(f"  Total configurations: {len(configs)}")

    if failed > 0:
        logger.warning(f"\nFailed configurations:")
        for result in results:
            if not result['success']:
                logger.warning(f"  {result['config_id']}: {result['error']}")

    # Print summary report
    print_summary_report(storage, logger)

    # Final storage summary
    storage.print_summary()

    logger.info("\n" + "="*80)
    logger.info("PIPELINE COMPLETE")
    logger.info("="*80)
    logger.info(f"Results saved to: {Config.RESULTS_DIR}")
    logger.info(f"  Metrics CSV: {storage.metrics_file}")
    logger.info(f"  Labels: {Config.LABELS_DIR}")
    logger.info(f"  Models: {Config.MODELS_DIR}")
    logger.info(f"  Logs: {Config.LOGS_DIR}")

    return 0


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="HDBSCAN OHLCV Pattern Discovery - Hyperparameter Grid Search",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run with parallel execution using all cores (default)
  python main.py

  # Run with sequential execution
  python main.py --no-parallel

  # Run with 4 parallel workers
  python main.py --n-jobs 4

  # Run with half the available cores
  python main.py --n-jobs -2
        """
    )

    parser.add_argument(
        '--no-parallel',
        action='store_true',
        help='Disable parallel execution (run sequentially)'
    )

    parser.add_argument(
        '--n-jobs',
        type=int,
        default=-1,
        help='Number of parallel jobs. -1 = all cores (default), -N = all except N cores, N = use N cores'
    )

    args = parser.parse_args()

    try:
        use_parallel = not args.no_parallel
        exit_code = main(use_parallel=use_parallel, n_jobs=args.n_jobs)
        sys.exit(exit_code)
    except KeyboardInterrupt:
        print("\n\nInterrupted by user. Exiting...")
        sys.exit(1)
    except Exception as e:
        print(f"\n\nFatal error: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)