Examples and Tutorials

Practical examples demonstrating how to use bs-cpg for real-world tasks.

Example 1: Working with Sample Data

This example shows how to load and explore sample CpG data included with bs-cpg.

from bs_cpg.setup import read_sample_cpg, get_base_data_path
import pandas as pd

# Load sample data
df = read_sample_cpg()
print(f"Sample data shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())

# Check data path configuration
base_path = get_base_data_path()
print(f"\nData directory: {base_path}")

# Get basic statistics
print(f"\nChromosomes in sample: {df['chromosome'].unique()[:10]}")
print(f"Position range: {df['pos'].min()} - {df['pos'].max()}")

Example 2: Validating CpG Sites

Learn how to fetch reference sequences and validate that positions contain valid CpG dinucleotides.

from bs_cpg.liftover_ps import cpg_reads, cpg_percent

# Load sample data
df = read_sample_cpg(["chromosome", "pos"])

# Fetch 2-bp sequences at each position
print("Fetching reference sequences...")
reads = cpg_reads(
    chromosomes=df["chromosome"],
    positions=df["pos"],
    ref_genome="hg19",
    index_base=1
)

print(f"\nFetched {len(reads)} sequences")
print(f"Sample sequences: {reads.head()}")

# Calculate percentage of valid CpG sites
percent_cg = cpg_percent(reads)
print(f"\nPercentage of valid CpG sites: {percent_cg:.2f}%")

# Show distribution of sequences
seq_counts = reads.value_counts()
print(f"\nMost common sequences:")
print(seq_counts.head())

Example 3: Genomic Liftover (hg19 to hg38)

Convert genomic coordinates from one reference genome build to another.

from bs_cpg.liftover_ps import liftover_positions

# Load sample data
df = read_sample_cpg(["chromosome", "pos"])

# Select a subset for this example
subset_chroms = df["chromosome"].head(5).values
subset_pos = df["pos"].head(5).values

print(f"Input positions (hg19):")
for chrom, pos in zip(subset_chroms, subset_pos):
    print(f"  {chrom}:{pos}")

# Perform liftover
print("\nPerforming liftover to hg38...")
new_chroms, new_pos = liftover_positions(
    chromosomes=subset_chroms,
    positions=subset_pos,
    genome_from="hg19",
    genome_to="hg38",
    index_base_from=1,
    index_base_to=1
)

# Show results
print("\nLifted positions (hg38):")
for old_chr, old_pos, new_chr, new_p in zip(subset_chroms, subset_pos, new_chroms, new_pos):
    status = "✓ Mapped" if new_chr is not None else "✗ Unmapped"
    mapped_str = f"{new_chr}:{new_p}" if new_chr else "N/A"
    print(f"  {old_chr}:{old_pos} → {mapped_str} {status}")

Example 4: Detecting Genome Build and Index Base

When you have data from an unknown source, use automatic detection to find the correct genome and indexing system.

from bs_cpg.liftover_ps import guess_ref_and_index_base

# Load sample data
df = read_sample_cpg(["chromosome", "pos"])

# Use only a small subset for speed
sample_chroms = df["chromosome"].head(100)
sample_pos = df["pos"].head(100)

print("Detecting reference genome and index base...")
print("(Testing: hg19/hg38 × 0-based/1-based)")
print()

result = guess_ref_and_index_base(
    chromosomes=sample_chroms,
    positions=sample_pos,
    ref_genomes=["hg19", "hg38"],
    index_bases=[0, 1]
)

# Display results
print(f"Best guess:")
print(f"  Genome: {result['best_genome']}")
print(f"  Index base: {result['best_index_base']}")
print(f"  CpG match: {result['best_percent']:.2f}%")

print(f"\nAll results:")
results_df = pd.DataFrame(result['results'])
results_df = results_df[['ref_genome', 'index_base', 'percent_cg']].sort_values('percent_cg', ascending=False)
print(results_df.to_string(index=False))

Example 5: Downloading Reference Genomes

Download and prepare reference genomes and liftover chain files for offline use.

from bs_cpg.download_ref import get_ref_genome, get_liftover_chain
from pathlib import Path

# Download hg38 reference genome
print("Downloading hg38 reference genome...")
ref_path = get_ref_genome("hg38")
print(f"Reference saved to: {ref_path}")
print(f"File size: {Path(ref_path).stat().st_size / (1024**3):.2f} GB")

# Download liftover chain file
print("\nDownloading hg19->hg38 chain file...")
chain_path = get_liftover_chain("hg19", "hg38")
print(f"Chain file saved to: {chain_path}")
print(f"File size: {Path(chain_path).stat().st_size / (1024**2):.2f} MB")

Example 6: Complete Pipeline - From GEO to Liftover

A full example showing how to combine multiple bs-cpg functions in a realistic workflow.

# This is a complete example that combines multiple modules
# For demonstration, we'll use the sample data

from bs_cpg.setup import read_sample_cpg
from bs_cpg.liftover_ps import cpg_reads, cpg_percent, liftover_positions

# Step 1: Load data
print("Step 1: Loading CpG data...")
df = read_sample_cpg(["chromosome", "pos"])
print(f"  Loaded {len(df)} CpG sites")

# Step 2: Validate against hg19
print("\nStep 2: Validating against hg19...")
reads = cpg_reads(df["chromosome"], df["pos"], "hg19", index_base=1)
valid_percent = cpg_percent(reads)
print(f"  {valid_percent:.2f}% are valid CpG sites")

# Step 3: Liftover to hg38
print("\nStep 3: Lifting over to hg38...")
new_chroms, new_pos = liftover_positions(
    chromosomes=df["chromosome"],
    positions=df["pos"],
    genome_from="hg19",
    genome_to="hg38",
    index_base_from=1
)
mapped_count = sum(1 for c in new_chroms if c is not None)
print(f"  {mapped_count}/{len(df)} positions successfully lifted")

# Step 4: Create output dataframe
print("\nStep 4: Creating output dataframe...")
output_df = pd.DataFrame({
    "chrom_hg19": df["chromosome"],
    "pos_hg19": df["pos"],
    "chrom_hg38": new_chroms,
    "pos_hg38": new_pos,
})

print(f"\nPipeline complete!")
print(f"Output dataframe columns: {output_df.columns.tolist()}")

Tips and Best Practices

Performance Considerations

Caching: Reference genomes and chain files are cached locally. Subsequent runs use cached files, making them much faster.
Batch Processing: Process large datasets in chunks to manage memory usage.
Network Resilience: The package automatically retries failed downloads with exponential backoff.

Common Issues

Missing htslib: Ensure bgzip from htslib is installed and in your PATH for reference genome processing.
Large Files: Reference genomes can be >1GB. Ensure adequate disk space in your configured data directory.
Coordinate Systems: Always verify your input coordinates are in the expected base system (0-based or 1-based).

Working with Your Own Data

# Instead of using read_sample_cpg(), load your own data
import pandas as pd

# Load your data
df = pd.read_csv("my_cpg_data.csv")

# Use bs_cpg functions with your data
reads = cpg_reads(df["chromosome"], df["position"], "hg38", index_base=1)