from bs_cpg.setup import read_sample_cpg, get_base_data_path
import pandas as pd
# Load sample data
df = read_sample_cpg()
print(f"Sample data shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())Examples and Tutorials
Practical examples demonstrating how to use bs-cpg for real-world tasks.
Example 1: Working with Sample Data
This example shows how to load and explore sample CpG data included with bs-cpg.
# Check data path configuration
base_path = get_base_data_path()
print(f"\nData directory: {base_path}")
# Get basic statistics
print(f"\nChromosomes in sample: {df['chromosome'].unique()[:10]}")
print(f"Position range: {df['pos'].min()} - {df['pos'].max()}")Example 2: Validating CpG Sites
Learn how to fetch reference sequences and validate that positions contain valid CpG dinucleotides.
from bs_cpg.liftover_ps import cpg_reads, cpg_percent
# Load sample data
df = read_sample_cpg(["chromosome", "pos"])
# Fetch 2-bp sequences at each position
print("Fetching reference sequences...")
reads = cpg_reads(
chromosomes=df["chromosome"],
positions=df["pos"],
ref_genome="hg19",
index_base=1
)
print(f"\nFetched {len(reads)} sequences")
print(f"Sample sequences: {reads.head()}")# Calculate percentage of valid CpG sites
percent_cg = cpg_percent(reads)
print(f"\nPercentage of valid CpG sites: {percent_cg:.2f}%")
# Show distribution of sequences
seq_counts = reads.value_counts()
print(f"\nMost common sequences:")
print(seq_counts.head())Example 3: Genomic Liftover (hg19 to hg38)
Convert genomic coordinates from one reference genome build to another.
from bs_cpg.liftover_ps import liftover_positions
# Load sample data
df = read_sample_cpg(["chromosome", "pos"])
# Select a subset for this example
subset_chroms = df["chromosome"].head(5).values
subset_pos = df["pos"].head(5).values
print(f"Input positions (hg19):")
for chrom, pos in zip(subset_chroms, subset_pos):
print(f" {chrom}:{pos}")# Perform liftover
print("\nPerforming liftover to hg38...")
new_chroms, new_pos = liftover_positions(
chromosomes=subset_chroms,
positions=subset_pos,
genome_from="hg19",
genome_to="hg38",
index_base_from=1,
index_base_to=1
)
# Show results
print("\nLifted positions (hg38):")
for old_chr, old_pos, new_chr, new_p in zip(subset_chroms, subset_pos, new_chroms, new_pos):
status = "✓ Mapped" if new_chr is not None else "✗ Unmapped"
mapped_str = f"{new_chr}:{new_p}" if new_chr else "N/A"
print(f" {old_chr}:{old_pos} → {mapped_str} {status}")Example 4: Detecting Genome Build and Index Base
When you have data from an unknown source, use automatic detection to find the correct genome and indexing system.
from bs_cpg.liftover_ps import guess_ref_and_index_base
# Load sample data
df = read_sample_cpg(["chromosome", "pos"])
# Use only a small subset for speed
sample_chroms = df["chromosome"].head(100)
sample_pos = df["pos"].head(100)
print("Detecting reference genome and index base...")
print("(Testing: hg19/hg38 × 0-based/1-based)")
print()
result = guess_ref_and_index_base(
chromosomes=sample_chroms,
positions=sample_pos,
ref_genomes=["hg19", "hg38"],
index_bases=[0, 1]
)# Display results
print(f"Best guess:")
print(f" Genome: {result['best_genome']}")
print(f" Index base: {result['best_index_base']}")
print(f" CpG match: {result['best_percent']:.2f}%")
print(f"\nAll results:")
results_df = pd.DataFrame(result['results'])
results_df = results_df[['ref_genome', 'index_base', 'percent_cg']].sort_values('percent_cg', ascending=False)
print(results_df.to_string(index=False))Example 5: Downloading Reference Genomes
Download and prepare reference genomes and liftover chain files for offline use.
from bs_cpg.download_ref import get_ref_genome, get_liftover_chain
from pathlib import Path
# Download hg38 reference genome
print("Downloading hg38 reference genome...")
ref_path = get_ref_genome("hg38")
print(f"Reference saved to: {ref_path}")
print(f"File size: {Path(ref_path).stat().st_size / (1024**3):.2f} GB")# Download liftover chain file
print("\nDownloading hg19->hg38 chain file...")
chain_path = get_liftover_chain("hg19", "hg38")
print(f"Chain file saved to: {chain_path}")
print(f"File size: {Path(chain_path).stat().st_size / (1024**2):.2f} MB")Example 6: Complete Pipeline - From GEO to Liftover
A full example showing how to combine multiple bs-cpg functions in a realistic workflow.
# This is a complete example that combines multiple modules
# For demonstration, we'll use the sample data
from bs_cpg.setup import read_sample_cpg
from bs_cpg.liftover_ps import cpg_reads, cpg_percent, liftover_positions
# Step 1: Load data
print("Step 1: Loading CpG data...")
df = read_sample_cpg(["chromosome", "pos"])
print(f" Loaded {len(df)} CpG sites")
# Step 2: Validate against hg19
print("\nStep 2: Validating against hg19...")
reads = cpg_reads(df["chromosome"], df["pos"], "hg19", index_base=1)
valid_percent = cpg_percent(reads)
print(f" {valid_percent:.2f}% are valid CpG sites")
# Step 3: Liftover to hg38
print("\nStep 3: Lifting over to hg38...")
new_chroms, new_pos = liftover_positions(
chromosomes=df["chromosome"],
positions=df["pos"],
genome_from="hg19",
genome_to="hg38",
index_base_from=1
)
mapped_count = sum(1 for c in new_chroms if c is not None)
print(f" {mapped_count}/{len(df)} positions successfully lifted")
# Step 4: Create output dataframe
print("\nStep 4: Creating output dataframe...")
output_df = pd.DataFrame({
"chrom_hg19": df["chromosome"],
"pos_hg19": df["pos"],
"chrom_hg38": new_chroms,
"pos_hg38": new_pos,
})
print(f"\nPipeline complete!")
print(f"Output dataframe columns: {output_df.columns.tolist()}")Tips and Best Practices
Performance Considerations
- Caching: Reference genomes and chain files are cached locally. Subsequent runs use cached files, making them much faster.
- Batch Processing: Process large datasets in chunks to manage memory usage.
- Network Resilience: The package automatically retries failed downloads with exponential backoff.
Common Issues
- Missing htslib: Ensure
bgzipfrom htslib is installed and in your PATH for reference genome processing. - Large Files: Reference genomes can be >1GB. Ensure adequate disk space in your configured data directory.
- Coordinate Systems: Always verify your input coordinates are in the expected base system (0-based or 1-based).
Working with Your Own Data
# Instead of using read_sample_cpg(), load your own data
import pandas as pd
# Load your data
df = pd.read_csv("my_cpg_data.csv")
# Use bs_cpg functions with your data
reads = cpg_reads(df["chromosome"], df["position"], "hg38", index_base=1)