cuttag分析流程(部分存疑)

CASTWJ發表於2024-04-15

參考部落格

https://mp.weixin.qq.com/s/uwO9G_71h8kU3lTWsW_zPw
https://www.jianshu.com/p/1a23656a0713
https://zhuanlan.zhihu.com/p/520071927

資料質控

fastqc -o 0_fastqc/ -f fastq -t 10 --noextract ./0_rawdata/*_1.fq
fastqc -o 0_fastqc/ -f fastq -t 10 --noextract ./0_rawdata/*_2.fq

bowtie2建立索引

wget ftp://ftp.ensembl.org/pub/release-102/fasta/sus_scrofa/dna/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa.gz
gunzip Sus_scrofa.Sscrofa11.1.dna.toplevel.fa.gz && mv Sus_scrofa.Sscrofa11.1.dna.toplevel.fa pig.fa

bowtie2-build genome/pig.fa genome/pig

slurm提交

#!/bin/bash
#SBATCH --job-name=sbatch_cuttag
#SBATCH --cpus-per-task=40
#SBATCH -o 1_bowtie2.out
#SBATCH --nodelist=comput3

bowtie2比對

比對到豬基因組

cat sample.txt | while read id
do
echo "bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 -p 8 -x genome/pig -1 ./0_rawdata/${id}_1.fq -2 ./0_rawdata/${id}_2.fq -S ./1_bowtie2/${id}.sam &"
done > bowtie2.sh

SpikeIn處理

不確定在豬的cuttag中是否需要進行spikeIn,如果需要,可以參考以下程式碼

比對到大腸桿菌基因組

# 下載大腸桿菌基因組
wget ftp://ftp.ensemblgenomes.org/pub/bacteria/release-48/fasta/bacteria_0_collection/escherichia_coli_str_k_12_substr_mg1655/dna/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa.gz
gunzip Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa.gz && mv Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa ecoli.fa

# 建立索引
bowtie2-build genome/e.coli/ecoli.fa genome/e.coli/ecoli

# 比對
cat sample.txt | while read id
do
echo "bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 -p 8 -x genome/e.coli/ecoli -1 ./0_rawdata/${id}_1.fq -2 ./0_rawdata/${id}_2.fq -S ./1_bowtie2/ecoli/${id}.sam & > ${id}_bowtie2_spikeIn.txt"
done > bowtie2_ecoli.sh

獲取測序深度值

while read id
do
{
    seqDepthDouble=`samtools view -F 0x04 ./1_bowtie2/ecoli/${id}.sam | wc -l`
    seqDepth=$((seqDepthDouble/2))
    echo $seqDepth > ./1_bowtie2/ecoli/${id}_bowtie2_spikeIn.seqDepth
} &
done < sample.txt
wait

檔案格式的轉換,獲得bed檔案

while read i
do
{
## 轉換為 BAM 檔案格式,並且僅保留成功對映到參考序列上的讀取
samtools view -bS -F 0x04 ./1_bowtie2/pig/${i}.sam > ./1_mapped/${i}.bam

## 將 BAM 檔案轉換為 bedpe 檔案格式
bedtools bamtobed -i ./1_mapped/${i}.bam -bedpe > ./2_bed/${i}.bedpe
## 將 BAM 檔案轉換為 bed 檔案格式
bedtools bamtobed -i ./1_mapped/${i}.bam > ./2_bed/${i}.bed

## 保留那些在同一條染色體且片段長度小於 1000bp 的雙端 reads
awk '$1==$4 && $6-$2 < 1000 {print $0}' ./2_bed/${i}.bed > ./2_bed/${i}.clean.bed

## 僅提取片段相關的列 
cut -f 1,2,6 ./2_bed/${i}.clean.bed | sort -k1,1 -k2,2n -k3,3n  > ./3_fragments/${i}.fragments.bed
} &
done < sample.txt
wait

評估重複性

## 評估重複性
## 為了研究重複之間和不同條件下的可重複性,基因組被分成 500 bp bin,每個 bin reads 計數的 log2 轉換值的皮爾遜相關性在重複資料集之間計算。多個重複和 IgG 對照資料集顯示在層次化聚類關聯矩陣中。
while read i
do
{
binLen=500
awk -v w=$binLen '{print $1, int(($2 + $3)/(2*w))*w + w/2}' ./3_fragments/${i}.fragments.bed \
sort -k1,1V -k2,2n \
uniq -c \
awk -v OFS="\t" '{print $2, $3, $1}' \
sort -k1,1V -k2,2n  > ./4_bin/${i}.fragmentsCount.bin$binLen.bed
} &
done < sample.txt
wait

加入測序深度對bed檔案進行標準化處理

需要有大腸桿菌的測序深度值,如沒有不進行標準化處理

# 標準化
samtools faidx ./genome/pig/pig.fa ##後面的bedtools -g的內容透過samtools的faidx來進行構建獲得索引檔案

# 這裡的seqDepth是前面的資料比對處理當中獲得測序深度值,可以調取txt檔案,來進行後續的迴圈處理
while read i
do
{
# 讀取測序深度值
seqDepth=`cat ./1_bowtie2/ecoli/${i}_bowtie2_spikeIn.seqDepth`
# 根據測序深度值進行標準化處理
if [[ "$seqDepth" -gt "1" ]]; then
scale_factor=`echo "10000 / $seqDepth" | bc -l`
echo "Scaling factor for $histName is: $scale_factor!"
bedtools genomecov -bg -scale $scale_factor -i ./3_fragments/${i}.fragments.bed -g ./genome/pig/pig.fa.fai > ./5_normalized/${i}.fragments.normalized.bedgraph
fi  
} &
done < sample.txt
wait

call peak

# 用SEACR進行peak calling
seacr="seacr.sh"

cat sample.txt | while read i
do
bash $seacr ./5_normalized/${i}.fragments.normalized.bedgraph 0.01 non stringent ./6_peak/${i}_seacr_top0.01.peaks.bed
done

snakemake

shell.executable("/bin/bash")
configfile: "./config.yaml"

SAMPLES=['RD29C36_N501_701', 'RD29C36-N501-702', 'RD29C36-N501-702', 'RD29C36-N501-703', 'RD29C36-N501-703', 'RD29C36-N501-704', 'RD29C36-N501-704', 'RD29C36-N502-701', 'RD29C36-N502-701', 'RD29-N502-702', 'RD29-N502-702', 'RD29-N502-703', 'RD29-N502-703', 'RD29-N502-704', 'RD29-N502-704', 'M2-oocyte-501-704', 'M2-oocyte-501-704', 'M2-oocyte-501-706', 'M2-oocyte-501-706', 'no-oocyte-501-705', 'no-oocyte-501-705', 'RD29C36_N501_701']

rule all:
    input:
        "1_fastqc/multiqc_report.html",
        "genome/pig.complete",
        expand("6_bed/{sample}.clean.bed", sample=SAMPLES),
        expand("7_bigwig/{sample}.bw", sample=SAMPLES)

rule fastqc1:
    input:
        "0_rawdata/{sample}_1.fq.gz",
        "0_rawdata/{sample}_2.fq.gz"
    output:
        "1_fastqc/{sample}_1_fastqc.html",
        "1_fastqc/{sample}_2_fastqc.html"
    shell:
        "fastqc -o 1_fastqc/ -f fastq -t {config[threads]} --noextract {input[0]} {input[1]}"

rule multiqc1:
    input:
        expand("1_fastqc/{sample}_1_fastqc.html", sample=SAMPLES),
        expand("1_fastqc/{sample}_2_fastqc.html", sample=SAMPLES)
    output:
        "1_fastqc/multiqc_report.html"
    shell:
        "multiqc 1_fastqc/ -o 1_fastqc/"

rule bowtie2build:
    input:
        fa="genome/pig.fa"
    output:
        touch("genome/pig.complete")
    shell:
        "bowtie2-build {input.fa} genome/pig && touch {output}"

rule bowtie2:
    input:
        "0_rawdata/{sample}_1.fq.gz",
        "0_rawdata/{sample}_2.fq.gz"
    output:
        "2_bowtie2/{sample}.sam"
    shell:
        """
        bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 -p 8 -x genome/pig -1 {input[0]} -2 {input[1]} -S {output}
        """

rule sam2bam:
    input:
        "2_bowtie2/{sample}.sam"
    output:
        "3_bam/{sample}.bam"
    shell:
        "samtools view -@ {config[threads]} -F 0x04 -bS {input} > {output}"

rule bam2bed:
    input:
        "3_bam/{sample}.bam"
    output:
        "4_bed/{sample}.bedpe"
    shell:
        "bedtools bamtobed -i {input} -bedpe > {output}"

rule cleanbed:
    input:
        "4_bed/{sample}.bedpe"
    output:
        "5_bed/{sample}.clean.bedpe"
    shell:
        "awk '$1==$4 && $6-$2 < 1000 {{print $0}}' {input} > {output}"

rule bedpe2bed:
    input:
        "5_bed/{sample}.clean.bedpe"
    output:
        "6_bed/{sample}.clean.bed"
    shell:
        "cut -f 1,2,6 {input} | sort -k1,1 -k2,2n -k3,3n  > {output}"

rule bamsort:
    input:
        "3_bam/{sample}.bam"
    output:
        "3_bam/{sample}.sorted.bam"
    shell:
        "samtools sort -@ {config[threads]} -o {output} {input}"

rule bamindex:
    input:
        "3_bam/{sample}.sorted.bam"
    output:
        "3_bam/{sample}.sorted.bam.bai"
    shell:
        "samtools index {input}"

rule bam2bigwig:
    input:
        "3_bam/{sample}.sorted.bam",
        "3_bam/{sample}.sorted.bam.bai"
    output:
        "7_bigwig/{sample}.bw"
    shell:
        "bamCoverage -b {input[0]} -o {output}"

相關文章