Multiple Sequence Alignment
The method and database information was from following journal article: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5597096/. The objective was to discover missense variants that could help explain the differences in dentition between Neanderthals and modern humans. This was an attempt to recreate/speculate what was done for the research paper to obtain their results.
Read in the Fastas
The function read.fasta() from the R package ‘seqinr’ was used to read in protein sequence FASTA files.
#read fasta
library(seqinr)
#multiple sequence alignment
library(msa)
#read in fasta using seqinr
altai <- read.fasta("Altai.txt", seqtype="AA",as.string=TRUE)
vindija <- read.fasta("Vindija.txt", seqtype="AA",as.string=TRUE)
sidron <- read.fasta("Sidron.txt", seqtype="AA",as.string=TRUE)
Create a function that takes the gene name and outputs a msa
Part 1: Create a combined fasta using write.fasta
- The reference FASTA files for human, chimpanzee (Pan troglodytes), and gorilla (Gorilla gorilla) protein sequences were obtained from UniProt
- Neanderthal data including Altai pedal phalanx (Altai Mountain Cave, Siberia) and bone fragments of SD1253 (El Sidron cave, Spain) and Vi33.15 (Vindija cave, Croatia) were obtained from the Max Planck Institute for Evolutionary Anthropology: http://cdna.eva.mpg.de/neandertal/exomes/
Part 2: Align the sequences from the combined fasta using the msa package
The ClustalW algorithm was used for the multiple sequence alignment with a BLOSUM matrix. The msaPrettyPrint() function was used to print the sequence in a more readable manner
msa_maker<- function(gene_name, ensg_num, hcg_fasta_name, combined_fasta_name){
#Part 1: Create a combined fasta
#Read in fasta with human, chimp, and gorilla
hcg_fasta <- read.fasta(hcg_fasta_name, seqtype="AA",as.string=TRUE,set.attributes = FALSE)
#changed the original sequence names because they were too long and made the final msa look messy
hcg_names <- sapply(strsplit(names(hcg_fasta), "|", fixed=TRUE),"[[", 3)
#write into new combined fasta
write.fasta(hcg_fasta, names=hcg_names,file.out = combined_fasta_name, open="w", as.string=TRUE)
#find the protein in the Neanderthal Data and write it into the combined fasta file
#Altai
#sequence name in file is in the form of ENSGXXXX_ENSTXXXX, grep used to just search with the ENSG info
altai_index<-grep(ensg_num,attributes(altai)$names)[[1]]
altai_filtered<- altai[altai_index]
#write into new combined fasta
write.fasta(paste(altai_filtered)[1], paste0("altai_",gene_name),file.out = combined_fasta_name, open="a", as.string=TRUE)
#Vindija
vindija_index<-grep(ensg_num,attributes(vindija)$names)[[1]]
vindija_filtered<-vindija[vindija_index]
write.fasta(paste(vindija_filtered)[1], paste0("vindija_", gene_name),file.out = combined_fasta_name, open="a", as.string=TRUE)
#Sidron
sidron_index<-grep(ensg_num,attributes(sidron)$names)[[1]]
sidron_filtered<-sidron[sidron_index]
write.fasta(paste(sidron_filtered)[1], paste0("sidron_", gene_name),file.out = combined_fasta_name, open="a", as.string=TRUE)
#Part 2: align the sequences using multiple sequence alignment
sequences <- Biostrings::readAAStringSet(combined_fasta_name)
msa <-msa::msa(
inputSeqs=sequences,
method = "ClustalW"
)
return(msa)
}
Function that helps find the missense variant from the msa object
Additional computations in R were used to find the positions of missense variants excluding any that were the result of X’s in the sequence.
#why is index of each sequence needed? The msa before can arrange the sequences not necessarily in the order inputted
missense_finder<-function(completed_msa,human_index, gorilla_index, chimp_index, altai_index, vindija_index, sidron_index){
#convert the msa to another format so that I can work with it
convert_msa <- msaConvert(completed_msa,"seqinr::alignment")
missense_list<-c()
#go through entire alignment
for (i in 1:nchar(convert_msa$seq[1])){
#if the human amino acid doesn't match the altai, add to the stored list
if ((substr(convert_msa$seq[human_index],i,i)!=substr(convert_msa$seq[altai_index],i,i))&&(substr(convert_msa$seq[altai_index],i,i)!="X")&&(substr(convert_msa$seq[human_index],i,i)!="-")&&(substr(convert_msa$seq[altai_index],i,i)!="-")){
#stored in the order human, gorilla, chimp, altai, vindija, sidron
amino_list<-c(i,substr(convert_msa$seq[human_index],i,i),substr(convert_msa$seq[gorilla_index],i,i),substr(convert_msa$seq[chimp_index],i,i),substr(convert_msa$seq[altai_index],i,i),substr(convert_msa$seq[vindija_index],i,i),substr(convert_msa$seq[sidron_index],i,i))
missense_list=rbind(missense_list, amino_list)
}
#if human amino acid doesn't match the vindija, add to the stored list
else if ((substr(convert_msa$seq[human_index],i,i)!=substr(convert_msa$seq[vindija_index],i,i))&&(substr(convert_msa$seq[vindija_index],i,i)!="X")&&(substr(convert_msa$seq[human_index],i,i)!="-")&&(substr(convert_msa$seq[vindija_index],i,i)!="-")){
#stored in the order human, gorilla, chimp, altai, vindija, sidron
amino_list<-c(i,substr(convert_msa$seq[human_index],i,i),substr(convert_msa$seq[gorilla_index],i,i),substr(convert_msa$seq[chimp_index],i,i),substr(convert_msa$seq[altai_index],i,i),substr(convert_msa$seq[vindija_index],i,i),substr(convert_msa$seq[sidron_index],i,i))
missense_list=rbind(missense_list, amino_list)
}
#if human amino acid doesn't match the sidron, add to the stored list
else if ((substr(convert_msa$seq[human_index],i,i)!=substr(convert_msa$seq[sidron_index],i,i))&&(substr(convert_msa$seq[sidron_index],i,i)!="X")&&(substr(convert_msa$seq[human_index],i,i)!="-")&&(substr(convert_msa$seq[sidron_index],i,i)!="-")){
#stored in the order human, gorilla, chimp, altai, vindija, sidron
amino_list<-c(i,substr(convert_msa$seq[human_index],i,i),substr(convert_msa$seq[gorilla_index],i,i),substr(convert_msa$seq[chimp_index],i,i),substr(convert_msa$seq[altai_index],i,i),substr(convert_msa$seq[vindija_index],i,i),substr(convert_msa$seq[sidron_index],i,i))
missense_list=rbind(missense_list, amino_list)
}
}
colnames(missense_list)<- c("position","human","gorilla","chimpanzee","altai","vindija", "sidron")
return(missense_list)
}
Find the tooth proteins: Example
Dentin matrix acidic phosphoprotein 1 (DMP1): ENSG00000152592
dmp1_msa<-msa_maker("dmp1", "ENSG00000152592", "uniprot_dmp1.fasta", "combined_dmp1.fasta")
## use default substitution matrix
#pretty print the aligned sequence in a pdf file using the following line
#msaPrettyPrint(dmp1_msa,file=paste0("dmp1",".pdf"),askForOverwrite=FALSE, showLogo="none",showConsensus = "none")
missense_finder(dmp1_msa, 3, 1, 2, 4, 5, 6)
## position human gorilla chimpanzee altai vindija sidron
## amino_list "483" "N" "N" "N" "T" "T" "X"