## Tidying dataset

### Preliminaries

# Each record is "<specimen name> <sequence>". The script moves the
# "MRS<digits>" number (when a name contains one) to the front of the name,
# tidies the underscores left behind, and pads the names so that all
# sequences start in the same column.
raw_lines <- c(
  "Aleiodes_bicolor_MRS1008 TATTTTATATTTTTTATTT",
  "Aleiodes_praetor_UK_MRS67_ XXXXXXXXXGTTTTATAT",
  "Aleiodes_rugulosus_CollHH1599_Norway xxxxxxxxxATTTTGTATTTTTT",
  "Aleiodes_seriatus_MRS252_France xxxxxxxxxxxxxxxxxxxxxxxxx",
  "Aleiodes_seriatus_MRS254_France xxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
  "Aleiodes_seriatus_MRS263_France xxxxxxxxxATTTTATACTTTTTATTTGG",
  "Aleiodes_seriatus_MRS264_France xxxxxxxxxATTTTATACTTTTTATTTGG",
  "Aleiodes_seriatus_MRS136_France GATATTGGAATTTTATATTT",
  "MRS239_Aleiodes_seriatus_Russia xxxxxxxxxGTTTTATACTTCTTATTT",
  "Aleiodes_seriatus_MRS222_Germany xxxxxxxxxATTTtaTaCTTTTTATT",
  "Aleiodes_sibiricus_MRS313_Sweden xxxxxxxxxxTTTTGTATTTTTTATT",
  "Aleiodes_signatus_MRS378_Sweden xxxxxxxxxxxTTTATATTTTTTATT",
  "Aleiodes_signatus_MRS712_Sweden GATATTGGTATTTTATATTTTTTA",
  "Aleiodes_unipunctator_CollHH1603_Norway xxxxxxxATTTTATATTTTTTATG"
)

# Read the records through a text connection (one element per record) and
# close the connection explicitly so it is not left open.
con <- textConnection(raw_lines)
data <- readLines(con)
close(con)

# Exploring how the MRS numbers can be located
grep("MRS", data)                                 # indices of records with "MRS"
regexpr("MRS", data)                              # position of "MRS" (-1 if none)
unlist(gregexpr(pattern = "MRS[0-9]+", data))     # position of "MRS<digits>"
unlist(regmatches(x = data, gregexpr("MRS[0-9]+", text = data)))  # the matches

# First attempt: works only when the record contains an MRS number.
# For a record without one, `mrs` is character(0) and gsub() rejects the
# empty pattern, so the call below is left commented out.
startwithMRS <- function(b) {
  mrs <- unlist(regmatches(x = b, gregexpr("MRS[0-9]+", text = b)))
  z <- gsub(mrs, "", b)
  z <- paste(mrs, "_", z, sep = "")
  z
}
# MRSatstart <- unlist(lapply(data, function(x) startwithMRS(x)))

# Record 3 has no MRS number, which is what breaks the first attempt:
mrs <- unlist(
  regmatches(x = data[3], gregexpr("MRS[0-9]+", text = data[3]))
)
mrs # character(0)

# Move the MRS number of one record to the start of the record.
#
# b: a single character string ("<name> <sequence>").
# Returns `b` unchanged when it contains no "MRS<digits>" token; otherwise
# the first such token, an underscore, and `b` with that token removed.
# Underscores orphaned by the removal are cleaned up by the caller.
startwithMRS <- function(b) {
  # regexpr() yields at most one match per string, so `mrs` is length 0 or 1
  mrs <- regmatches(b, regexpr("MRS[0-9]+", b))
  if (length(mrs) == 0) {
    return(b) # the MRS string does not occur on the line
  }
  # fixed = TRUE: the token is literal text, not a regular expression
  z <- gsub(mrs, "", b, fixed = TRUE) # remove the original MRS occurrence
  paste0(mrs, "_", z)                 # paste the MRS number at the beginning
}

MRSatstart <- vapply(data, startwithMRS, character(1), USE.NAMES = FALSE)
MRSatstart

#### replace double underscores
MRSatstart <- gsub("__", "_", MRSatstart)
MRSatstart

#### remove leading/trailing spaces and collapse runs of spaces to one space
# Done before measuring the names so that a leading space cannot be mistaken
# for the end of a name.
MRSatstart <- gsub("^ *|(?<= ) | *$", "", MRSatstart, perl = TRUE)

#### replace the underscore at end of name
MRSatstart <- gsub("_ ", " ", MRSatstart)

#### count characters in names
# The name is everything before the first space; a record without any space
# is treated as consisting of a name only.
first_space <- as.integer(regexpr(" ", MRSatstart, fixed = TRUE))
has_seq <- first_space > 0L
namelen <- ifelse(has_seq, first_space - 1L, nchar(MRSatstart))
M <- max(namelen)

#### pad the names so that every sequence starts in column M + 2
# Only the separator between name and sequence is widened.
MRSatstart[has_seq] <- paste0(
  substr(MRSatstart[has_seq], 1L, namelen[has_seq]),
  strrep(" ", M - namelen[has_seq] + 1L),
  substring(MRSatstart[has_seq], namelen[has_seq] + 2L)
)
MRSatstart
#  [1] "MRS1008_Aleiodes_bicolor                TATTTTATATTTTTTATTT"
#  [2] "MRS67_Aleiodes_praetor_UK               XXXXXXXXXGTTTTATAT"
#  [3] "Aleiodes_rugulosus_CollHH1599_Norway    xxxxxxxxxATTTTGTATTTTTT"
#  [4] "MRS252_Aleiodes_seriatus_France         xxxxxxxxxxxxxxxxxxxxxxxxx"
#  [5] "MRS254_Aleiodes_seriatus_France         xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
#  [6] "MRS263_Aleiodes_seriatus_France         xxxxxxxxxATTTTATACTTTTTATTTGG"
#  [7] "MRS264_Aleiodes_seriatus_France         xxxxxxxxxATTTTATACTTTTTATTTGG"
#  [8] "MRS136_Aleiodes_seriatus_France         GATATTGGAATTTTATATTT"
#  [9] "MRS239_Aleiodes_seriatus_Russia         xxxxxxxxxGTTTTATACTTCTTATTT"
# [10] "MRS222_Aleiodes_seriatus_Germany        xxxxxxxxxATTTtaTaCTTTTTATT"
# [11] "MRS313_Aleiodes_sibiricus_Sweden        xxxxxxxxxxTTTTGTATTTTTTATT"
# [12] "MRS378_Aleiodes_signatus_Sweden         xxxxxxxxxxxTTTATATTTTTTATT"
# [13] "MRS712_Aleiodes_signatus_Sweden         GATATTGGTATTTTATATTTTTTA"
# [14] "Aleiodes_unipunctator_CollHH1603_Norway xxxxxxxATTTTATATTTTTTATG"