# Demonstration script: pick out taxonomic family names (ending in "idae")
# from a piece of text, then detach trailing punctuation from the words and
# re-assemble the sentence from the resulting tokens.
#
# Variables left in the workspace:
#   text1       the sample text (single string)
#   text2       tokens of text1, with trailing punctuation as separate tokens
#   lymantrids  positions, in the original space-split words, starting "Lyman"
#   temp        the original words with one trailing punctuation mark removed
#   families    words of temp that end in "idae"
#   punctuation positions of the original words that end in punctuation
#   newpunct    positions in text2 of tokens that are one punctuation mark
#   newtext     the text rebuilt from text2

# NOTE(review): "-idea" in the sample text looks like a typo for "-idae";
# it is left unchanged because the text is the script's input data.
text1 <- paste0(
  "We want to write some R code recognises animal or plant family, ",
  "subfamily, tribe or subtribe names in a piece of text. ",
  "For animals these always end in -idea, -inae, -ini and -ina ",
  "respectively. The Noctuidae, Erebidae, Sphingidae and Saturniidae ",
  "are families of moths, the Lymantriinae is a subfamily of Erebidae, ",
  "the Lymantriini, Leucomini, Orgyiini and Nygmiini are tribes of the ",
  "Lymantriinae, etc."
)

# Split the text into words on single spaces.
text2 <- unlist(strsplit(text1, " "))

# Positions of words beginning with "Lyman".
lymantrids <- grep("^Lyman", text2)

# Matching "idae$" directly on text2 misses words such as "Noctuidae," whose
# last character is a comma, so strip one trailing punctuation mark first.
temp <- sub("[[:punct:]]$", "", text2)
families <- temp[grepl("idae$", temp)]

# Put a space before the trailing punctuation mark of each affected word
# ("moths," -> "moths ,"). Vectorised, so it is also safe when no word ends
# in punctuation (the original 1:length() loop was not).
punctuation <- grep("[[:punct:]]$", text2)
text2[punctuation] <- sub("([[:punct:]])$", " \\1", text2[punctuation])

# Re-split so every detached punctuation mark becomes its own token.
text2 <- unlist(strsplit(text2, " "))

# Tokens that consist of exactly one punctuation character.
newpunct <- grep("^[[:punct:]]$", text2)

# Rebuild the text: a token is preceded by a space unless it is a lone
# punctuation mark or the very first token.
token_index <- seq_along(text2)
no_space_before <- token_index %in% newpunct | token_index == 1L
separators <- ifelse(no_space_before, "", " ")
newtext <- paste0(separators, text2, collapse = "")

newtext