Re: Count Occurrence of words in a long text
- To: mathgroup at smc.vnet.net
- Subject: [mg118984] Re: Count Occurrence of words in a long text
- From: Ulrich Arndt <ulrich.arndt at data2knowledge.de>
- Date: Wed, 18 May 2011 07:18:37 -0400 (EDT)
Hi, this should do it: (* get data *) txt = ExampleData[{"Text", "OriginOfSpecies"}]; (* define replace rules for chars / strings you want to remove *) replacerule = {"." -> "", "," -> "", "!" -> "", "?" -> "", ";" -> ""}; (* remove the chars/strings you are not interested in *) removed = StringReplace[txt, replacerule]; (* create a word list - first split the string by newlines, afterwards by blanks *) wordlist = Flatten[StringSplit[#, " "] & /@ StringSplit[removed, "\n"]]; (* remove whitespace from beginning and end, convert to lowercase, count the number of occurrences per word, sort by number of occurrences and give the 100 most used words *) Sort[Tally[ ToLowerCase[StringTrim[wordlist]]], #1[[2]] > #2[[2]] &][[1 ;; 100]] {{"the", 10104}, {"of", 7262}, {"and", 4338}, {"in", 3891}, {"to", 3550}, {"a", 2416}, {"that", 2056}, {"have", 1759}, {"be", 1652}, {"as", 1552}, {"species", 1432}, {"is", 1411}, {"by", 1317}, {"on", 1239}, {"which", 1221}, {"or", 1176}, {"we", 1152}, {"are", 1129}, {"from", 1117}, {"for", 1093}, {"it", 1051}, {"i", 974}, {"with", 970}, {"this", 956}, {"been", 931}, {"but", 850}, {"not", 844}, {"same", 778}, {"other", 739}, {"will", 735}, {"their", 700}, {"at", 675}, {"some", 654}, {"one", 636}, {"has", 611}, {"more", 573}, {"all", 544}, {"each", 543}, {"they", 540}, {"can", 514}, {"any", 511}, {"so", 509}, {"may", 507}, {"an", 498}, {"these", 494}, {"would", 488}, {"many", 449}, {"when", 411}, {"if", 409}, {"its", 402}, {"most", 401}, {"very", 399}, {"forms", 388}, {"varieties", 384}, {"selection", 375}, {"than", 370}, {"natural", 360}, {"two", 339}, {"between", 328}, {"no", 320}, {"several", 304}, {"there", 293}, {"life", 293}, {"plants", 290}, {"different", 277}, {"our", 277}, {"case", 275}, {"being", 273}, {"thus", 272}, {"see", 269}, {"animals", 268}, {"only", 265}, {"great", 257}, {"those", 257}, {"distinct", 255}, {"having", 251}, {"nature", 246}, {"had", 243}, {"new", 242}, {"do", 238}, {"period", 237}, {"must", 233}, {"could", 233}, {"now", 
232}, {"much", 229}, {"under", 221}, {"such", 218}, {"cases", 218}, {"how", 217}, {"believe", 216}, {"during", 215}, {"structure", 210}, {"even", 210}, {"conditions", 208}, {"should", 206}, {"long", 203}, {"genera", 201}, {"yet", 199}, {"into", 199}, {"generally", 198}} In[32]:= (* all in one *) Sort[Tally[ ToLowerCase[ StringTrim[ Flatten[StringSplit[#, " "] & /@ StringSplit[ StringReplace[txt, replacerule], "\n"]]]]], #1[[2]] > #2[[2]] &][[1 ;; 100]] Out[32]= {{"the", 10104}, {"of", 7262}, {"and", 4338}, {"in", 3891}, {"to", 3550}, {"a", 2416}, {"that", 2056}, {"have", 1759}, {"be", 1652}, {"as", 1552}, {"species", 1432}, {"is", 1411}, {"by", 1317}, {"on", 1239}, {"which", 1221}, {"or", 1176}, {"we", 1152}, {"are", 1129}, {"from", 1117}, {"for", 1093}, {"it", 1051}, {"i", 974}, {"with", 970}, {"this", 956}, {"been", 931}, {"but", 850}, {"not", 844}, {"same", 778}, {"other", 739}, {"will", 735}, {"their", 700}, {"at", 675}, {"some", 654}, {"one", 636}, {"has", 611}, {"more", 573}, {"all", 544}, {"each", 543}, {"they", 540}, {"can", 514}, {"any", 511}, {"so", 509}, {"may", 507}, {"an", 498}, {"these", 494}, {"would", 488}, {"many", 449}, {"when", 411}, {"if", 409}, {"its", 402}, {"most", 401}, {"very", 399}, {"forms", 388}, {"varieties", 384}, {"selection", 375}, {"than", 370}, {"natural", 360}, {"two", 339}, {"between", 328}, {"no", 320}, {"several", 304}, {"there", 293}, {"life", 293}, {"plants", 290}, {"different", 277}, {"our", 277}, {"case", 275}, {"being", 273}, {"thus", 272}, {"see", 269}, {"animals", 268}, {"only", 265}, {"great", 257}, {"those", 257}, {"distinct", 255}, {"having", 251}, {"nature", 246}, {"had", 243}, {"new", 242}, {"do", 238}, {"period", 237}, {"must", 233}, {"could", 233}, {"now", 232}, {"much", 229}, {"under", 221}, {"such", 218}, {"cases", 218}, {"how", 217}, {"believe", 216}, {"during", 215}, {"structure", 210}, {"even", 210}, {"conditions", 208}, {"should", 206}, {"long", 203}, {"genera", 201}, {"yet", 199}, {"into", 199}, 
{"generally", 198}} You might face issues in case the texts you are analyzing are huge. In this case you should consider using Mathematica in combination with a DB... Ulrich Am 17.05.2011 um 13:47 schrieb Yako: > Hello, > > First of all I am pretty new to Mathematica, so excuse me if this has > a simple answer. > > What I need is to be able to count the occurrence of each word of a > text and count the times each word appears on it. I know how to do > this on other languages but I am trying to achieve it with > mathematica. > > Can someone hint me the way to go? > > Thanks! >