#!/usr/bin/env bash
#
# pmwiki_keywords_distribution - per-page keyword frequency lists for a
# directory of PmWiki page files.
#
# Provenance (taken from the patch that introduced this script):
#   commit  379744239cfe174fd7a19094e98c267920435278
#   author  Fabien Benetou, Fri, 29 Jul 2011 12:33:24 +0200
#   subject generate keywords and quick solution to handle stopwords
#   path    shell_scripts/pmwiki_keywords_distribution (mode 100755)
#
# Usage: pmwiki_keywords_distribution [WIKIPATH]
#   WIKIPATH  directory holding the page files (default: current directory)
#
# For every regular file in WIKIPATH, except names matching PmWiki.*,
# *RecentChanges, totalcounter.stat or *,del-*, the words found on the
# file's "text=" line(s) are counted and written, most frequent first, to
#   WIKIPATH/../pub/keywords_distribution/<page>.txt
# in "uniq -c" format (count, then word).
#
# WIKIPATH/.stopwords, when present, is handed to "grep -i -v -f": each of
# its lines is a case-insensitive basic regular expression, and every
# "count word" line it matches anywhere is left out of the result.
#
# Ideas still to explore:
#   - automatic category creation
#   - annotated images
#
# To help build .stopwords, run this script a few times and look at:
#   head ../pub/keywords_distribution/* | sort -n | tail -30

# No "set -e" / "pipefail": grep legitimately exits 1 when it selects no
# lines (page without text, everything filtered out), which is not an error.
set -u

# Byte-wise, locale-independent behaviour for sed/tr/sort/uniq/grep so the
# output does not depend on the caller's locale.
export LC_ALL=C

#######################################
# Print the words of one page file, one per line.
# Arguments: $1 - path to the page file
# Outputs:   words (runs of ASCII letters) on stdout
#######################################
extract_words() {
  local page=$1

  # Stage order matters: "%0a" must be turned into real line breaks first,
  # because the following markup patterns are anchored at line start.
  #   ^(:...:)   leading directive            -> removed
  #   ^>>...     line starting with ">>"      -> removed
  #   http...|   from "http" to the last "|"  -> removed
  # Finally every non-letter becomes a line break and empty tokens go away.
  grep -e '^text=' -- "$page" \
    | sed -e 's/^text=//' \
    | awk '{ gsub(/%0a/, "\n"); print }' \
    | sed -e 's/^(:.*:)//' -e 's/^>>.*//' -e 's/http.*|//g' \
    | tr -c 'a-zA-Z' '\n' \
    | grep -v -e '^$'
}

#######################################
# Drop "count word" lines that match a stopword pattern.
# Arguments: $1 - stopwords file; missing or empty means "no filtering"
# Outputs:   surviving stdin lines on stdout
#######################################
filter_stopwords() {
  local stopwords=$1

  if [[ -s "$stopwords" ]]; then
    grep -i -v -f "$stopwords"
  else
    cat
  fi
}

#######################################
# Print the keyword distribution of one page file.
# Arguments: $1 - path to the page file
#            $2 - stopwords file (optional)
# Outputs:   "count word" lines on stdout, highest count first
#######################################
keyword_distribution() {
  local page=$1
  local stopwords=${2-}

  # sort -f puts case variants next to each other so uniq -i can merge them.
  extract_words "$page" \
    | sort -f \
    | uniq -i -c \
    | filter_stopwords "$stopwords" \
    | sort -r -n
}

#######################################
# Generate one distribution file per wiki page.
# Arguments: $1 - wiki page directory (default: .)
# Outputs:   files under <dir>/../pub/keywords_distribution/; diagnostics
#            on stderr
# Returns:   0 on success, 1 if the input or output directory is unusable
#######################################
main() {
  local wiki_dir=${1:-.}
  local out_dir="$wiki_dir/../pub/keywords_distribution"
  local stopwords="$wiki_dir/.stopwords"
  local page name

  if [[ ! -d "$wiki_dir" ]]; then
    printf '%s: not a directory: %s\n' "${0##*/}" "$wiki_dir" >&2
    return 1
  fi

  if ! mkdir -p -- "$out_dir"; then
    printf '%s: cannot create %s\n' "${0##*/}" "$out_dir" >&2
    return 1
  fi

  if [[ ! -f "$stopwords" ]]; then
    printf '%s: no %s, stopwords are not filtered\n' \
      "${0##*/}" "$stopwords" >&2
  fi

  # A plain glob skips dotfiles (so .stopwords is never treated as a page)
  # and is safe for any file name, unlike parsing the output of ls.
  for page in "$wiki_dir"/*; do
    [[ -f "$page" ]] || continue
    name=${page##*/}
    case "$name" in
      PmWiki.* | *RecentChanges | totalcounter.stat | *,del-*) continue ;;
    esac
    keyword_distribution "$page" "$stopwords" > "$out_dir/$name.txt"
  done
}

main "$@"