misc/pim_search.php

<html>
<div style="width:400px; position:absolute; right:0px; background-color:#F0F8FF; margin:10px; border:2px solid #ddf; padding:5px;">
<h2>To do</h2>
<ul>
	<li>update <a href="http://fabien.benetou.fr/Tools/Sphinxsearch">Tools/Sphinxsearch</a> and <a href="http://fabien.benetou.fr/repository/?p=.git;a=blob;f=pim_search.php">the public repository source</a></li>
	<li>List the non working queries and debug them (use = keyword_to_test for comparison)</li>
	<ul>
		<li><a href="/search/full text search">full text search</a> should return <a href="http://fabien.benetou.fr/Events/RailsCampParis3#FullTextSearch">#FullTextSearch</a></li>
		<li><a href="/search/facebook">facebook</a></li>
		<li><a href="/search/google">google</a></li>
		<li><a href="/search/apple">apple</a></li>
		<li><a href="/search/Brooks">Brooks</a></li>
		<li>... (cf log analysis "?bug&amp;keyword=problematickeyword")</li>
	</ul>
	<li>see also <a href="http://fabien.benetou.fr/Wiki/LearningSearch#ProblematicQueries">ProblematicQueries</a></li>
	<li>make it more generalist via e.g. <a href="http://www.onelook.com/reverse-dictionary.shtml">reverse dictionary</a></li>
	<li>better split documents, e.g. logs per day</li>
	<li>handle the histories (wiki and repository diffs)</li>
	<li>make URLs work, consider specific stemming</li>
	<li>improve weighting and thus ranking</li>
	<li>properly handle remote URLs</li>
	<li>leverage .pageindex of each wiki</li>
	<li>work on snippets</li>
	<li>replace WikiName: by their icon, cf localmap</li>
	<li>add pagerank and other topology metrics as attributes</li>
	<li>use delta or RT indexing, requires better ID generation first though</li>
	<li>add visualization, e.g. network or linked document with the same color</li>
	<li>provide links to specific interesting queries, e.g. only this week</li>
	<ul>
		<li><a href="/search/%22internet%20brain%22~10">"internet brain"~10</a></li>
	</ul>
	<li><a href="http://fabien.benetou.fr/Wiki/LearningSearch">LearningSearch</a></li>
	<li>integrate <a href="http://fabien.benetou.fr/MemoryRecalls/ImprovingPIM#SocialPIM">Social PIM</a>, especially if there are 0 results</li>
	<li>integrate external search, e.g. Seeks, especially if there are 0 results including through Social PIM</li>
</ul>
</div>
<div style="width:500px; position:absolute; left:0px; background-color:#F8F0FF; margin:10px; border:2px solid #ddf; padding:5px;">
<?php

# Overall this is very specialized and should be configurable to be used on other PIMs

# this should be tested first and if not present, explain in a line where to download and how to install Sphinx API
require ( "sphinxapi.php" );
require ( "sphinx_doc_ids.php" );

# Name of the Sphinx index that is queried further down.
$index = "pmwikis";

# Search terms. A missing parameter or a non-string value (e.g. ?query[]=x)
# is treated as an empty query, so the search form is shown instead of a
# PHP notice or an array leaking into the string handling below.
$words = ( isset ($_GET["query"]) && is_string ($_GET["query"]) ) ? $_GET["query"] : "";

# Offset of the first result to display. Anything that is not a number, and
# any negative number, falls back to the first page.
if ( isset ($_GET["startingitem"]) && is_numeric ($_GET["startingitem"]) )
	$startingitem = max ( 0, (int) $_GET["startingitem"] );
else
	$startingitem = 0;

# Icons attached to the query and to every result; the alt text states what
# each icon is for.
$image_path = "/devpim/pub/";
$copy_img = "<img src=\"".$image_path."clipboard_add.png"."\" alt=\"copy the page name to the clipboard\"/>";
$star_img = "<img src=\"".$image_path."yellow-star.gif"."\" alt=\"mark that result as significant\"/>";
// this could use some Javascript effect
$alt_img = "<img src=\"".$image_path."server.png"."\" alt=\"use the alternate server (local or remote)\"/>";
$bug_img = "<img src=\"".$image_path."bug.gif"."\" alt=\"mark that query as problematic\"/>";

# Without a query there is nothing to search: show a hint and a form, then stop.
if ($words === ""){
	print "You need to make an actual search, use ?query=keyword<br/>";
	print "<form action=\"/devpim/pub/search.php\" method=\"get\"><input type=\"text\" name=\"query\"><input type=\"submit\" value=\"search\"/></form><br/>";
	return;
}

# Number of results shown per page.
$itemspan=20;

$cl = new SphinxClient ();
# SetLimits() takes (offset, limit), not (first, last): the second argument is
# the page size. The offset is forced to a non-negative integer because it
# originates from the request.
$cl->SetLimits(max(0, (int)$startingitem), $itemspan);
$cl->SetMatchMode(SPH_MATCH_EXTENDED2);
$cl->SetRankingMode(SPH_RANK_PROXIMITY_BM25);
# consider other ranking e.g. pagerank, weighted with freshness and hybrid
$cl->SetSortMode(SPH_SORT_RELEVANCE);
# Run the search; the result is checked and rendered right after this block.
$res = $cl->Query($words,$index);
#cf http://sphinxsearch.com/docs/manual-2.0.1.html#api-funcgroup-querying
#$res = $cl->BuildExcerpts ( $docs, $index, $words, $opts );
# Render the outcome of the Sphinx query: either the error, or the list of
# matching documents with their links, a pager and the age of the index.
if ( !$res ) {
	# The error text may echo parts of the user's query, so escape it.
	die ( "ERROR: " . htmlspecialchars ( $cl->GetLastError(), ENT_QUOTES, "UTF-8" ) . ".\n" );
} else {
	// TODO: when there are 0 results, fall back to a social search, e.g.
	// $socialsearch = shell_exec("pmwiki_social_search ".escapeshellarg($words));
	// ideally this would be done asynchronously as it does take some time (few seconds!)
	// yet still always offer it as an option (since it's "costly")
	#var_dump($res);

	# The query comes from the request: escape it once per output context.
	$words_html = htmlspecialchars ( $words, ENT_QUOTES, "UTF-8" );	# HTML text
	$words_param = urlencode ( $words );				# query-string value
	$words_path = rawurlencode ( $words );				# URL path segment

	# Offset of the current page, forced to a non-negative integer before it
	# is used in arithmetic and printed into links.
	$firstitem = max ( 0, (int) $startingitem );

	# "matches" is tested before use so that a result without it (0 hits)
	# yields an empty list rather than a warning.
	if ( isset ($res["matches"]) && is_array ($res["matches"]) )
		$IDs = array_keys($res["matches"]);
	else
		$IDs = array();
	// note that $IDs = array_unique(array_keys($res["matches"])); does not solve the multiple page issue
	// thus probably comes from a duplicate indexing
	print "<h2>Query<a href=\"?bug&amp;keyword=$words_param\">$bug_img</a></h2>";
	# Per-keyword hit count is only looked up under the whole query string as key.
	if (isset( $res["words"]["$words"]["hits"])  )
		$hits = $res["words"]["$words"]["hits"]." time(s) ";
	else
		$hits = "";
	print "\"$words_html\" found ".$hits." in ".$res["total_found"]." document(s):<br/>";
	print "<ul>";
	foreach ($IDs as $i)
	{
		#BuildExcerpts()
		#consider here adding snippet, would require few disk access, MySQL storage could be faster...
		print "<li>";
		# $idx maps document IDs to file paths; presumably provided by
		# sphinx_doc_ids.php (TODO confirm). An ID without an entry means the
		# map and the index are out of sync: say so instead of failing.
		if ( !isset ($idx["$i"]) ) {
			print "unknown document #".htmlspecialchars ( (string) $i, ENT_QUOTES, "UTF-8" )."</li>";
			continue;
		}
		$file=$idx["$i"];
		if (preg_match('|/home/fabien/irclogs/(.*)|',$file,$m)){
			# IRC logs: link to the online discussion archive.
			$target = htmlspecialchars ( $m[1], ENT_QUOTES, "UTF-8" );
			$url = urlencode($m[1]);
			print "<a href=\"https://cloud.benetou.fr/discussions/$url\">IRClogs:$target</a>";
		}
		elseif (preg_match('|/home/fabien/www/mirrors/(.*)/wiki.d/(.*)\.(.*)|',$file,$m)){
			# Mirrored wikis: wiki.d/Group.Name becomes Group/Name on the
			# public site that the mirror was taken from.
			$source = $m[1];
			$page = htmlspecialchars ( $m[2]."/".$m[3], ENT_QUOTES, "UTF-8" );
			switch ( $source ) {
				case "saint-maur": $url="http://saint-maur.benetou.fr"; break;
				case "fabien": $url="http://fabien.benetou.fr"; break;
				case "pim": $url="http://www.ourp.im/"; break;
				case "agiwiki": $url="http://www.agi-wiki.org/"; break;
				case "wiki": $url="http://fabien.benetou.fr/innovativ.it/www/HistoricalArchives/Seedea"; break;
				default: $url=$source;
			}
			$url = htmlspecialchars ( $url, ENT_QUOTES, "UTF-8" );
			$source = htmlspecialchars ( $source, ENT_QUOTES, "UTF-8" );
			print "<a href=\"$url/$page\">$source:$page</a>";
			print "<a href=\"http://self/mirrors/$source/$page\">$alt_img</a>";
			print "<a href=\"javascript:[[$page]]\">$copy_img</a>";
		} elseif (preg_match('|/home/fabien/repository/(.*)|',$file,$m)){
			# Repository files: link to the blob view of the public repository.
			$blob = htmlspecialchars ( $m[1], ENT_QUOTES, "UTF-8" );
			print "<a href=\"http://fabien.benetou.fr/repository/?p=.git;a=blob;f=$blob\">Repository:$blob</a>";
		} elseif (preg_match('|/home/fabien/www/(.*)/wiki.d/(.*)\.(.*)|',$file,$m)){
			# Local wikis: link to the local copy and to the remote backup.
			$wiki = htmlspecialchars ( $m[1], ENT_QUOTES, "UTF-8" );
			$page = htmlspecialchars ( $m[2]."/".$m[3], ENT_QUOTES, "UTF-8" );
			print "<a href=\"http://self/$wiki/$page\">$wiki:$page</a>";
			print "<a href=\"https://cloud.benetou.fr/backups/wiki/$page\">$alt_img</a>";
		} else {
			# Unrecognized location: show the path itself, once and escaped.
			print htmlspecialchars ( $file, ENT_QUOTES, "UTF-8" );
		}
		print "<a href=\"?star&amp;keyword=$words_param&amp;result=".urlencode($file)."\">$star_img</a>";
		print "</li>";
	}
	$previousitems=$firstitem-$itemspan;
	$nextitems=$firstitem+$itemspan;
	print "</ul>";
	print "<div>";
	if ($previousitems >= 0)
		print "<a href=\"/search/$words_path/$previousitems\">previous items</a>";
	# One link per page, including the last (possibly partial) one; the pager
	# is omitted when everything fits on a single page.
	$pagecount = (int) ceil ( $res["total"] / $itemspan );
	if ($pagecount > 1) {
		for ($p=1;$p<=$pagecount;$p++) {
			$pageoffset = ($p-1)*$itemspan;
			print " <a href=\"/search/$words_path/$pageoffset\">$p</a>/";
		}
	}
	if ($nextitems < $res["total"])
		print "<a href=\"/search/$words_path/$nextitems\">next items</a>";
	print "</div>";

	# Age of the index, taken from the modification time of its data file;
	# stays "unknown" when the file cannot be examined.
	$indexage = "unknown";
	$targetindexfile = "/var/lib/sphinxsearch/data/pmwikis.spd";
	if ( is_file ($targetindexfile) ) {
		$updatedindex = filemtime($targetindexfile);
		if ( $updatedindex !== false )
			$indexage = date("c",$updatedindex);
	}
	print "<hr/><center>$indexage index<br/>(if bug check indexer errors).</center>";
}

?>
</div>
Sphinx search interface 14 years ago			`<html>`
			`<div style="{width:400px; position:absolute; right:0px; background-color:#F0F8FF; margin:10px; border:2px solid #ddf; padding:5px;}">`
			`<h2>To do</h2>`
			`<ul>`
associated URL 14 years ago			`<li>update <a href="http://fabien.benetou.fr/Tools/Sphinxsearch">Tools/Sphinxsearch</a> and <a href="http://fabien.benetou.fr/repository/?p=.git;a=blob;f=pim_search.php">the public repository source</a></li>`
Sphinx search interface 14 years ago			`<li>List the non working queries and debug them (use = keyword_to_test for comparison)</li>`
			`<ul>`
			`<li><a href="/search/full text search">full text search</a> should return <a href="http://fabien.benetou.fr/Events/RailsCampParis3#FullTextSearch">#FullTextSearch</a></li>`
			`<li><a href="/search/facebook">facebook</a></li>`
			`<li><a href="/search/google">google</a></li>`
			`<li><a href="/search/apple">apple</a></li>`
			`<li><a href="/search/Brooks">Brooks</a></li>`
			`<li>... (cf log analysis "?bug&keyword=problematickeyword")</li>`
			`</ul>`
			`<li>see also <a href="http://fabien.benetou.fr/Wiki/LearningSearch#ProblematicQueries">ProblematicQueries</a></li>`
			`<li>make it more generalist via e.g. <a href="http://www.onelook.com/reverse-dictionary.shtml">reverse dictionary</a></li>`
			`<li>better split documents, e.g. logs per day</li>`
			`<li>handle the histories (wiki and repository diffs)</li>`
			`<li>make URLs work, consider specific stemming</li>`
			`<li>improving weightning and thus ranking</li>`
			`<li>properly handle remote URLs</li>`
			`<li>leverage .pageindex of each wiki</li>`
			`<li>work on snippets</li>`
			`<li>replace WikiName: by their icon, cf localmap</li>`
			`<li>add pagerank and other topology metrics as attributes</li>`
			`<li>use delta or RT indexing, requires better ID generation first though</li>`
			`<li>add visualization, e.g. network or linked document with the same color</li>`
			`<li>provide links to specific interesting queries, e.g. only this week</li>`
			`<ul>`
			`<li><a href="/search/%22internet%20brain%22~10">"internet brain"~10</a></li>`
			`</ul>`
			`<li><a href="http://fabien.benetou.fr/Wiki/LearningSearch">LearningSearch</a></li>`
			`<li>integrate <a href="http://fabien.benetou.fr/MemoryRecalls/ImprovingPIM#SocialPIM">Social PIM</a>, especially if there are 0 results</li>`
			`<li>integrate external search, e.g. Seeks, especially if there are 0 results including through Social PIM</li>`
			`</ul>`
			`</ul>`
			`</div>`
			`<div style="{width:500px; position:absolute; left:0px; background-color:#F8F0FF; margin:10px; border:2px solid #ddf; padding:5px;}">`
			`<?php`

			`# Overall this is very specialized and should be configurable to be used on other PIMs`

			`# this should be tested first and if not present, explain in a line where to download and how to install Sphinx API`
			`require ( "sphinxapi.php" );`
			`require ( "sphinx_doc_ids.php" );`

			`$index = "pmwikis";`
			`$words = $_GET["query"];`
			`if ( isset ($_GET["startingitem"]) )`
			`$startingitem = $_GET["startingitem"];`
			`else`
			`$startingitem = 0;`

			`$image_path = "/devpim/pub/";`
clarify icons function 14 years ago			`$copy_img = "<img src=\"".$image_path."clipboard_add.png"."\" alt=\"copy the page name to the clipboard\"/>";`
			`$star_img = "<img src=\"".$image_path."yellow-star.gif"."\" alt=\"mark that result as significant\"/>";`
			`// this could use some Javascript effect`
			`$alt_img = "<img src=\"".$image_path."server.png"."\" alt=\"use the alternate server (local or remote)\"/>";`
			`$bug_img = "<img src=\"".$image_path."bug.gif"."\" alt=\"mark that query as problematic\"/>";`
Sphinx search interface 14 years ago
			`if ($words == ""){`
			`print "You need to make an actual search, use ?query=keyword<br/>";`
			`print "<form action=\"/devpim/pub/search.php\" method=\"get\"><input type=\"text\" name=\"query\"><input type=\"submit\" value=\"search\"/></form><br/>";`
			`return;`
			`}`
			`if ($startingitem == "" \|\| $startingitem < 0){`
			`$startingitem=0;`
			`}`

			`$itemspan=20;`

			`$cl = new SphinxClient ();`
			`$cl->SetLimits($startingitem, $startingitem+$itemspan);`
			`$cl->SetMatchMode(SPH_MATCH_EXTENDED2);`
			`$cl->SetRankingMode(SPH_RANK_PROXIMITY_BM25);`
			`# consider other ranking e.g. pagerank, weighted with freshness and hybrid`
			`$cl->SetSortMode(SPH_SORT_RELEVANCE);`
			`$res = $cl->Query($words,$index);`
			`#cf http://sphinxsearch.com/docs/manual-2.0.1.html#api-funcgroup-querying`
			`#$res = $cl->BuildExcerpts ( $docs, $index, $words, $opts );`
			`if ( !$res ) {`
			`die ( "ERROR: " . $cl->GetLastError() . ".\n" );`
			`} else {`
			`// XXX should test for empty result before doing the assigment`
			`// if (0 results) { $socialsearch = shell_exec("pmwiki_social_search "$keyword"); }`
			`// ideally this would be done ascynhroneously as it does take some time (few seconds!)`
			`// yet still always offer it as an option (since it's "costly")`
			`#var_dump($res);`
			`$IDs = array_keys($res["matches"]);`
clarify icons function 14 years ago			`// note that $IDs = array_unique(array_keys($res["matches"])); does not solve the multiple page issue`
			`// thus probably comes from a duplicate indexing`
Sphinx search interface 14 years ago			`print "<h2>Query<a href=\"?bug&keyword=$words\">$bug_img</a></h2>";`
clarify icons function 14 years ago			`if (isset( $res["words"]["$words"]["hits"]) )`
			`$hits = $res["words"]["$words"]["hits"]." time(s) ";`
			`else`
			`$hits = "";`
			`print "\"$words\" found ".$hits." in ".$res["total_found"]." document(s):<br/>";`
Sphinx search interface 14 years ago			`print "<ul>";`
			`foreach ($IDs as $i)`
			`{`
			`#BuildExcerpts()`
			`#consider here adding snippet, would require few disk access, MySQL storage could be faster...`
			`print "<li>";`
			`$file=$idx["$i"];`
			`if (preg_match('\|/home/fabien/irclogs/\|',$file)){`
			`$target = preg_replace('\|/home/fabien/irclogs/(.*)\|','$1',$file);`
			`$url = urlencode($target);`
			`print "<a href=\"https://cloud.benetou.fr/discussions/$url\">IRClogs:$target</a>";`
			`}`
			`elseif (preg_match('\|/home/fabien/www/mirrors/\|',$file)){`
			`$page = preg_replace('\|/home/fabien/www/mirrors/(.)/wiki.d/(.)\.(.*)\|','$2/$3',$file);`
			`$source = preg_replace('\|/home/fabien/www/mirrors/(.)/wiki.d/(.)\.(.*)\|','$1',$file);`
			`switch ( $source ) {`
			`case "saint-maur": $url="http://saint-maur.benetou.fr"; break;`
			`case "fabien": $url="http://fabien.benetou.fr"; break;`
			`case "pim": $url="http://www.ourp.im/"; break;`
			`case "agiwiki": $url="http://www.agi-wiki.org/"; break;`
			`case "wiki": $url="http://fabien.benetou.fr/innovativ.it/www/HistoricalArchives/Seedea"; break;`
			`default: $url=$source;`
			`}`
			`print "<a href=\"$url/$page\">$source:$page</a>";`
			`print preg_replace('\|/home/fabien/www/mirrors/(.)/wiki.d/(.)\.(.*)\|','<a href="http://self/mirrors/$1/$2/$3">'.$alt_img.'</a>',$file);`
			`print "<a href=\"javascript:[[$page]]\">$copy_img</a>";`
			`} elseif (preg_match('\|/home/fabien/repository/\|',$file)){`
			`print preg_replace('\|/home/fabien/repository/(.*)\|','<a href="http://fabien.benetou.fr/repository/?p=.git;a=blob;f=$1">Repository:$1</a>',$file);`
			`} else {`
			`print preg_replace('\|/home/fabien/www/(.)/wiki.d/(.)\.(.*)\|','<a href="http://self/$1/$2/$3">$1:$2/$3</a>',$file);`
			`print preg_replace('\|/home/fabien/www/(.)/wiki.d/(.)\.(.*)\|','<a href="https://cloud.benetou.fr/backups/wiki/$2/$3">'.$alt_img.'</a>',$file);`
			`}`
			`print "<a href=\"?star&keyword=$words&result=$file\">$star_img</a>";`
			`print "</li>";`
			`}`
			`$previousitems=$startingitem-$itemspan;`
			`$nextitems=$startingitem+$itemspan;`
			`print "</ul>";`
			`print "<div>";`
			`if ($previousitems >= 0)`
			`print "<a href=\"/search/$words/$previousitems\">previous items</a>";`
			`for ($p=1;$p<$res["total"]/$itemspan;$p++)`
			`print " <a href=\"/search/$words/".($p-1)*$itemspan."\">$p</a>/";`
			`if ($nextitems < $res["total"])`
			`print "<a href=\"/search/$words/$nextitems\">next items</a>";`
			`print "</div>";`

			`$indexage = "unknown";`
			`$targetindexfile = "/var/lib/sphinxsearch/data/pmwikis.spd";`
			`$updatedindex = filemtime($targetindexfile);`
			`$indexage = date("c",$updatedindex);`
			`print "<hr/><center>$indexage index<br/>(if bug check indexer errors).</center>";`
			`}`

			`?>`
			`</div>`