misc/pim_search.php

<html>
<div style="width:400px; position:absolute; right:0px; background-color:#F0F8FF; margin:10px; border:2px solid #ddf; padding:5px;">
<h2>To do</h2>
<ul>
	<li>update <a href="http://fabien.benetou.fr/Tools/Sphinxsearch">Tools/Sphinxsearch</a> and <a href="http://fabien.benetou.fr/repository/?p=.git;a=blob;f=pim_search.php">the public repository source</a></li>
	<li>List the non-working queries and debug them (use = keyword_to_test for comparison)
	<ul>
		<li><a href="/search/full text search">full text search</a> should return <a href="http://fabien.benetou.fr/Events/RailsCampParis3#FullTextSearch">#FullTextSearch</a></li>
		<li><a href="/search/facebook">facebook</a></li>
		<li><a href="/search/google">google</a></li>
		<li><a href="/search/apple">apple</a></li>
		<li><a href="/search/Brooks">Brooks</a></li>
		<li>... (cf log analysis "?bug&amp;keyword=problematickeyword")</li>
	</ul>
	</li>
	<li>see also <a href="http://fabien.benetou.fr/Wiki/LearningSearch#ProblematicQueries">ProblematicQueries</a></li>
	<li>make it more generalist via e.g. <a href="http://www.onelook.com/reverse-dictionary.shtml">reverse dictionary</a></li>
	<li>better split documents, e.g. logs per day</li>
	<li>handle the histories (wiki and repository diffs)</li>
	<li>make URLs work, consider specific stemming</li>
	<li>improve weighting and thus ranking</li>
	<li>properly handle remote URLs</li>
	<li>leverage .pageindex of each wiki</li>
	<li>work on snippets</li>
	<li>replace WikiName: by their icon, cf localmap</li>
	<li>add pagerank and other topology metrics as attributes</li>
	<li>use delta or RT indexing, requires better ID generation first though</li>
	<li>add visualization, e.g. network or linked document with the same color</li>
	<li>provide links to specific interesting queries, e.g. only this week
	<ul>
		<li><a href="/search/%22internet%20brain%22~10">"internet brain"~10</a></li>
	</ul>
	</li>
	<li><a href="http://fabien.benetou.fr/Wiki/LearningSearch">LearningSearch</a></li>
	<li>integrate <a href="http://fabien.benetou.fr/MemoryRecalls/ImprovingPIM#SocialPIM">Social PIM</a>, especially if there are 0 results</li>
	<li>integrate external search, e.g. Seeks, especially if there are 0 results including through Social PIM</li>
</ul>
</div>
<div style="width:500px; position:absolute; left:0px; background-color:#F8F0FF; margin:10px; border:2px solid #ddf; padding:5px;">
<?php

# Overall this is very specialized and should be configurable to be used on other PIMs

# this should be tested first and if not present, explain in a line where to download and how to install Sphinx API
require ( "sphinxapi.php" );
require ( "sphinx_doc_ids.php" );

$index = "pmwikis";
$words = $_GET["query"];
if ( isset ($_GET["startingitem"]) )
	$startingitem = $_GET["startingitem"];
else
	$startingitem = 0;

$image_path = "/devpim/pub/";
$copy_img = "<img src=\"".$image_path."clipboard_add.png"."\" alt=\"copy the page name to the clipboard\"/>";
$star_img = "<img src=\"".$image_path."yellow-star.gif"."\" alt=\"mark that result as significant\"/>";
// this could use some Javascript effect
$alt_img = "<img src=\"".$image_path."server.png"."\" alt=\"use the alternate server (local or remote)\"/>";
$bug_img = "<img src=\"".$image_path."bug.gif"."\" alt=\"mark that query as problematic\"/>";

if ($words == ""){
	print "You need to make an actual search, use ?query=keyword<br/>";
	print "<form action=\"/devpim/pub/search.php\" method=\"get\"><input type=\"text\" name=\"query\"><input type=\"submit\" value=\"search\"/></form><br/>";
	return;
}
if ($startingitem  == "" || $startingitem < 0){
	$startingitem=0;
}

$itemspan=20;

$cl = new SphinxClient ();
$cl->SetLimits($startingitem, $startingitem+$itemspan);
$cl->SetMatchMode(SPH_MATCH_EXTENDED2);
$cl->SetRankingMode(SPH_RANK_PROXIMITY_BM25);
# consider other ranking e.g. pagerank, weighted with freshness and hybrid
$cl->SetSortMode(SPH_SORT_RELEVANCE);
$res = $cl->Query($words,$index);
#cf http://sphinxsearch.com/docs/manual-2.0.1.html#api-funcgroup-querying
#$res = $cl->BuildExcerpts ( $docs, $index, $words, $opts );
if ( !$res ) {
	die ( "ERROR: " . $cl->GetLastError() . ".\n" );
} else {
	// XXX should test for empty result before doing the assigment
	// if (0 results) { $socialsearch = shell_exec("pmwiki_social_search "$keyword"); }
	// ideally this would be done ascynhroneously as it does take some time (few seconds!)
	// yet still always offer it as an option (since it's "costly")
	#var_dump($res);
	$IDs = array_keys($res["matches"]);
	// note that $IDs = array_unique(array_keys($res["matches"])); does not solve the multiple page issue
	// thus probably comes from a duplicate indexing
	print "<h2>Query<a href=\"?bug&keyword=$words\">$bug_img</a></h2>";
	if (isset( $res["words"]["$words"]["hits"])  )
		$hits = $res["words"]["$words"]["hits"]." time(s) ";
	else
		$hits = "";
	print "\"$words\" found ".$hits." in ".$res["total_found"]." document(s):<br/>";
	print "<ul>";
	foreach ($IDs as $i)
	{
		#BuildExcerpts()
		#consider here adding snippet, would require few disk access, MySQL storage could be faster...
		print "<li>";
		$file=$idx["$i"];
		if (preg_match('|/home/fabien/irclogs/|',$file)){
			$target = preg_replace('|/home/fabien/irclogs/(.*)|','$1',$file);
			$url = urlencode($target);
			print "<a href=\"https://cloud.benetou.fr/discussions/$url\">IRClogs:$target</a>";
		}
		elseif (preg_match('|/home/fabien/www/mirrors/|',$file)){
			$page = preg_replace('|/home/fabien/www/mirrors/(.*)/wiki.d/(.*)\.(.*)|','$2/$3',$file);
			$source = preg_replace('|/home/fabien/www/mirrors/(.*)/wiki.d/(.*)\.(.*)|','$1',$file);
			switch ( $source ) {
				case "saint-maur": $url="http://saint-maur.benetou.fr"; break;
				case "fabien": $url="http://fabien.benetou.fr"; break;
				case "pim": $url="http://www.ourp.im/"; break;
				case "agiwiki": $url="http://www.agi-wiki.org/"; break;
				case "wiki": $url="http://fabien.benetou.fr/innovativ.it/www/HistoricalArchives/Seedea"; break;
				default: $url=$source;
			}
			print "<a href=\"$url/$page\">$source:$page</a>";
			print preg_replace('|/home/fabien/www/mirrors/(.*)/wiki.d/(.*)\.(.*)|','<a href="http://self/mirrors/$1/$2/$3">'.$alt_img.'</a>',$file);
			print "<a href=\"javascript:[[$page]]\">$copy_img</a>";
		} elseif (preg_match('|/home/fabien/repository/|',$file)){
			print preg_replace('|/home/fabien/repository/(.*)|','<a href="http://fabien.benetou.fr/repository/?p=.git;a=blob;f=$1">Repository:$1</a>',$file);
		} else {
			print preg_replace('|/home/fabien/www/(.*)/wiki.d/(.*)\.(.*)|','<a href="http://self/$1/$2/$3">$1:$2/$3</a>',$file);
			print preg_replace('|/home/fabien/www/(.*)/wiki.d/(.*)\.(.*)|','<a href="https://cloud.benetou.fr/backups/wiki/$2/$3">'.$alt_img.'</a>',$file);
		}
		print "<a href=\"?star&keyword=$words&result=$file\">$star_img</a>";
		print "</li>";
	}
	$previousitems=$startingitem-$itemspan;
	$nextitems=$startingitem+$itemspan;
	print "</ul>";
	print "<div>";
	if ($previousitems >= 0)
		print "<a href=\"/search/$words/$previousitems\">previous items</a>";
	for ($p=1;$p<$res["total"]/$itemspan;$p++)
		print " <a href=\"/search/$words/".($p-1)*$itemspan."\">$p</a>/";
	if ($nextitems < $res["total"])
		print "<a href=\"/search/$words/$nextitems\">next items</a>";
	print "</div>";

	$indexage = "unknown";
	$targetindexfile = "/var/lib/sphinxsearch/data/pmwikis.spd";
	$updatedindex = filemtime($targetindexfile);
	$indexage = date("c",$updatedindex);
	print "<hr/><center>$indexage index<br/>(if bug check indexer errors).</center>";
}

?>
</div>