You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
misc/pim_search.php

154 lines
7.4 KiB

<html>
<div style="width:400px; position:absolute; right:0px; background-color:#F0F8FF; margin:10px; border:2px solid #ddf; padding:5px;">
<h2>To do</h2>
<ul>
<li>update <a href="http://fabien.benetou.fr/Tools/Sphinxsearch">Tools/Sphinxsearch</a> and <a href="http://fabien.benetou.fr/repository/?p=.git;a=blob;f=pim_search.php">the public repository source</a></li>
<li>List the non working queries and debug them (use = keyword_to_test for comparison)</li>
<ul>
<li><a href="/search/full text search">full text search</a> should return <a href="http://fabien.benetou.fr/Events/RailsCampParis3#FullTextSearch">#FullTextSearch</a></li>
<li><a href="/search/facebook">facebook</a></li>
<li><a href="/search/google">google</a></li>
<li><a href="/search/apple">apple</a></li>
<li><a href="/search/Brooks">Brooks</a></li>
<li>... (cf log analysis "?bug&amp;keyword=problematickeyword")</li>
</ul>
<li>see also <a href="http://fabien.benetou.fr/Wiki/LearningSearch#ProblematicQueries">ProblematicQueries</a></li>
<li>make it more generalist via e.g. <a href="http://www.onelook.com/reverse-dictionary.shtml">reverse dictionary</a></li>
<li>better split documents, e.g. logs per day</li>
<li>handle the histories (wiki and repository diffs)</li>
<li>make URLs work, consider specific stemming</li>
<li>improve weighting and thus ranking</li>
<li>properly handle remote URLs</li>
<li>leverage .pageindex of each wiki</li>
<li>work on snippets</li>
<li>replace WikiName: by their icon, cf localmap</li>
<li>add pagerank and other topology metrics as attributes</li>
<li>use delta or RT indexing, requires better ID generation first though</li>
<li>add visualization, e.g. network or linked document with the same color</li>
<li>provide links to specific interesting queries, e.g. only this week</li>
<ul>
<li><a href="/search/%22internet%20brain%22~10">"internet brain"~10</a></li>
</ul>
<li><a href="http://fabien.benetou.fr/Wiki/LearningSearch">LearningSearch</a></li>
<li>integrate <a href="http://fabien.benetou.fr/MemoryRecalls/ImprovingPIM#SocialPIM">Social PIM</a>, especially if there are 0 results</li>
<li>integrate external search, e.g. Seeks, especially if there are 0 results including through Social PIM</li>
</ul>
</div>
<div style="width:500px; position:absolute; left:0px; background-color:#F8F0FF; margin:10px; border:2px solid #ddf; padding:5px;">
<?php
# Search front-end, part 1: read the query and the paging offset from the URL,
# prepare the icons attached to each result and run the query against the
# Sphinx index. The rendering code further down relies on $words,
# $startingitem, $itemspan, $cl, $res and the *_img variables set here.
#
# Overall this is very specialized and should be configurable to be used on other PIMs
# this should be tested first and if not present, explain in a line where to download and how to install Sphinx API
require ( "sphinxapi.php" );
require ( "sphinx_doc_ids.php" );

$index = "pmwikis";

# A missing (or non-string, e.g. ?query[]=x) parameter is treated as an empty
# search instead of raising an undefined-index notice.
$words = "";
if ( isset ($_GET["query"]) && is_string($_GET["query"]) )
    $words = $_GET["query"];

# SetLimits() works on integers while $_GET only provides strings, so the
# offset is converted once here; anything non-numeric or negative becomes 0.
$startingitem = 0;
if ( isset ($_GET["startingitem"]) && is_numeric($_GET["startingitem"]) )
    $startingitem = max(0, (int) $_GET["startingitem"]);

# Icons reused for every result entry.
$image_path = "/devpim/pub/";
$copy_img = "<img src=\"".$image_path."clipboard_add.png"."\" alt=\"copy the page name to the clipboard\"/>";
$star_img = "<img src=\"".$image_path."yellow-star.gif"."\" alt=\"mark that result as significant\"/>";
// this could use some Javascript effect
$alt_img = "<img src=\"".$image_path."server.png"."\" alt=\"use the alternate server (local or remote)\"/>";
$bug_img = "<img src=\"".$image_path."bug.gif"."\" alt=\"mark that query as problematic\"/>";

# Without a query there is nothing to search: show a usage hint with a form and stop.
if ($words == ""){
    print "You need to make an actual search, use ?query=keyword<br/>";
    print "<form action=\"/devpim/pub/search.php\" method=\"get\"><input type=\"text\" name=\"query\"><input type=\"submit\" value=\"search\"/></form><br/>";
    return;
}

# Number of results per page.
$itemspan=20;

$cl = new SphinxClient ();
# The second argument of SetLimits() is the number of matches to return, not
# the end offset: passing $startingitem+$itemspan made every page after the
# first one return more than $itemspan results.
$cl->SetLimits($startingitem, $itemspan);
$cl->SetMatchMode(SPH_MATCH_EXTENDED2);
$cl->SetRankingMode(SPH_RANK_PROXIMITY_BM25);
# consider other ranking e.g. pagerank, weighted with freshness and hybrid
$cl->SetSortMode(SPH_SORT_RELEVANCE);
$res = $cl->Query($words,$index);
#cf http://sphinxsearch.com/docs/manual-2.0.1.html#api-funcgroup-querying
#$res = $cl->BuildExcerpts ( $docs, $index, $words, $opts );
# Search front-end, part 2: render the outcome of the Sphinx query. On failure
# the Sphinx error is printed and the script stops; otherwise the hit summary,
# one linked entry per matching document, the pagination links and the age of
# the index file are printed.
# Everything coming from the request ($words) or from the indexed paths is
# escaped before being written into HTML or URLs.
if ( !$res ) {
    # escaped in case the error text contains markup characters
    die ( "ERROR: " . htmlspecialchars($cl->GetLastError(), ENT_QUOTES) . ".\n" );
} else {
    // if (0 results) { $socialsearch = shell_exec("pmwiki_social_search "$keyword"); }
    // ideally this would be done asynchronously as it does take some time (few seconds!)
    // yet still always offer it as an option (since it's "costly")
    // NOTE(review): if this is ever enabled, pass the keyword through escapeshellarg()
    #var_dump($res);

    # The query in its two output forms: HTML text and URL component.
    $words_html = htmlspecialchars($words, ENT_QUOTES);
    $words_url = rawurlencode($words);
    # Offset as an integer, whatever type the earlier code left it in.
    $offset = (int) $startingitem;

    # "matches" may be absent from the result when nothing was found, so fall
    # back to an empty list instead of calling array_keys() on a missing key.
    $IDs = isset($res["matches"]) ? array_keys($res["matches"]) : array();
    // note that $IDs = array_unique(array_keys($res["matches"])); does not solve the multiple page issue
    // thus probably comes from a duplicate indexing

    print "<h2>Query<a href=\"?bug&amp;keyword=$words_url\">$bug_img</a></h2>";
    if (isset( $res["words"][$words]["hits"]) )
        $hits = $res["words"][$words]["hits"]." time(s) ";
    else
        $hits = "";
    print "\"$words_html\" found ".$hits." in ".$res["total_found"]." document(s):<br/>";

    print "<ul>";
    foreach ($IDs as $i)
    {
        #BuildExcerpts()
        #consider here adding snippet, would require few disk access, MySQL storage could be faster...

        # $idx maps a document ID to the indexed file path (presumably defined
        # by sphinx_doc_ids.php -- confirm); IDs without a path are skipped.
        if (!isset($idx[$i]))
            continue;
        $file = $idx[$i];

        print "<li>";
        if (preg_match('|/home/fabien/irclogs/(.*)|', $file, $m)){
            # IRC logs
            $target = $m[1];
            $url = urlencode($target);
            print "<a href=\"https://cloud.benetou.fr/discussions/$url\">IRClogs:".htmlspecialchars($target, ENT_QUOTES)."</a>";
        }
        elseif (preg_match('|/home/fabien/www/mirrors/(.*)/wiki.d/(.*)\.(.*)|', $file, $m)){
            # mirrored wikis: <mirror>/wiki.d/<Group>.<Name> becomes <Group>/<Name>
            $source = $m[1];
            $page = $m[2]."/".$m[3];
            switch ( $source ) {
                case "saint-maur": $url="http://saint-maur.benetou.fr"; break;
                case "fabien": $url="http://fabien.benetou.fr"; break;
                case "pim": $url="http://www.ourp.im/"; break;
                case "agiwiki": $url="http://www.agi-wiki.org/"; break;
                case "wiki": $url="http://fabien.benetou.fr/innovativ.it/www/HistoricalArchives/Seedea"; break;
                default: $url=$source;
            }
            $source_html = htmlspecialchars($source, ENT_QUOTES);
            $page_html = htmlspecialchars($page, ENT_QUOTES);
            # rtrim() avoids a double slash for the base URLs that already end with "/"
            $link_html = htmlspecialchars(rtrim($url, "/")."/".$page, ENT_QUOTES);
            print "<a href=\"$link_html\">$source_html:$page_html</a>";
            print "<a href=\"http://self/mirrors/$source_html/$page_html\">$alt_img</a>";
            print "<a href=\"javascript:[[$page_html]]\">$copy_img</a>";
        } elseif (preg_match('|/home/fabien/repository/(.*)|', $file, $m)){
            # files of the git repository
            $path_html = htmlspecialchars($m[1], ENT_QUOTES);
            print "<a href=\"http://fabien.benetou.fr/repository/?p=.git;a=blob;f=$path_html\">Repository:$path_html</a>";
        } elseif (preg_match('|/home/fabien/www/(.*)/wiki.d/(.*)\.(.*)|', $file, $m)){
            # local wikis, with the backup copy as alternate server
            $wiki_html = htmlspecialchars($m[1], ENT_QUOTES);
            $page_html = htmlspecialchars($m[2]."/".$m[3], ENT_QUOTES);
            print "<a href=\"http://self/$wiki_html/$page_html\">$wiki_html:$page_html</a>";
            print "<a href=\"https://cloud.benetou.fr/backups/wiki/$page_html\">$alt_img</a>";
        } else {
            # unknown location: show the bare path once rather than a broken link
            print htmlspecialchars($file, ENT_QUOTES);
        }
        print "<a href=\"?star&amp;keyword=$words_url&amp;result=".rawurlencode($file)."\">$star_img</a>";
        print "</li>";
    }
    print "</ul>";

    # Pagination: previous / numbered pages / next.
    $previousitems = max(0, $offset - $itemspan);
    $nextitems = $offset + $itemspan;
    print "<div>";
    # any positive offset has earlier results, even when it is not a multiple of the page size
    if ($offset > 0)
        print "<a href=\"/search/$words_url/$previousitems\">previous items</a>";
    # ceil() so that a partially filled last page also gets its link
    $pages = (int) ceil($res["total"] / $itemspan);
    for ($p = 1; $p <= $pages; $p++)
        print " <a href=\"/search/$words_url/".($p-1)*$itemspan."\">$p</a>/";
    if ($nextitems < $res["total"])
        print "<a href=\"/search/$words_url/$nextitems\">next items</a>";
    print "</div>";

    # Age of the index: stays "unknown" when the index file cannot be read,
    # instead of showing the Unix epoch.
    $indexage = "unknown";
    $targetindexfile = "/var/lib/sphinxsearch/data/pmwikis.spd";
    $updatedindex = is_file($targetindexfile) ? filemtime($targetindexfile) : false;
    if ($updatedindex !== false)
        $indexage = date("c",$updatedindex);
    print "<hr/><center>$indexage index<br/>(if bug check indexer errors).</center>";
}
?>
</div>