PHP Classes

File: bot_recognizer.php

Recommend this page to a friend!
  Classes of Alexander Selifonov   Bot recognizer and dispatcher   bot_recognizer.php   Download  
File: bot_recognizer.php
Role: Class source
Content type: text/plain
Description: Main class module
Class: Bot recognizer and dispatcher
Recognize and handle Web robot by IP or user agent
Author: By Alexander Selifonov
Last change: fixes
Date: 14 years ago
Size: 19,781 bytes
 

Contents

Class file image Download
<?php
/**
 * @name bot_recognizer.php :
 * Class for defining if request comes from search/indexing bot
 * and performing some action depending on bot name.
 * @Author Alexander Selifonov <alex (at) selifan [dot] ru>
 * @Copyright 2009 Alexander Selifonov
 * @link http://www.selifan.ru
 * @link http://www.phpclasses.org/browse/author/267915.html
 * @Version 1.00.001
 * @license http://www.opensource.org/licenses/bsd-license.php
 * PHP required version : 5.x
 * Last modified: 28.08.2009
 */
class CBotRecognizer {

    const VERSION = '1.00';
    const SEARCH_IP_ONLY     = 0;
    const SEARCH_IP_OR_AGENT = 1; # default search mode - both IP and user agent used
    const SEARCH_AGENT_ONLY  = 2;
    const UNDEFINED_BOT = -1;
    const MALICIOUS_BOT = -100;

    private $table_prefix = 'botrec_';
    private $searchmode = 1; # search method (one of SEARCH_* constants)
    private $_callbacks = array();      # handlers keyed by bot id
    private $_type_callbacks = array(); # handlers keyed by bot type
    private $_dbobject = false;
    private $_dbobjclass = '';
    private $_botdata = array(); # if engine is 'file', bot data will be loaded into this array
    private $_botdefs_file = '';
    private $_verbose = false;
    private $_debugip;
    private $_debugagent;
    private $_result = null; # cached recognition result, null means "not recognized yet"
    private $addedcnt = 0;
    private $errormessage = '';
    private $botwords = array('bot','crawl','spider'); # words for identifying bot by UA substring
    private $bot_malicious = false; # becomes true if found bot is checked as malicious in our database
    private $bottype = 0; # what kind is this bot of (0-unspecified, 1-indexing, 2-email harvesting, etc...)
    private $malicious_handler = '';
    private $register_suspects = false; # for future functionality
    private $_worktime = array(); # time interval when dispatching is active, example : array('03:00', '05:00')

    /**
     * Creates the recognizer.
     *
     * Recognized keys of $params: 'tableprefix', 'dbobject', 'searchmode',
     * 'verbose', 'sourcefile', 'worktime' (string delimited by "-", "," or ";",
     * or an array). When no DB object is passed and the definitions file
     * exists, the definitions are loaded into memory.
     *
     * @param array $params options
     */
    public function __construct($params = array()) {
        $this->_botdefs_file = dirname(__FILE__) . '/bot-defs.txt';
        if (is_array($params)) {
            # read 'verbose' first so that the debug output below can actually fire
            if (isset($params['verbose'])) {
                $this->_verbose = $params['verbose'];
            }
            if (isset($params['tableprefix'])) {
                $this->table_prefix = $params['tableprefix'];
            }
            if (isset($params['dbobject']) && is_object($params['dbobject'])) {
                $this->_dbobject = $params['dbobject'];
                $this->_dbobjclass = strtolower(get_class($this->_dbobject));
                if ($this->_verbose) {
                    echo "passed to Constructor db object " . get_class($this->_dbobject) . '<br />';
                }
            }
            if (isset($params['searchmode'])) {
                $this->searchmode = $params['searchmode'];
            }
            if (!empty($params['sourcefile'])) {
                $this->_botdefs_file = $params['sourcefile'];
            }
            if (isset($params['worktime'])) {
                if (is_string($params['worktime'])) {
                    # split() was removed in PHP 7, preg_split() is the replacement
                    $this->_worktime = array_map('trim', preg_split('/[-,;]/', $params['worktime']));
                } elseif (is_array($params['worktime'])) {
                    $this->_worktime = $params['worktime'];
                } else {
                    $this->_worktime = array();
                }
            }
        }
        if (!is_object($this->_dbobject) && file_exists($this->_botdefs_file)) {
            $this->LoadBotDefinitionsFile();
        }
    }

    /**
     * Legacy (PHP 4 style) constructor name, kept so that existing code calling
     * it explicitly keeps working. It simply forwards to __construct().
     *
     * @param array $params see __construct()
     */
    public function CBotRecognizer($params = array()) {
        $this->__construct($params);
    }

    /**
     * sets IP and/or user agent string for emulating specific bot
     *
     * @param mixed $ip IP-address
     * @param mixed $agent User-Agent string
     */
    public function EmulateBot($ip = '', $agent = '') {
        $this->_debugip = $ip;
        $this->_debugagent = $agent;
        $this->_result = null;
    }

    /**
     * Sets search mode (one of SEARCH_* constants) and drops cached result.
     *
     * @param int $mode
     */
    public function SetSearchMode($mode) {
        $this->searchmode = $mode;
        $this->_result = null; # retry recognition
    }

    /**
     * Tries to recognize search/spider bot.
     *
     * When $ua or $ip is passed, both replace the emulated values and the
     * recognition is performed again; otherwise the cached result is returned
     * if there is one.
     *
     * @param string $ua User-Agent string to test (optional)
     * @param string $ip IP address to test (optional)
     * @return mixed bot name, UNDEFINED_BOT (unknown bot) or false (not a bot)
     */
    public function GetBotId($ua = '', $ip = '') {
        if (!empty($ua) || !empty($ip)) {
            $this->_result = null; # explicit arguments force a new recognition
            $this->_debugagent = $ua;
            $this->_debugip = $ip;
        }
        if ($this->_result !== null) {
            return $this->_result;
        }

        # start from clean state, otherwise attributes of previously found bot would survive
        $this->bottype = 0;
        $this->bot_malicious = false;
        $retcode = false;

        if ($this->_debugip) {
            $ipaddr = $this->_debugip;
        } else {
            $ipaddr = isset($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '';
        }
        if ($this->_debugagent) {
            $usragent = $this->_debugagent;
        } else {
            $usragent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
        }
        if (empty($ipaddr)) {
            return ($this->_result = false); # running from CRON or what else, not from client HTTP request
        }

        if ($this->_verbose) {
            $dbclass = is_object($this->_dbobject) ? get_class($this->_dbobject) : '';
            echo "-- CBotRecognizer::GetBotId/KT1 for ip=$ipaddr, user-agent: $usragent, dbobject : ", $dbclass, "<br />";
        }

        # get integer IP addr representation from "xxx.xxx.xxx.xxx"
        $ip_x32 = self::FromIpToX32($ipaddr);
        # an address that could not be converted (IPv6, garbage) gives 0; it must not be
        # matched against ranges, because agent-only definitions are stored with range 0..0
        $has_ip = ($ip_x32 > 0);
        $use_ip = ($this->searchmode <= self::SEARCH_IP_OR_AGENT);
        $use_agent = ($this->searchmode >= self::SEARCH_IP_OR_AGENT);

        if (is_object($this->_dbobject)) { #<3-engine>
            # using SQL engine:
            if ($this->_verbose) {
                echo "searching in DB, by {$this->_dbobjclass}...<br />"; # debug printing
            }
            $cond = array();
            if ($use_ip && $has_ip) {
                $cond[] = "(($ip_x32) BETWEEN ipfrom AND ipto)";
            }
            if ($use_agent && $usragent !== '') {
                # User-Agent comes from the client, so it is passed as an escaped literal
                $cond[] = "(useragent<>'' AND (INSTR(" . $this->QuoteSql($usragent) . ", useragent)>0))";
            }
            if (count($cond)) {
                $strcond = implode(' OR ', $cond);
                $query = "SELECT botid,bottype,malicious FROM {$this->table_prefix}bot_definitions WHERE $strcond LIMIT 1";
                if ($this->_dbobjclass == 'cdbengine') {
                    $result = $this->_dbobject->sql_query($query, 1, 0, 0);
                    if (!empty($result[0])) {
                        $retcode = $result[0];
                        $this->bottype = $result[1];
                        $this->bot_malicious = $result[2];
                    }
                } else { # (substr($this->_dbobjclass,0,7) == 'zend_db')
                    $this->_dbobject->setFetchMode(Zend_Db::FETCH_OBJ);
                    # NOTE(review): second argument is kept as in the original code; in Zend_Db it
                    # looks like the "bind" parameter - confirm against the adapter in use
                    $result = $this->_dbobject->fetchRow($query, 2);
                    if (isset($result->botid)) {
                        $retcode = $result->botid;
                        $this->bottype = $result->bottype;
                        $this->bot_malicious = $result->malicious;
                    }
                }
            }
            if ($this->_verbose > 1) {
                echo "--GetBotId in SQL search result: $retcode, type:[{$this->bottype}], malicious: [{$this->bot_malicious}]<br />";
            }
        } #<3-engine>
        elseif (count($this->_botdata)) { #<3-engine>
            if ($this->_verbose > 1) {
                echo "using file engine<br />";
            }
            foreach ($this->_botdata as $dta) {
                $b_bot = false;
                if ($use_ip && $has_ip && isset($dta[1], $dta[2])) {
                    $b_bot = ($ip_x32 >= $dta[1]) && ($ip_x32 <= $dta[2]);
                }
                if (!$b_bot && $use_agent && isset($dta[3]) && $dta[3] != '') {
                    $b_bot = (stripos($usragent, $dta[3]) !== false);
                }
                if ($b_bot) {
                    $retcode = $dta[0]; # short bot name
                    $this->bottype = isset($dta[4]) ? $dta[4] : 0; # bot type
                    $this->bot_malicious = isset($dta[5]) ? $dta[5] : 0; # malicious or not
                    break;
                }
            }
            if ($this->_verbose > 1) {
                echo "--GetBotId in filemode search result: ($retcode), type:[{$this->bottype}], malicious:[{$this->bot_malicious}]<br />";
            }
        } #<3-engine>

        # last resort: if one of special words found, return UNDEFINED_BOT (undefined bot):
        if (!$retcode) {
            foreach ($this->botwords as $botword) {
                if (stripos($usragent, $botword) !== false) {
                    $retcode = self::UNDEFINED_BOT;
                    break;
                }
            }
        }
        # remember the result, so IsMaliciousBot()/GetBotType()/Dispatch() don't repeat the search
        $this->_result = $retcode;
        return $retcode;
    }

    /**
     * (Re)creates SQL table(s) for bot definitions. Existing tables are dropped.
     *
     * @return bool false if no DB object was passed to the constructor, otherwise true
     */
    public function CreateBotDefTable() {
        if (!is_object($this->_dbobject)) {
            return false;
        }
        # IF EXISTS - otherwise the very first creation fails on the DROP statement
        $sqldrop = "DROP TABLE IF EXISTS {$this->table_prefix}bot_definitions";
        $sqlcreate = "CREATE TABLE {$this->table_prefix}bot_definitions (
            recid INT(20) NOT NULL AUTO_INCREMENT,
            botid CHAR(60) NOT NULL DEFAULT '',
            ipfrom INT UNSIGNED NOT NULL DEFAULT 0,
            ipto INT UNSIGNED NOT NULL DEFAULT 0,
            useragent CHAR(60) NOT NULL DEFAULT '',
            bottype INT(4) default 0,
            malicious INT(1) default 0,
            PRIMARY KEY(recid), KEY ix_ipfrom(ipfrom), KEY ix_ipto(ipto), KEY ix_useragent(useragent)
        )";
        $this->RunSql($sqldrop);
        $this->RunSql($sqlcreate);

        if ($this->register_suspects) {
            $sqldrop2 = "DROP TABLE IF EXISTS {$this->table_prefix}bot_logsuspect";
            $sqlcreate2 = "CREATE TABLE {$this->table_prefix}bot_logsuspect (
                recid INT(20) NOT NULL AUTO_INCREMENT,
                botid CHAR(60) NOT NULL DEFAULT '',
                ipaddr INT UNSIGNED NOT NULL DEFAULT 0,
                hitcounter INT(10) DEFAULT 0,
                logstart DATETIME not null DEFAULT 0,
                PRIMARY KEY(recid), KEY ix_ipaddr(ipaddr)
            )";
            $this->RunSql($sqldrop2);
            $this->RunSql($sqlcreate2);
        }
        if ($this->_verbose) {
            echo "CreateBotDefTable: table(s) for bots created<br />"; #debug
        }
        return true;
    }

    /**
     * loads bot definitions from delimited text file.
     * Every line is "botid|ip from|ip to|user agent[|bot type[|malicious]]";
     * lines having less than 4 fields are skipped.
     *
     * @param string $srcfile file name; if empty, the file set in constructor is used
     * @param integer|boolean $clearexisting clean existing data or not (default-not)
     * @return int|false count of loaded definitions, false if nothing could be read
     */
    public function LoadBotDefinitionsFile($srcfile = '', $clearexisting = false) {
        $mydir = dirname(__FILE__);
        if (empty($srcfile)) {
            $srcfile = $this->_botdefs_file;
        }
        if ($clearexisting) {
            if (is_object($this->_dbobject)) {
                $this->CreateBotDefTable();
            } else {
                $this->_botdata = array();
            }
        }
        $lines = array();
        if (file_exists($srcfile)) {
            $lines = @file($srcfile);
        } elseif (file_exists("$mydir/$srcfile")) {
            $lines = @file("$mydir/$srcfile"); # bot def's file may reside in this php class folder
        }
        # file() returns false on failure
        if (!is_array($lines) || count($lines) < 1) {
            return false;
        }
        $this->addedcnt = 0;
        foreach ($lines as $line) { #<2>
            $arr = explode('|', trim($line));
            if (count($arr) < 4) {
                continue;
            }
            $botid = trim($arr[0]);
            $ip1 = self::FromIpToX32(trim($arr[1]));
            $ip2 = self::FromIpToX32(trim($arr[2]));
            $agent = trim($arr[3]);
            $bottype = isset($arr[4]) ? intval($arr[4]) : 0;
            $mal = isset($arr[5]) ? intval($arr[5]) : 0;
            $this->AddBotDefinition($botid, $ip1, $ip2, $agent, $bottype, $mal);
        } #<2>
        if ($this->_verbose) {
            echo "LoadBotDefinitionsFile($srcfile) loaded definitions : {$this->addedcnt}<br />";
        }
        return $this->addedcnt;
    }

    /**
     * imports bot definitions from a text file or URL (iplists.com format).
     * Lines "# UA ..." give User-Agent strings, other "#" lines are comments,
     * lines starting with a number are IP addresses or short IP prefixes
     * ("74.6.7" means 74.6.7.0 - 74.6.7.255).
     * In DB mode old records of this bot id are deleted before adding new ones.
     *
     * @param mixed $bot_id bot identifier
     * @param mixed $url source text file name or url (in iplists.com format)
     * @param mixed $file_type reserved
     * @param int $bottype bot type to store with every definition
     * @param int $malicious "malicious" flag to store with every definition
     * @return int|false count of added definitions, false if source can't be opened
     */
    public function ImportBotsFromUrl($bot_id, $url, $file_type = 0, $bottype = 0, $malicious = 0) {
        $this->errormessage = '';
        $this->addedcnt = 0;
        $ipranges = $uas = array();

        # best effort only: the attempt is kept from the original code and may have no effect
        $canopen_url = ini_get('allow_url_fopen');
        if (!$canopen_url) {
            @ini_set('allow_url_fopen', true);
        }
        $fh = @fopen($url, 'r');
        if (!$fh) {
            $this->errormessage = 'Error opening URL or file : ' . $url;
            return false;
        }
        while (($line = fgets($fh)) !== false) {
            $line = trim($line);
            if (substr($line, 0, 1) == '#') { # comment or # "UA ..." - string with User Agent
                if (substr($line, 0, 5) == '# UA ') {
                    $uas[] = self::StrUndress(strtolower(substr($line, 5)));
                }
                continue;
            }
            if (!intval($line)) {
                continue; # not an IP line
            }
            $missing = 4 - count(explode('.', $line));
            if ($missing > 0) {
                # make full IP range : "74.6.7.0"-"74.6.7.255" from short ip like "74.6.7"
                $ipranges[] = array(
                    self::FromIpToX32($line . str_repeat('.0', $missing)),
                    self::FromIpToX32($line . str_repeat('.255', $missing))
                );
                continue;
            }
            # single address: try to find range that can be "widened" for this ip-addr
            $thisip = self::FromIpToX32($line);
            $widened = false;
            for ($k_ip = 0; $k_ip < count($ipranges); $k_ip++) {
                if ($ipranges[$k_ip][1] == $thisip - 1) {
                    $ipranges[$k_ip][1] += 1; # place this IP to found range
                    $widened = true;
                    break;
                }
            }
            if (!$widened) {
                $ipranges[] = array($thisip, $thisip);
            }
        }
        fclose($fh);
        if (!$canopen_url) {
            @ini_set('allow_url_fopen', $canopen_url); # return to "fopen-no-url" mode
        }

        # merge overlapped, adjacent and nested IP ranges. The list is sorted by range start,
        # so a range can only touch the last one of already merged ranges.
        sort($ipranges);
        $ip2 = array();
        foreach ($ipranges as $rng) {
            $from = $rng[0] + 0;
            $to = $rng[1] + 0;
            $last = count($ip2) - 1;
            if ($last >= 0 && $from <= $ip2[$last][1] + 1) {
                if ($to > $ip2[$last][1]) {
                    $ip2[$last][1] = $to;
                }
            } else {
                $ip2[] = array($from, $to);
            }
        }

        # in DB mode - clean from "old" records for this bot id before adding new list
        # (only when there is something to add instead of them)
        if (is_object($this->_dbobject) && $bot_id != '' && (count($ip2) > 0 || count($uas) > 0)) {
            $this->RunSql("DELETE FROM {$this->table_prefix}bot_definitions WHERE botid=" . $this->QuoteSql($bot_id));
        }
        $total = max(count($ip2), count($uas));
        for ($kk = 0; $kk < $total; $kk++) {
            $ipfrom = isset($ip2[$kk][0]) ? $ip2[$kk][0] : 0;
            $ipto = isset($ip2[$kk][1]) ? $ip2[$kk][1] : 0;
            $ua = isset($uas[$kk]) ? $uas[$kk] : '';
            $this->AddBotDefinition($bot_id, $ipfrom, $ipto, $ua, $bottype, $malicious);
        }
        if ($this->_verbose) { #debug:
            echo "<h4>$bot_id UA list from $url</h4>";
            foreach ($uas as $oneua) {
                echo "{$oneua}<br />";
            }
            echo "<h4>$bot_id IP list from $url</h4>";
            foreach ($ip2 as $ip) {
                $ip4 = self::FromX32ToIp($ip[0]);
                $ip4a = self::FromX32ToIp($ip[1]);
                echo "IP: $ip4 - $ip4a<br />";
            }
        }
        return $this->addedcnt;
    }

    /**
     * Adds bot definition into SQL table (DB mode) or into internal array.
     * Used internally when loading.
     * In in-memory mode $botid may be an array holding the whole definition:
     * array(botid, ipfrom, ipto, useragent [, bottype [, malicious]]).
     *
     * @param mixed $botid bot id (or the whole definition array)
     * @param int $ipfrom integer representation of "starting" IP address
     * @param int $ipto integer representation of "ending" IP address, empty means "same as $ipfrom"
     * @param string $useragent User-Agent substring identifying the bot
     * @param int $bottype bot type
     * @param int $mal non-zero for malicious bot
     */
    public function AddBotDefinition($botid, $ipfrom, $ipto = 0, $useragent = '', $bottype = 0, $mal = 0) {
        if (is_object($this->_dbobject)) { # bot defs in SQL table
            if (is_array($botid)) {
                # unpack "whole definition" array, the same form in-memory mode accepts
                $def = array_values($botid);
                $botid = isset($def[0]) ? $def[0] : '';
                $ipfrom = isset($def[1]) ? $def[1] : 0;
                $ipto = isset($def[2]) ? $def[2] : 0;
                $useragent = isset($def[3]) ? $def[3] : '';
                $bottype = isset($def[4]) ? $def[4] : 0;
                $mal = isset($def[5]) ? $def[5] : 0;
            }
            if (empty($ipto)) {
                $ipto = $ipfrom; # single address, same rule as in-memory mode
            }
            $bottype = intval($bottype);
            $mal = intval($mal);
            $sql = "INSERT INTO {$this->table_prefix}bot_definitions (botid,ipfrom,ipto,useragent, bottype, malicious)"
                 . ' VALUES (' . $this->QuoteSql($botid) . ',' . $this->QuoteSql($ipfrom) . ','
                 . $this->QuoteSql($ipto) . ',' . $this->QuoteSql($useragent) . ", $bottype, $mal)";
            $this->RunSql($sql);
        } else { # bot defs are in-memory
            if (is_array($botid)) {
                $this->_botdata[] = $botid;
            } else {
                if (empty($ipto)) {
                    $ipto = $ipfrom;
                }
                $this->_botdata[] = array($botid, $ipfrom, $ipto, $useragent, $bottype, $mal);
            }
        }
        $this->_result = null; # definitions changed, cached recognition is not valid anymore
        $this->addedcnt++;
    }

    /**
     * Registers handler that will be called if some specific bot(s)
     * recognized
     *
     * @param mixed $callbackfnc callback; empty value removes handler for listed bots
     * @param mixed $botlist array or [|,;] delimited string with bot id list that will fire this func.
     */
    public function SetHandlerForBots($callbackfnc, $botlist) {
        if (!is_array($botlist)) {
            $botlist = preg_split('/[|,;]/', (string)$botlist);
        }
        foreach ($botlist as $botid) {
            $botid = is_string($botid) ? trim($botid) : $botid;
            if ($botid === '') {
                continue;
            }
            if (!empty($callbackfnc)) {
                $this->_callbacks[$botid] = $callbackfnc;
            } else {
                unset($this->_callbacks[$botid]);
            }
        }
    }

    /**
     * sets handler function for some type(s) of bots
     *
     * @param mixed $callbackfnc callback; empty value removes handler for listed types
     * @param mixed $bottype - integer, [|,;] delimited string or array holding bot types that will be handled
     */
    public function SetHandlerForTypes($callbackfnc, $bottype) {
        if (!is_array($bottype)) {
            $bottype = preg_split('/[|,;]/', (string)$bottype);
        }
        foreach ($bottype as $onetype) {
            $onetype = is_string($onetype) ? trim($onetype) : $onetype;
            if ($onetype === '') {
                continue;
            }
            if (!empty($callbackfnc)) {
                $this->_type_callbacks[$onetype] = $callbackfnc;
            } else {
                unset($this->_type_callbacks[$onetype]); # type handlers live in their own list
            }
        }
    }

    /**
     * Sets handler function for all "malicious" bots
     *
     * @param mixed $funcname callback (existing function name)
     */
    public function SetMaliciousHandler($funcname) {
        $this->malicious_handler = $funcname;
    }

    /**
     * Dispatch() method tries to recognize the bot and runs respective callback:
     * malicious handler first, then handler for bot id, then handler for bot type.
     * Nothing is called when current time is out of "worktime" interval or when
     * the request is not recognized as a bot.
     */
    public function Dispatch() {
        if ($this->_worktime && count($this->_worktime) >= 2 && !empty($this->_worktime[1])) {
            $from = $this->_worktime[0];
            $to = $this->_worktime[1];
            $curtm = date('H:i');
            # if current time out of working interval, don't dispatch
            if ($from < $to && ($curtm < $from || $curtm > $to)) {
                return;
            }
            # interval that passes midnight, like 22:00 - 03:00
            if ($from > $to && ($curtm < $from && $curtm > $to)) {
                return;
            }
        }
        $botid = $this->GetBotId();
        if ($botid === false) {
            return; # not a bot - handlers must not be fired for ordinary visitors
        }
        if ($this->bot_malicious && !empty($this->malicious_handler) && is_callable($this->malicious_handler)) {
            call_user_func($this->malicious_handler);
        } elseif (isset($this->_callbacks[$botid]) && is_callable($this->_callbacks[$botid])) {
            call_user_func($this->_callbacks[$botid]);
        } elseif (isset($this->_type_callbacks[$this->bottype]) && is_callable($this->_type_callbacks[$this->bottype])) {
            call_user_func($this->_type_callbacks[$this->bottype]);
        }
    }

    /**
     * @return mixed "malicious" flag of recognized bot (recognition is run if needed)
     */
    public function IsMaliciousBot() {
        if ($this->_result === null) {
            $this->GetBotId();
        }
        return $this->bot_malicious;
    }

    /**
     * @return mixed type of recognized bot (recognition is run if needed)
     */
    public function GetBotType() {
        if ($this->_result === null) {
            $this->GetBotId();
        }
        return $this->bottype;
    }

    /**
     * @return string message of last error (set by ImportBotsFromUrl)
     */
    public function GetErrorMessage() {
        return $this->errormessage;
    }

    /**
     * converts octet-notation IP addr to integer.
     * Missing octets are taken as 0. Value without dots is returned as unsigned
     * integer if it is numeric, otherwise (IPv6 address, empty string) '0' is returned.
     *
     * @param mixed $ipaddr "xxx.xxx.xxx.xxx" string or integer
     * @return string unsigned integer in decimal notation
     */
    public static function FromIpToX32($ipaddr) {
        $iparr = explode('.', trim((string)$ipaddr));
        if (count($iparr) < 2) {
            return is_numeric($iparr[0]) ? sprintf('%u', $iparr[0]) : '0';
        }
        $oct = array(0, 0, 0, 0);
        for ($k = 0; $k < 4; $k++) {
            if (isset($iparr[$k])) {
                $oct[$k] = intval($iparr[$k]);
            }
        }
        # shifts (not multiplication) - on 32-bit platform the sum may become negative,
        # "%u" turns it back into unsigned value
        $ip_x32 = ($oct[0] << 24) + ($oct[1] << 16) + ($oct[2] << 8) + $oct[3];
        return sprintf('%u', $ip_x32);
    }

    /**
     * converts integer IP addr representation back to octet notation
     *
     * @param mixed $ipx32 unsigned integer (number or numeric string)
     * @return string "xxx.xxx.xxx.xxx"
     */
    public static function FromX32ToIp($ipx32) {
        $ipVal = is_numeric($ipx32) ? $ipx32 + 0 : 0;
        # highest octet is got by division, so values above 2^31 work without integer overflow
        $high = (int)floor($ipVal / 0x1000000);
        $low = (int)($ipVal - $high * 0x1000000);
        $ipArr = array(
            $high,
            ($low & 0xFF0000) >> 16,
            ($low & 0xFF00) >> 8,
            $low & 0xFF
        );
        return implode('.', $ipArr);
    }

    /**
     * "Undresses" string, deleting starting & ending quotes or apostrophes if both exist.
     * The string is not escaped here; escaping is done when SQL statement is built.
     *
     * @param string $par
     * @return string
     */
    public static function StrUndress($par) {
        $par = (string)$par;
        $len = strlen($par);
        if ($len >= 2) {
            $first = $par[0];
            $last = $par[$len - 1];
            if ($first === $last && ($first === '"' || $first === "'")) {
                return substr($par, 1, $len - 2);
            }
        }
        return $par;
    }

    /**
     * Makes quoted and escaped SQL literal from a value.
     * quote() method of DB object is used if it has one (Zend_Db adapters do),
     * otherwise value is escaped by addslashes().
     *
     * @param mixed $value
     * @return string SQL literal including surrounding apostrophes
     */
    private function QuoteSql($value) {
        if (is_object($this->_dbobject) && $this->_dbobjclass != 'cdbengine'
            && method_exists($this->_dbobject, 'quote')) {
            return $this->_dbobject->quote((string)$value);
        }
        return "'" . addslashes((string)$value) . "'";
    }

    /**
     * Executes SQL statement by method of used DB wrapper:
     * sql_query() for CDBEngine, query() for anything else (Zend_Db).
     *
     * @param string $sql
     * @return mixed whatever the wrapper returns
     */
    private function RunSql($sql) {
        if ($this->_dbobjclass == 'cdbengine') { # use CDBEngine wrapper
            return $this->_dbobject->sql_query($sql);
        }
        return $this->_dbobject->query($sql); # use Zend_Db...
    }
} # CBotRecognizer definition end