<?php
Use \yurun\util\chinese;
use Fukuball\Jieba\Jieba;
use Fukuball\Jieba\Finalseg;
require_once("demosettings.php");
include_once 'write_log.php';
require_once 'sqlUtils.php';


//include 'sparql.php';

/**
 * Reads the IntuScan response from a socket until the envelope that carries
 * "<Status>Completed</Status>" has been received up to its closing "</IVEnvelope>".
 *
 * The socket is closed before returning, on success and on every failure path.
 *
 * @param resource|Socket $socket connected socket the request was written to
 * @param mixed           $docId  document id, used only in log messages
 * @return string|false everything read from the socket (all envelopes), or false when
 *                      the server reported "Failed", the connection was lost before the
 *                      final envelope was complete, or the overall time limit was exceeded
 */
function readEnvelopeFromSocket($socket, $docId) {
    $failedMarker = "<Status>Failed</Status>";
    $completedMarker = "<Status>Completed</Status>";
    $output = "";
    $begin_time = time();
    $completed_time = NULL;
    $MAXTIMOUT = 6000;
    $Completedpos = false;
    socket_set_option($socket, SOL_SOCKET, SO_RCVTIMEO, array("sec" => 600, "usec" => 0));

    while (true) {
        if ((time() - $begin_time) > $MAXTIMOUT) {
            write_to_log("ERROR", "analyze_file max time out reached! where id=" . $docId);
            socket_close($socket);
            return false;
        }

        $read = socket_read($socket, 1024);
        if ($read === false) {
            write_to_log("ERROR", "analyze_file - socket read failed, id is $docId, elapsed is ".(time() - $begin_time));
            $errorCode = socket_last_error($socket);
            // A reset connection will never deliver more data; the message test covers the
            // Windows wording, the error-code test covers other platforms.
            if ($errorCode === SOCKET_ECONNRESET
                || strstr(socket_strerror($errorCode), "existing connection was forcibly closed by the remote host")) {
                socket_close($socket);
                return false;
            }
            sleep(1);
            continue; //intuscan is still analyzing and therefore does not respond
        }

        $output .= $read;

        // Look for the failure marker in the new bytes plus enough of the previous tail
        // to catch a marker that was split across two reads.
        $scanFrom = max(0, strlen($output) - strlen($read) - strlen($failedMarker) + 1);
        if (strpos($output, $failedMarker, $scanFrom) !== false) {
            write_to_log("ERROR", "analyze_file FAILED where id=" . $docId);
            socket_close($socket);
            return false;
        }

        if ($Completedpos === false) {
            $Completedpos = strpos($output, $completedMarker); //last envelope containing the result
            if ($Completedpos !== false) {
                $completed_time = time();
            }
        }

        // Checked in the same iteration the marker is found: the closing tag may already
        // be in the buffer and no further data may ever arrive.
        if ($Completedpos !== false && strpos($output, "</IVEnvelope>", $Completedpos) !== false) {
            break;
        }

        // A zero-length read means the peer closed the connection; without this the loop
        // would spin without blocking until the overall time limit.
        if ($read === "") {
            write_to_log("ERROR", "analyze_file - connection closed before the Completed envelope was fully received, id is $docId");
            socket_close($socket);
            return false;
        }
    }

    if ($completed_time !== NULL) {
        write_to_log("INFO", ($completed_time - $begin_time) . " seconds took response reading.");
    }
    socket_close($socket);
    return $output;
}

/**
 * Prepends the author's meta data found on a decoded post to its text.
 *
 * Each available field becomes its own "Label: value. " paragraph in front of
 * $content; fields that are not set on $data are skipped.
 *
 * @param string $content text of the post
 * @param object $data    decoded post; user_description and user_location are read when set
 * @return string meta data paragraphs followed by $content
 */
function appendMetaData($content, $data){
    $meta_text = "";
    if(isset($data->user_description)){
        $meta_text .= "User-Description: ".$data->user_description.". \n\n";
    }
    // Guard on the property that is actually read; the previous guard tested full_text,
    // which dropped the location of posts without full_text and read an undefined
    // property on posts with full_text but no location.
    if(isset($data->user_location)){
        $meta_text .= "User-Location: ".$data->user_location.". \n\n";
    }
    return $meta_text.$content;
}

/**
 * Picks the text of a decoded post.
 *
 * The first property of $data that is set wins, in the order listed below. When none
 * is set, the text is read from the file whose path is $folderPath with every "text"
 * replaced by "extradata".
 *
 * @param object $data       decoded post
 * @param string $folderPath path used to derive the fallback file location
 * @return mixed the chosen property value, or the result of file_get_contents() on the
 *               fallback path (false when that file cannot be read)
 */
function get_text_content($data, $folderPath){
    // Highest priority first: full_text is used by twitter posts, message by facebook posts.
    $candidates = array(
        "full_text",
        "post_text",
        "text",
        "Text",
        "message",
        "originalText",
        "Description",
    );
    foreach($candidates as $property){
        if(isset($data->{$property})){
            return $data->{$property};
        }
    }

    //otherwise - go to "extradata" folder and look for .txt matching file
    $extraDataPath = str_replace("text", "extradata", $folderPath);
    return file_get_contents($extraDataPath);
}

/**
 * Removes over-long tokens from a text.
 *
 * The text is split on single spaces and every token whose byte length exceeds the
 * global $limit_words_in_text is dropped; the remaining tokens are joined back with
 * single spaces. When the global is empty the text is returned untouched.
 *
 * @param string $str text to clean
 * @return string cleaned text
 */
function removeGibrish($str){
    global $limit_words_in_text;
    if(!empty($limit_words_in_text)){
        $maxLength = $limit_words_in_text;
        $kept = array_filter(explode(' ', $str), function($token) use ($maxLength){
            return strlen($token) <= $maxLength;
        });
        return implode(' ', $kept);
    }
    return $str;
}

/**
 * Sends one document to the IntuScan server for analysis and saves the XML answer.
 *
 * Flow:
 *  1. For ".json" files the text is extracted from the decoded JSON (optionally prefixed
 *     with the author's meta data) and the JSON "lang" property overrides $defaultLanguage.
 *  2. A TCP connection to IntuScan is opened (one retry after 2 seconds).
 *  3. When no $id is given, a new row is inserted into `files` and its id becomes the doc id.
 *  4. An AnalyzeFile envelope with the base64 encoded contents is written to the socket.
 *  5. The response is read and its last envelope is written to
 *     "<xml_results_path>/<docId>_res.xml".
 *
 * @param mysqli      $mysqli          open database connection
 * @param string      $contents        raw document contents
 * @param string      $filePath        path of the document; its base name and folder are stored
 * @param string      $username        owner; falls back to $_SESSION['username'] when empty
 * @param string|null $intuscanHost    IntuScan host; read from the system settings when null
 * @param int|null    $intuscanPort    IntuScan port; read from the system settings when null
 * @param int|null    $id              existing `files` id; when null a new row is inserted
 * @param mixed       $type            forwarded to updateParentId() for newly inserted rows
 * @param string|null $defaultLanguage language IntuScan is forced to use, when not empty
 * @param int|null    $timeoutSec      IntuScan timeout override, when not empty
 * @param string|null $folderPath      folder used to locate the fallback text of json files
 * @param int|null    $dest_folder_id  V2 only: folder id stored in files.parentFolderId
 * @return array|string|null array("msg" => last envelope, "docId" => id) on success;
 *                           "down" when IntuScan cannot be reached or the response failed;
 *                           "contents error" when there is nothing to analyze;
 *                           "sql" on database failures (and, historically, when writing to
 *                           the socket fails); null when a json file cannot be decoded
 */
function analayzeFile($mysqli, $contents,$filePath,$username="", $intuscanHost = null,  $intuscanPort = null, $id = null, $type = null, $defaultLanguage = null, $timeoutSec = null,  $folderPath = null, $dest_folder_id = null)
{
    require_once 'vendor/autoload.php';
    global $xml_results_path;
    global $ignore_accents;
    global $include_meta_in_text;
    $intuscanHost = $intuscanHost ?? getSystemSettingsProp($mysqli, "intuscanHost");
    $intuscanPort = $intuscanPort ?? getSystemSettingsProp($mysqli, "intuscanPort");
    if ($username == ""){
        $username = $_SESSION['username'] ?? null;
    }
    $contentType = "newfile";

    $file_parts = pathinfo($filePath);
    // pathinfo() has no 'extension' entry for names without a dot (e.g. "699_").
    $extension = $file_parts['extension'] ?? '';

    /**
     * handle json files
     */
    if(strtolower($extension)=='json'){
        $data = json_decode(getTextNoBOM($contents));
        if(empty($data)){
            write_to_log("ERROR", " json_decode of content returned NULL - check for BOM sign");
            return null;
        }
        $contents = get_text_content($data, $folderPath."/".$file_parts["filename"]);

        if($include_meta_in_text){
            $contents = appendMetaData($contents, $data);
        }

        // is_object(): a top-level JSON array decodes to a PHP array, which property_exists() rejects.
        $defaultLanguage = (is_object($data) && property_exists($data, 'lang')) ? lang_mapper($data->lang) : null;

        write_to_log("TRACE", "Handling a json in lang $defaultLanguage");
    }

    $socket = socket_create(AF_INET, SOCK_STREAM, SOL_TCP);
    if ($socket === false) {
        // No socket exists here, so the error must be read without a socket argument.
        write_to_log("ERROR", "analyze_file line - socket_create() failed.\nReason: " . socket_strerror(socket_last_error()) . "\n");
        return "down";
    }

    $result = socket_connect($socket, $intuscanHost, $intuscanPort);
    if ($result === false) {
        write_to_log("WARNING", "analyze_file line 111 - socket_connect() failed.\nReason: ($result) " . socket_strerror(socket_last_error($socket)) . "\n");
        socket_close($socket);
        usleep(2000000);
        $socket = socket_create(AF_INET, SOCK_STREAM, SOL_TCP);
        if ($socket === false) {
            write_to_log("ERROR", "analyze_file line - socket_create() failed.\nReason: " . socket_strerror(socket_last_error()) . "\n");
            return "down";
        }
        $result = socket_connect($socket, $intuscanHost, $intuscanPort);
        if ($result === false) {
            socket_close($socket);
            write_to_log("ERROR", "Failed to connecto to intuscan $intuscanHost:$intuscanPort");
            return "down";
        }
    }

    if(strlen($contents) == 0){
        write_to_log("ERROR", "analyze_file line 35 - contents = 0");
        socket_close($socket); // the connection is already open at this point
        return "contents error";
    }

    //one time hack - ignore accents in text
    if($ignore_accents){
        $transliterator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
        $contents = $transliterator->transliterate($contents);
    }

    $contents = removeGibrish($contents);
    $base64Contents = base64_encode($contents);

    // Set only when a new row is inserted; the email branch below fills it in on demand.
    $file_name = null;

    if ($id == null){
        // The username ends up inside quoted SQL literals, so it has to be escaped.
        $usernameSql = $mysqli->real_escape_string((string)$username);

        $query = "SELECT userId FROM files WHERE username = '$usernameSql' ORDER BY id DESC LIMIT 1";
        $res = sqlQuery($mysqli, $query);
        if($res != TRUE){
            write_to_log("WARNING", "analyze_file line 45 - sql");
            sleep(1);
            $res = sqlQuery($mysqli, $query);
            if($res != TRUE){
                socket_close($socket);
                sqlClose($mysqli, 'analyzeFile - fail - sql', true);
                return "sql";
            }
        }

        // Per-user running number: one more than the user's latest file, or 1 for the first.
        $row = $res->fetch_object();
        $userId = isset($row->userId) ? $row->userId + 1 : 1;

        $t = date('Y-m-d H:i:s');
        $file_name = $mysqli->real_escape_string(basename($filePath));
        $origFolder = $mysqli->real_escape_string(dirname($filePath));

        $query = "INSERT INTO files (userId, username, filename, origFolder, type, date) values ($userId, '$usernameSql', '$file_name', '$origFolder', '$contentType', \"$t\")";

        //relevant only for NEW UI - attachment in results
        $params = "";
        if(!empty($_SESSION['attach'])){
            // NOTE(review): json_encode()'s second argument is a flags bitmask; true is coerced
            // to 1 (JSON_HEX_TAG). Kept as-is - confirm whether that was intended.
            $params = is_array($_SESSION['attach']) ? json_encode($_SESSION['attach'],true) : $_SESSION['attach'];

            $params = $mysqli->real_escape_string($params);
            $query = "INSERT INTO files (userId, username, filename, origFolder, type, date,results) values ($userId, '$usernameSql', '$file_name', '$origFolder', '$contentType', \"$t\",'$params')";
        }

        $res = sqlQuery($mysqli, $query);
        if($res != TRUE){
            write_to_log("ERROR", "analyze_file line 58 - $query , $mysqli->error");
            socket_close($socket);
            sqlClose($mysqli, "analyse file", true);
            return "sql";
        }

        $docId = $mysqli->insert_id;

        $sql = createSqlInsertDocPath($mysqli, $docId, dirname($filePath), date(DATE_ATOM), $username);
        sqlMultiQuery($mysqli, $sql);
        if($type != null && $username != ""){
            $isSucceded = updateParentId($mysqli,$username,$type,"singleFiles",$docId);
            if(!$isSucceded)
            {
                socket_close($socket); // was left open on this path
                sqlClose($mysqli, "analyse file", true);
                return "sql";
            }
        }
    }else{
        $docId = $id;
        $t = date('Y-m-d H:i:s');
    }

    //V2 Only - update the dest_folder_id
    if(!empty($dest_folder_id)){
        // Both ids are placed unquoted into the statement, so they are forced to integers.
        sqlQuery($mysqli, "UPDATE files SET parentFolderId=" . (int)$dest_folder_id . " WHERE id = " . (int)$docId);

        if (isset($_REQUEST['isEmail']) && $_REQUEST['isEmail'])
        {
            require_once 'EmailAddressParser.php';
            if ($file_name === null) {
                // An existing $id was supplied, so no file name has been computed yet.
                $file_name = $mysqli->real_escape_string(basename($filePath));
            }
            $emailData = fetchEmailData($mysqli, $docId, $contents, $file_name, $docId, $dest_folder_id);
            $t = isset($emailData["headers"]["date"]) ?  parseDate($emailData["headers"]["date"]) : $t;
        }
        else {
            $emailData = null;
        }
    }

    $intuscanFilePath = $filePath;
    //patch for enron files with names like "699_" without ".txt" suffix - add ".txt" suffix so intuscan will treat them as txt files
    //json and split files get the same suffix
    if (strpos($filePath, ".") === false || in_array($extension, array("json","split"), true)){
        $intuscanFilePath = $filePath . ".txt";
    }

    //TODO: rename defLang to forcedLang or pass an array of config overrides
    $defLangConfOverr = empty($defaultLanguage) ? "" : "<item path=\"/Configuration/Execution/engines/engine[@name='IVPre']/forcedLanguage/@name\" value=\"$defaultLanguage\"/>";

    //for report override patch - the list is currently always empty, so no override is sent
    $domainsOverRide = [];
    $domainsOverRideStr = implode(",",$domainsOverRide);

    $reportOverride = empty($domainsOverRideStr) ? "" : "<item path=\"/Configuration/Execution/engines/engine[@name='IVReport']/reportSections/@name\" value=\"$domainsOverRideStr\"/>";

    // The path attribute previously lacked its closing quote, producing malformed XML.
    // NOTE(review): the 'timeooutsec' spelling is kept as found - confirm against the IntuScan configuration.
    $timeoutConfOverr = empty($timeoutSec) ? "" : "<item path=\"/Configuration/General/server[@name='timeooutsec']\" value=\"$timeoutSec\"/>";

    $out = "<IVEnvelope><Request id=\"$docId\"><AnalyzeFile><InputParameters>" .
            "<ConfigOverrides><item path=\"/Configuration/Execution/export/rdf/@expandInstances\" value=\"yes\"/>" . //should use value "directKb" - but then name alternatives are tampered - unknown name components
            $defLangConfOverr .
            $timeoutConfOverr .
            $reportOverride .
            "</ConfigOverrides>" .
            "<FileMetaParameters><structured><slot name=\"TerminalSystemId\"><integer>" . $docId . "</integer></slot>" .
            "<slot name=\"FileSize\"><integer>" . strlen($contents) . "</integer></slot></structured></FileMetaParameters><FilePath>" . htmlspecialchars($intuscanFilePath) . "</FilePath><FileContent><Base64>" . $base64Contents . "</Base64>";
    $out .= "</FileContent></InputParameters><OutputParameters><FileText/><FileReport/><FileMetaParameters/><FileTriples/></OutputParameters></AnalyzeFile></Request></IVEnvelope>";

    // socket_write() may accept only part of the buffer, so keep writing until all of it is sent.
    $totalBytes = strlen($out);
    $sentBytes = 0;
    while ($sentBytes < $totalBytes) {
        $chunk = substr($out, $sentBytes);
        $written = socket_write($socket, $chunk, strlen($chunk));
        if ($written === false || $written === 0){
            write_to_log("ERROR", "analyze_file  - UPDATE files set files.session='" . "socket error -" .  socket_last_error() . "'  where id=" . $docId);
            socket_close($socket);
            return "sql"; // historical return value for a socket write failure
        }
        $sentBytes += $written;
    }
    write_to_log("INFO", "Document was sent to intuscan: " . $filePath . "[$intuscanHost, $intuscanPort]" );

    // readEnvelopeFromSocket() closes the socket on every path.
    $output = readEnvelopeFromSocket($socket, $docId);
    if($output === false) {
        return "down";
    }

    // Only the last envelope (the one carrying the result) is kept.
    $lastEnvPosition = strrpos($output, "<IVEnvelope>");
    $msg = substr($output, $lastEnvPosition, strlen($output));

    write_to_log("TRACE", "writing response to db: " . $filePath);

    $resultsFile = $xml_results_path .DIRECTORY_SEPARATOR ."$docId" . "_res.xml";
    //retrying - in case of network problems
    $saveResultsRetryCount = 0;
    while (true)
    {
        if (!is_dir($xml_results_path)) {
            mkdir_full($xml_results_path, 0777, true);
        }
        if (file_put_contents($resultsFile, $msg) !== false) {
            break;
        }
        if (++$saveResultsRetryCount > 10) {
            write_to_log("ERROR", "Failed saving IntuScan results file. Giving up, file: " . $resultsFile);
            break;
        }
        write_to_log("ERROR", "Failed saving IntuScan results file. Sleeping and retrying. count: " . $saveResultsRetryCount);
        sleep(2);
    }

    $arr = array();
    $arr["msg"] = $msg;
    $arr["docId"] = $docId;

    write_to_log("TRACE", "finished Analyse file: " . $filePath);
    return $arr;
}

/**
 * Sends a text to IntuScan without touching the database and echoes the answer.
 *
 * The contents are sent as document "1" named "filename.txt". The last envelope of
 * the response is echoed; nothing is returned on success.
 *
 * @param string $contents     text to analyze
 * @param string $intuscanHost IntuScan host
 * @param int    $intuscanPort IntuScan port
 * @return string|null "down" when IntuScan cannot be reached, "contents error" for empty
 *                     contents, "sql" when writing to the socket fails (historical value),
 *                     "timeout" when no complete answer arrived within 900 seconds, null
 *                     when reading fails, the server reports "Failed", or on success
 */
function analyzeFileMadeSimple($contents, $intuscanHost = "127.0.0.1", $intuscanPort = 6000) {
    $socket = socket_create(AF_INET, SOCK_STREAM, SOL_TCP);
    if ($socket === false) {
        // No socket exists here, so the error must be read without a socket argument.
        write_to_log("ERROR", "analyze_file socket_create() failed.\nReason: " . socket_strerror(socket_last_error()) . "\n");
        return "down";
    }

    $result = socket_connect($socket, $intuscanHost, $intuscanPort);
    if ($result === false) {
        write_to_log("ERROR", "Failed to connecto to intuscan $intuscanHost:$intuscanPort");
        socket_close($socket);
        return "down";
    }

    if (strlen($contents) == 0) {
        write_to_log("ERROR", "analyze_file contents = 0");
        socket_close($socket); // the connection is already open at this point
        return "contents error";
    }

    $base64Contents = base64_encode($contents);
    $docId = "1";
    $intuscanFilePath = "filename.txt";
    $out = "<IVEnvelope><Request id=\"$docId\"><AnalyzeFile><InputParameters>" .
            "<ConfigOverrides><item path=\"/Configuration/Execution/export/rdf/@expandInstances\" value=\"yes\"/>" .
            "</ConfigOverrides>" .
            "<FileMetaParameters><structured><slot name=\"TerminalSystemId\"><integer>" . $docId . "</integer></slot>" .
            "<slot name=\"FileSize\"><integer>" . strlen($contents) . "</integer></slot></structured></FileMetaParameters><FilePath>" . htmlspecialchars($intuscanFilePath) . "</FilePath><FileContent><Base64>" . $base64Contents . "</Base64>";
    $out .= "</FileContent></InputParameters><OutputParameters><FileText/><FileReport/><FileMetaParameters/><FileTriples/></OutputParameters></AnalyzeFile></Request></IVEnvelope>";

    // socket_write() may accept only part of the buffer, so keep writing until all of it is sent.
    $totalBytes = strlen($out);
    $sentBytes = 0;
    while ($sentBytes < $totalBytes) {
        $chunk = substr($out, $sentBytes);
        $written = socket_write($socket, $chunk, strlen($chunk));
        if ($written === false || $written === 0) {
            write_to_log("ERROR", "analyze_file  - UPDATE files set files.session='" . "socket error -" . socket_last_error() . "'  where id=" . $docId);
            socket_close($socket);
            return "sql";
        }
        $sentBytes += $written;
    }
    write_to_log("TRACE", "document was sent to intuscan $intuscanHost:$intuscanPort");

    socket_set_option($socket, SOL_SOCKET, SO_RCVTIMEO, array("sec" => 600, "usec" => 0));
    $output = "";
    $begin_time = time();
    while (true) {
        $read = socket_read($socket, 1024);
        if ($read === false) {
            write_to_log("ERROR", "analyze_file - read = false ");
            socket_close($socket);
            return;
        }
        if ($read === "") {
            // Zero-length read: the peer closed the connection. The buffer was already checked
            // for a complete answer on the previous pass, so the answer is incomplete.
            write_to_log("ERROR", "analyze_file - connection closed before the response was complete");
            socket_close($socket);
            return;
        }
        $output = $output . $read;

        if (strpos($output, "<Status>Failed</Status>") !== false) {
            write_to_log("ERROR", "analyze_file line 98 - UPDATE files set files.session='" . $output . "'  where id=" . $docId);
            socket_close($socket);
            return;
        }

        // Done once the envelope that carries the Completed status has been closed.
        $pos = strpos($output, "<Status>Completed</Status>");
        if ($pos !== false && strpos($output, "</IVEnvelope>", $pos) !== false) {
            break;
        }

        if ((time() - $begin_time) > 900) {
            write_to_log("ERROR", "analyze_file line 112 - UPDATE files set files.session='time_out " . $output . "'  where id=" . $docId);
            socket_close($socket);
            return "timeout";
        }
    }

    socket_close($socket);
    // Only the last envelope (the one carrying the result) is echoed.
    $lastEnvPosition = strrpos($output, "<IVEnvelope>");
    $msg = substr($output, $lastEnvPosition, strlen($output));
    echo $msg;
}


/**
 * Transliterates a text to Latin script, one word segment at a time.
 *
 * The text is split with a "zh-Hant-TW" word break iterator, every segment is passed
 * through the "Any-Latn" transliterator and the results are concatenated in their
 * original order. No separator is added between segments.
 *
 * @param string $data text to transliterate
 * @return string transliterated text; the input is returned unchanged when the break
 *                iterator or the transliterator cannot be created
 */
function chienseToPinyin($data){
    $data = (string)$data;
    $wi = IntlBreakIterator::createWordInstance("zh-Hant-TW");
    $tr = Transliterator::create("Any-Latn");
    if ($wi === null || $tr === null) {
        return $data;
    }
    $wi->setText($data);

    // Accumulate every segment; previously each segment overwrote the result, so only
    // the last one was returned (and nothing at all for an empty text).
    $ntext = "";
    $start = $wi->first();
    for ($end = $wi->next(); $end !== IntlBreakIterator::DONE; $start = $end, $end = $wi->next())
    {
        // Offsets are used directly with the byte-based substr(), as before.
        $word = substr($data, $start, $end - $start);
        $converted = $tr->transliterate($word);
        // Keep the original segment when it cannot be transliterated.
        $ntext .= ($converted === false) ? $word : $converted;
    }
    return $ntext;
}
/**
 * Translates Chinese text to pinyin.
 *
 * The text is tokenized with Jieba and every token is converted with the yurunsoft
 * Chinese utility. The first "pinyinSound" reading of a token is used: each of its
 * items is appended, followed by a space when the item is longer than one byte.
 * Tokens for which no pinyin is returned are copied to the output unchanged.
 *
 * Raises the PHP memory limit to 1024M as a side effect.
 *
 * @param string $data text to convert
 * @return string converted text
 */
function chienseToPinyin2($data){
    ini_set('memory_limit', '1024M');
    require_once 'vendor/autoload.php';

    require_once "vendor/fukuball/jieba-php/src/vendor/multi-array/MultiArray.php";
    require_once "vendor/fukuball/jieba-php/src/vendor/multi-array/Factory/MultiArrayFactory.php";
    require_once "vendor/fukuball/jieba-php/src/class/Jieba.php";
    require_once "vendor/fukuball/jieba-php/src/class/Finalseg.php";

    Jieba::init();
    Finalseg::init();

    require_once 'vendor'.DIRECTORY_SEPARATOR.'yurunsoft'.DIRECTORY_SEPARATOR.'chinese-util'.DIRECTORY_SEPARATOR.'src'.DIRECTORY_SEPARATOR.'Chinese.php';
    $ChineseClass = new Chinese();
    $output_str = "";
    $data_tokens = Jieba::tokenize($data);

    foreach($data_tokens as $token_arr){
        $output = $ChineseClass->toPinyin($token_arr['word']);
        if(!empty($output['pinyinSound'])){
            foreach($output['pinyinSound'][0] as $item){
                if(strlen($item) > 1){
                    $output_str .= $item. " ";
                }
                else{
                    $output_str .= $item;
                }
            }
        }
        else{
            // No pinyin for this token: keep the token itself. The previous code appended
            // an undefined variable here, which silently dropped the token.
            $output_str .= $token_arr['word'];
        }
    }
    return $output_str;
}

