<?php

require_once 'demosettings.php';
// Command-line entry point.
// The block below runs only when the constant "useParseTextAndMetadata" has NOT been defined by the
// including script; otherwise this file only contributes the function definitions that follow.
// $sharedFolder, $uploadedFilesPath, $sensitiveCapabilitiesPath and IS_CLI are not defined in this file
// (presumably they come from demosettings.php - TODO confirm).
if (!defined("useParseTextAndMetadata")) {//if useParseTextAndMetadata is not defined (typically in the crawler) then assume we're running from command line
    // The usage text is printed unconditionally, before any argument is inspected.
    echo "usage: php.exe parseTextAndMetadata.php json|csv SRC_DIR DONE_DIR [TEXT_DIR METADATA_DIR]\r\n";
    echo "usage 2: php.exe parseTextAndMetadata.php useDefaults [subFolder]\r\n";
    echo "usage 3: php.exe parseTextAndMetadata.php fromDB DB_NAME (should be defined in \$connectionOverrides)";
    echo "\n";
    // Not running from the command line: print the flag and stop executing this file.
    if (!IS_CLI) {
        echo "IS_CLI:" . IS_CLI;
        return;
    }

    // Mode "fromDB": export the DB rows as json files (see db2src) and terminate.
    // NOTE(review): the usage text above shows DB_NAME as the argument right after "fromDB", but the code
    // reads $argv[2] as the sub folder and $argv[3] as the DB name - confirm which one is intended.
    // NOTE(review): $argv[1] (and $argv[3]) are read without checking that they were supplied.
    if ($argv[1] == "fromDB") {
        $srcDir = $sharedFolder . "/src";
        $subFolder = empty($argv[2]) ? "" : "/" . $argv[2] . "/";
        db2src($argv[3]);
        die("Done");
    }

    // Mode "useDefaults": all folders are derived from the settings variables; the optional sub folder
    // is appended (wrapped in slashes) to the upload path.
    if ($argv[1] == "useDefaults") {
        $srcDir = $sharedFolder . "/src";
        $doneDir = $sharedFolder . "/Done";
        $subFolder = count($argv) < 3 || empty($argv[2]) ? "" : "/" . $argv[2] . "/";
        $textDir = $uploadedFilesPath . $subFolder;
        $metadataDir = $textDir;//$extraFieldsPath . $subFolder;
    } else {
        // Explicit mode: the first argument must be "json" or "csv"; anything else silently ends the script.
        // $inputFileExtension is only validated here and is not read again in this block.
        $inputFileExtension = $argv[1];
        if ($inputFileExtension != "json" && $inputFileExtension != "csv")
            return;

        $srcDir = $argv[2];
        $doneDir = $argv[3];
        $textDir = $argv[4];
        // The metadata is written next to the text; a METADATA_DIR argument ($argv[5]) is currently ignored.
        $metadataDir = $textDir;//$argv[5];
    }

    // Set your timezone.
    date_default_timezone_set('Asia/Amman');

    // Fall back to folders next to this script when no directory was supplied
    // (the fallbacks use a Windows-style backslash separator).
    define("SRC_DIR", $srcDir ? $srcDir : __DIR__ . '\ksa.run');
    define("DONE_DIR", $doneDir ? $doneDir : __DIR__ . '\done');
    // Non-recursive mkdir: only the last path segment is created here.
    if (!is_dir(DONE_DIR)) {
        mkdir(DONE_DIR);
    }

    // Seed the work list with the direct children of the source folder.
    $ls = array();
    foreach (scandir(SRC_DIR) as $f)
        if ($f != "." && $f != "..")
            $ls[] = SRC_DIR . "/$f";

    // Breadth-first walk: $ls grows while it is being iterated (the children of every directory are
    // appended), which is why count($ls) is re-evaluated on each iteration.
    for ($i = 0; $i < count($ls); $i++) {
        $file = $ls[$i];
        if (is_dir($file)) {
            foreach (scandir($file) as $f) {
                if ($f != "." && $f != "..")
                    $ls[] = $file . "/$f";
            }
            continue;
        }
        echo "parse: $file . \n";
        // Any path containing ".htm" triggers loading of the facebook tools (loaded once, on first need).
        if (strpos($file, ".htm") !== FALSE) {
            //for downloading facebook - which is not part of this repository, because these capabilities are sensitive
            require_once $sensitiveCapabilitiesPath . //'\\\\ivlab7\\dev\\apps\\BuildRT\\tools\\facebookTools.php';
                    'tools/crawler/facebookTools.php';
        }
        echo "parsing $file";
        parsefile($file);
        // Mark the file as handled by moving it to the mirrored location under the done folder.
        // Note: the raw $srcDir/$doneDir are passed here, not the SRC_DIR/DONE_DIR constants with their fallbacks.
        moveFileToDone($file, $srcDir, $doneDir);
    }
    echo "\n";
}

//Moves a processed file from the source tree to the same relative location in the "done" tree.
//input:
//  $file - path of the file to move (normally starts with $src)
//  $src  - root of the source tree
//  $done - root of the done tree
//The target directory is created when it is missing. Terminates the script when the move fails.
function moveFileToDone($file, $src, $done) {
    //running number of the move, shown in the progress line
    //(the message used to interpolate an undefined local variable $i)
    static $moveCount = 0;
    $moveCount++;

    //swap only the leading $src prefix; a plain str_replace would also rewrite later occurrences of
    //$src inside the path (e.g. "src/src.json" would become "Done/Done.json")
    $srcLength = strlen($src);
    if ($srcLength > 0 && strncmp($file, $src, $srcLength) === 0) {
        $newFile = $done . substr($file, $srcLength);
    } else {
        //the file is not located under $src - keep the historical replace-anywhere behavior
        $newFile = str_replace($src, $done, $file);
    }

    $targetDir = dirname($newFile);
    if (!is_dir($targetDir)) {
        mkdir_full($targetDir, 0777, true);
    }

    echo "\r$moveCount: moving $file to $newFile";
    $moved = rename($file, $newFile); //mark file as done
    if (!$moved)
        die("move failed");
}

/* end-of-script */

//Reads the next record from an open csv/json file and returns it as an associative array.
//input:
//  $handle                - open file handle
//  $fileHandleSupplements - ["ext" => "csv"|"json", "queue" => records that were already read but not
//                           returned yet]; passed by reference because the queue is updated here
//output:
//  array - the next record
//  FALSE - nothing left to read
//  NULL  - the line/file could not be decoded (the caller reports it as a corrupted line)
//Supported layouts: csv with a header row, one json object per line, or a single json document spread
//across the whole file (starts with "{" alone on the first line, or with "[").
function getLineAsArray($handle, &$fileHandleSupplements) {
    //serve queued records first, oldest first, so records are returned in file order
    if (!empty($fileHandleSupplements["queue"])) {
        return array_shift($fileHandleSupplements["queue"]);
    }

    if ($fileHandleSupplements["ext"] == "csv") {
        global $csvHeaders;
        $isFirst = ftell($handle) == 0;
        //the explicit arguments are fgetcsv's defaults; spelling them out avoids the deprecation notice
        //newer PHP versions emit when the escape argument is omitted
        $cells = fgetcsv($handle, 0, ",", "\"", "\\");
        if ($isFirst && is_array($cells)) {
            //first row holds the column names - remember them and move on to the first data row
            $csvHeaders = $cells;
            $cells = fgetcsv($handle, 0, ",", "\"", "\\");
        }
        //end of file (or no header row at all): report it instead of building a record out of FALSE
        if (!is_array($cells) || !is_array($csvHeaders)) {
            return FALSE;
        }

        $record = [];
        for ($i = 0; $i < count($csvHeaders); $i++) {
            //a row may be shorter than the header row - missing cells become NULL
            $record[str_replace(UTF8_BOM, "", (string) $csvHeaders[$i])] = $cells[$i] ?? NULL;
        }

        return $record;
    }

    //json
    if (FALSE === ($line = fgets($handle))) {
        return FALSE;
    }

    //json object in each line
    $trimmed = trim($line);
    if ($trimmed != "{" && substr($trimmed, 0, 1) != "[") {
        return json_decode($line, true); // decode the JSON into an associative array
    }

    //json document spread across multiple lines (indented, starts with "{" or "[") - read the rest of the file
    $allLines = $line;
    while (($line = fgets($handle)) !== FALSE)
        $allLines .= $line;

    //json_decode fails on strings which contain tabs and new lines (e.g. facebook jsons), because it's not JSON standard
    $allLines = preg_replace("/[\t\r\n]+/", " ", $allLines);
    $json = json_decode($allLines, true);
    if (empty($json)) {
        echo "json is empty\r\n";
    }
    if (!is_array($json)) {
        return NULL;
    }

    //json_decode(..., true) returns an array for objects as well, so is_array() cannot tell a list of
    //records from a single record; only a sequential 0..n-1 array is a list of records
    $isList = $json === [] || array_keys($json) === range(0, count($json) - 1);
    $records = $isList ? $json : [$json];

    foreach ($records as $entry) {
        if (!is_array($entry)) {
            echo "skipping non-object json element\r\n";
            continue;
        }
        $fileHandleSupplements["queue"][] = $entry;
        //add facebook post comments to queue
        if (isset($entry["comments"]["data"]) && is_array($entry["comments"]["data"])) {
            foreach ($entry["comments"]["data"] as $comment) {
                if (!is_array($comment))
                    continue;
                $comment["parent_id"] = $entry["id"] ?? NULL;
                $fileHandleSupplements["queue"][] = $comment;
                echo "\r\nadding comment " . ($comment["id"] ?? "");
            }
        }
    }

    //NULL when nothing was queued
    return array_shift($fileHandleSupplements["queue"]);
}

//Converts a date string, optionally wrapped as "Date(...)" (case insensitive), to a unix timestamp.
//example input: "Date(2014-11-17T20:44:39Z)"
//The DateTime constructor throws when the string cannot be parsed.
function dateTimeAdjust($folder) {
    //strip the Date( ... ) wrapper and keep only what is inside the parentheses
    $unwrapped = preg_replace('/DATE\((.*)\)/i', '\\1', $folder);
    return (new DateTime($unwrapped))->getTimestamp();
}

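//NOTE: the next two lines describe parseMetaDataLine(), which is defined right after getTimeFields():
//input: array of information from one record (e.g. one tweet)
//the record is saved as a metadata file (writing the matching text file is currently disabled)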
//Returns the record keys that may hold the creation time, in the order in which they are tried.
function getTimeFields() {
    $timeKeys = [
        "created_at",
        "time",
        "postedTime",
        "Publish Date",
        "created_time",
        "PostTime",
        "created",
        "contentTime",
    ];

    return $timeKeys;
}

//Decides in which folder a record belongs and saves its metadata file there.
//input: $line - one record as an associative array
//The folder is taken from "FullFolderPath" when the record has it; otherwise it is derived from the
//first time field found (see getTimeFields) - either at the top level of the record or under "data" -
//and formatted as "Y/F/j/H" (e.g. 2016/May/1/10).
//Records without a usable time are reported and skipped. Returns nothing.
function parseMetaDataLine($line) {
    $date = "";
    $folder = "";
    if (array_key_exists("FullFolderPath", $line))
        $folder = $line["FullFolderPath"];
    else {
        //"data" exists only in some formats - never hand a missing/non-array value to array_key_exists
        $nested = (isset($line["data"]) && is_array($line["data"])) ? $line["data"] : [];
        $timeFields = getTimeFields();
        //note that facebook posts also have "updated_time", but using it as post's date may cause comments to appear before the post, which is awkward
        foreach ($timeFields as $tf) {
            if (array_key_exists($tf, $line) || array_key_exists($tf, $nested)) {
                $folder = $line[$tf] ?? ($nested[$tf] ?? NULL);
                break;
            }
        }
        if (empty($folder)) {
            echo "couldn't find time in line: " . substr(json_encode($line), 0, 100);
            return;
        }

        if (preg_match('/DATE\((.*)\)/i', $folder)) {
            try {
                $time = dateTimeAdjust($folder);
            } catch (Exception $e) {
                //unparsable date - reported as "bad time" below
                $time = FALSE;
            }
        } else {
            //we got from TRACX once a 'd/m/y' when m<=12, and 'm/d/y' when m>12, so check for that - but we prefer yyyy-mm-dd
            $dateTime = DateTime::createFromFormat('d/m/Y H:i', $folder);
            //getLastErrors() returns FALSE instead of an array when there is nothing to report (PHP 8.2+)
            $lastErrors = DateTime::getLastErrors();
            $hasWarnings = is_array($lastErrors) && !empty($lastErrors["warning_count"]);
            if ($dateTime && !$hasWarnings) //if month is greater that 12 - there will be a warning
                $time = $dateTime->getTimestamp();
            else {
                $time = strtotime(str_replace(": ", ":", $folder)); //facebook created_time might have redundant space it time: "...T10: 05: 31"
            }
        }

        if ($time === FALSE || $time <= 0) {
            echo "bad time: $folder\n";
            return;
        }

        $date = date("F j, Y", $time); //2016-05-01 -> May 1, 2016
        $folder = date("Y/F/j/H", $time);
    }

    if (empty($folder))
        die("empty folder: " . print_r($line, true));
    //writing the text file is currently disabled ($date is only needed for it):
    //saveJsonToText($line, $folder, $date);
    saveJsonToMeta($line, $folder);
}

//input: a csv file, or a file containing json records (one per line, or a single multi-line document)
//loops through the records and saves each of them (see parseMetaDataLine)
//".htm" files are handed to extractDataFromProfileHtml() when they look like a facebook page
//(that function is not part of this file); any other extension terminates the script.
//The open handle is kept in the global $handle, which getId() uses as a last resort for inventing ids.
function parsefile($filename) {
    global $handle;
    $ext = pathinfo($filename, PATHINFO_EXTENSION);
    if ($ext != "csv" && $ext != "json") {
        if ($ext == "htm") {
            global $subFolder;
            global $srcDir;
            $htmlContent = file_get_contents($filename);
            $htmlContent = mb_convert_encoding($htmlContent, "UTF-8");
            $relativePath = str_replace(realpath($srcDir), $subFolder, realpath(dirname($filename)));
            //strpos() returns 0 (which is falsy) when the marker opens the document, so compare with FALSE
            if (strpos($htmlContent, '<html lang="en" id="facebook"') !== FALSE)
                return extractDataFromProfileHtml($relativePath, NULL, $htmlContent);
        }
        die("unsupported extension (csv|json): $ext");
    }

    $handle = fopen($filename, "r");
    if ($handle === FALSE)
        die("failed to open $filename");

    $fileHandleSupplements = ["ext" => $ext, "queue" => []];
    while (!feof($handle) || !empty($fileHandleSupplements["queue"])) {
        //printf("\r%d", ftell($handle));
        $json = getLineAsArray($handle, $fileHandleSupplements);
        if ($json === FALSE) {
            //FALSE at the end of the file is the normal way out (e.g. after a trailing new line);
            //it is a read error only when it happens before the end
            if (!feof($handle))
                echo "\nError reading file: $filename\n";
            break;
        } else if (empty($json) || !is_array($json)) {
            echo "\ncorrupted line at: " . ftell($handle) . "\n";
            continue;
        }
        if (array_key_exists('fields', $json))
            parseMetaDataLine($json["fields"]); //the line format in 3i:mind
        else
            parseMetaDataLine($json);
    }
    //echo "\n";
    fclose($handle);
}

//Returns the id of a record; each source (3imind/twitter/gnip/tracx/webhose/SV/relativity) keeps it
//under a different key. When no known key exists, the current position of the global file handle is
//used as an invented id (for csv). Only the part after the last ":" is returned.
function getId($record) {
    global $handle; //use handle as a last resort when there is no other id

    //known id keys by precedence - the first key that exists in the record decides
    $knownKeys = ["t_id", "data", "id_str", "id", "uuid", "sourceUID", "ID", "TweetId", "ArtifactID"];
    $foundKey = NULL;
    foreach ($knownKeys as $candidate) {
        if (array_key_exists($candidate, $record)) {
            $foundKey = $candidate;
            break;
        }
    }

    if ($foundKey === NULL) {
        //last option! invent id if there is none (for csv)
        $id = ftell($handle);
    } elseif ($foundKey === "data") {
        $id = $record['data']['id'];
    } elseif ($foundKey === "ArtifactID") {
        //for relativity
        $id = "email" . $record["ArtifactID"] . "/email" . $record["ArtifactID"];
    } else {
        $id = $record[$foundKey];
        if ($foundKey === "ID" && strpos($id, "E+")) {
            //overcome corrupted id by taking from tracx link
            $marker = "&interactionID=";
            $hasLink = array_key_exists("tracx Link", $record);
            if ($hasLink && strpos($record["tracx Link"], $marker) > 0) {
                $markerEnd = strpos($record["tracx Link"], $marker) + strlen($marker);
                $id = substr($record["tracx Link"], $markerEnd);
            } else {
                echo "ERROR - bad id: $id";
            }
            if (!is_numeric($id)) {
                echo "ERROR - non numeric id: $id";
            }
        }
    }

    if (empty($id)) {
        echo " ERROR in JsonToText: corrupted line - no id!\n";
        print_r($record);
    }

    //in gnip the id is after ":"
    $segments = explode(":", $id);

    return $segments[count($segments) - 1];
}

//creates the text file per record: "$textDir/$folder/<id>.txt", written with a UTF-8 BOM
//input:
//  $record - one record as an associative array
//  $folder - folder of the record, relative to the global $textDir
//  $date   - printable date; used only when the global $includeMetaInParseText is on
//the target folder is created when missing; terminates the script when the file cannot be written
function saveJsonToText($record, $folder, $date) {
    global $includeMetaInParseText;

    $text = "";
    if ($includeMetaInParseText) {
        if (!empty($date)) {
            $text = "Date: $date\r\n";
        }
        if (array_key_exists("Headline", $record)) {
            $text .= "Subject: " . $record["Headline"] . "\r\n";
        }
        //need a separate line between date/subject (like email fields), and body
        if (!empty($text)) {
            $text .= "\r\n\r\n";
        }

        //twitter
        if (array_key_exists("user", $record)) {
            $user = $record["user"];
            $userLabels = [
                "name" => "User: ",
                "screen_name" => "Screen name: ",
                "description" => "User description: ",
            ];
            foreach ($userLabels as $userKey => $label) {
                if (array_key_exists($userKey, $user)) {
                    $text .= $label . $user[$userKey] . "\r\n\r\n";
                }
            }
            //make location first - even before date, so SNER won't get mixed up because tweet body contained phrases like "people" which are a hint to a person
            if (array_key_exists("location", $user)) {
                $text = "Location: " . $user["location"] . "\r\n\r\n" . $text;
            }

            $text .= "\r\n";
        }
    }

    //each source has a different body
    foreach (["title", "description", "content"] as $paragraphKey) {
        if (array_key_exists($paragraphKey, $record)) {
            $text .= $record[$paragraphKey] . "\r\n\r\n";
        }
    }
    foreach (["Status", "body", "text", "Content"] as $bodyKey) {
        if (array_key_exists($bodyKey, $record)) {
            $text .= " " . $record[$bodyKey] . " ";
        }
    }

    //facebook json
    if (array_key_exists("message", $record)) {
        $text .= $record["message"];
    }
    //relativity - the extracted text replaces everything collected so far
    if (array_key_exists("ExtractedText", $record)) {
        $text = $record["ExtractedText"];
    }
    //csv
    if (array_key_exists("tweets", $record)) {
        $text .= $record["tweets"];
    }

    $id = getId($record); //each source (3imind/twitter/gnip/tracx) has different id
    if (empty($id)) {
        echo "empty id";
        return;
    }

    global $textDir;
    $file = "$textDir/$folder/$id.txt";
    $targetDir = dirname($file);
    if (!is_dir($targetDir)) {
        mkdir_full($targetDir, 0777, true);
    }

    if (file_put_contents($file, "\xEF\xBB\xBF" . $text) === FALSE) {
        die("failed to save $file");
    }
    printf("\rsaved file %-50s ", $file);
}

//creates the metadata file per record: "$metadataDir/$folder/<id>.txt.json"
//input:
//  $record - one record as an associative array
//  $folder - folder of the record, relative to the global $metadataDir
//Nested information (user, place, coordinates, quoted/retweeted tweet) is flattened into prefixed
//top-level keys, "full_text" is resolved, and then only the non-empty scalar values are written as one
//json object (note: empty() also drops 0, "0" and false). Hashtags are the only values kept as an array.
//Records without an id are skipped. Terminates the script when the file cannot be encoded or written.
function saveJsonToMeta($record, $folder) {
    $id = getId($record); //each source (3imind/twitter/gnip/tracx) has different id

    if (empty($id)) {
        return;
    }

    global $metadataDir;

    $file = "$metadataDir/$folder/$id.txt.json";
    $folder = dirname($file);
    if (!is_dir($folder)) {
        mkdir_full($folder, 0777, true);
    }

    //flatten nested info - locate the author of the record
    $userRecord = NULL;
    if (array_key_exists('actor', $record) && isset($record["actor"]["objectType"]) && $record["actor"]["objectType"] == "person")
        $userRecord = $record['actor'];
    else if (array_key_exists('user', $record))
        $userRecord = $record['user']; //gnip vs. twitter
    else if (array_key_exists('from', $record))
        $userRecord = $record['from']; //facebook
    else if (isset($record['includes']['users'][0]))
        $userRecord = $record['includes']['users'][0]; //twitter v2

    if (is_array($userRecord)) {
        foreach ($userRecord as $key => $value) {
            if (is_array($value)) {
                if (array_key_exists("displayName", $value))
                    $record["user_$key"] = $value["displayName"];
                continue;
            }
            //different sources name the screen name differently
            if ($key == "username" || $key == 'user_name') {
                $record["user_screen_name"] = $value;
                continue;
            }
            $record["user_$key"] = $value;
        }
    }

    if (array_key_exists('place', $record) && is_array($record['place'])) {
        foreach ($record['place'] as $key => $value) {
            if (is_array($value))
                continue;
            $record["place_$key"] = $value;
        }
    }
    if (isset($record['coordinates']['coordinates']) && is_array($record['coordinates']['coordinates'])) {
        foreach ($record['coordinates']['coordinates'] as $key => $value) {
            if (is_array($value))
                continue;
            $record["coordinates_$key"] = $value;
        }
    }

    //v2 assign text
    if (array_key_exists("data", $record) && is_array($record["data"])) {
        $record["text"] = $record["data"]["text"] ?? NULL;
        $record["created_at"] = $record["data"]["created_at"] ?? NULL;
    }

    //first - fetch the full text of the tweet - if defined - if not - get the text as "full text"
    //every step falls through to the next one when its value is missing; a "full_text" that the record
    //already carries is the last resort, so it is never replaced by nothing
    $record["full_text"] = $record['extended_tweet']['full_text']
            ?? $record['retweeted_status']['extended_tweet']['full_text']
            ?? $record['data']['text']
            ?? $record['text']
            ?? $record['full_text']
            ?? NULL;

    //v2
    if (isset($record["includes"]["tweets"]) && is_array($record["includes"]["tweets"]) && count($record["includes"]["tweets"]) > 1) {
        //the first one is the original, the second one is the tweet it responds to
        $res_tweet_text = $record["includes"]["tweets"][1]["text"] ?? "";
        $record["full_text"] .= "\n\n@@@ v2 response to Tweet:@@@ ";
        $record["full_text"] .= $res_tweet_text;
    }

    foreach (["quoted_status", "retweeted_status"] as $rqTweet) {
        if (!array_key_exists($rqTweet, $record) || !is_array($record[$rqTweet]))
            continue;
        $prefix = ($rqTweet == "retweeted_status") ? "retweeted" : "quoted";
        $status = $record[$rqTweet];
        foreach (["id_str", "quote_count", "reply_count", "retweet_count", "favorite_count"] as $field) {
            //copy every field the nested tweet actually has (the guard used to test the target key,
            //so a field was copied only when the record already had it)
            if (!isset($status[$field]))
                continue;
            $curr_key = str_replace("_str", "", $prefix . "_" . $field); //retweeted_id_str -> retweeted_id
            $record[$curr_key] = $status[$field];
        }
        if (isset($status["user"]["screen_name"]))
            $record[$prefix . "_user_screen_name"] = $status["user"]["screen_name"];
        if (isset($status["extended_tweet"]["full_text"]))
            $record["full_text_" . $prefix] = $status["extended_tweet"]["full_text"];
    }

    //keep only non-empty scalar values
    $newJson = [];
    foreach ($record as $key => $value) {
        if (is_array($value) || empty($value))
            continue;
        $newJson[$key] = $value;
    }
    //currently only hashtags are allowed as arrays
    if (isset($record["entities"]["hashtags"]) && is_array($record["entities"]["hashtags"])) {
        foreach ($record["entities"]["hashtags"] as $hashtag) {
            if (isset($hashtag["text"]))
                $newJson["hashtag"][] = $hashtag["text"];
        }
    }

    //json_encode returns FALSE on failure (e.g. invalid UTF-8); writing that would silently create an empty file
    $encoded = json_encode($newJson);
    if ($encoded === FALSE)
        die("Failed to encode $file: " . json_last_error_msg());

    $result = file_put_contents($file, $encoded);
    if ($result === FALSE)
        die("Failed to save $file");
}

//Returns TRUE when $str starts with $sub (byte-wise, case sensitive, strict comparison).
function beginsWith($str, $sub) {
    $prefixLength = strlen($sub);
    $head = substr($str, 0, $prefixLength);

    return $head === $sub;
}

//Returns TRUE when $str ends with $sub (byte-wise, case sensitive, strict comparison).
function endsWith($str, $sub) {
    $tailStart = strlen($str) - strlen($sub);
    $tail = substr($str, $tailStart);

    return $tail === $sub;
}

//extract jsons from DB (typically a Relativity DB)
//input: $dbName - key of the connection settings ("host", "user", "pwd", "db") in the global $connectionOverrides
//Every row of EDDSDBO.Document (ArtifactID, ExtractedText) is written as "<ArtifactID>.json" into the
//folder made of the globals $srcDir and $subFolder; the folder is created when missing.
//Requires the sqlsrv extension. Terminates the script on any failure.
function db2src($dbName) {
    if (!extension_loaded("sqlsrv"))
        die("extension is not loaded: sqlsrv.\r\n" .
                "Download from https://docs.microsoft.com/en-us/sql/connect/php/microsoft-php-driver-for-sql-server\r\n" .
                "Extract php_sqlsrv_7_ts_x86.dll and save in php extension folder\r\n" .
                "and set in php.ini: extension=php_sqlsrv_7_ts_x86.dll\r\n" .
                "In addition, download and install SQL Server ODBC driver: https://www.microsoft.com/en-us/download/details.aspx?id=50420");
    //print_r(get_loaded_extensions());
    global $connectionOverrides;
    global $srcDir;
    global $subFolder;
    if (!is_array($connectionOverrides) || !array_key_exists($dbName, $connectionOverrides))
        die("'$dbName' not found in \$connectionOverrides");

    $settings = $connectionOverrides[$dbName];
    $server = $settings["host"];
    $username = $settings["user"];
    $pwd = $settings["pwd"];
    $database = $settings["db"];

    $connection = sqlsrv_connect($server, ["PWD" => $pwd, "UID" => $username, "Database" => $database]);
    print_r($connection);
    if ($connection === FALSE) {
        //nothing below can work without a connection
        print_r(sqlsrv_errors());
        die("failed to connect to '$dbName'");
    }

    $params = array();
    $options = array("Scrollable" => SQLSRV_CURSOR_KEYSET);

    $stmt = sqlsrv_query($connection, "SELECT ArtifactID, ExtractedText FROM EDDSDBO.Document", $params, $options);
    print_r($stmt);
    if ($stmt === FALSE) {
        print_r(sqlsrv_errors());
        sqlsrv_close($connection);
        die("query failed on '$dbName'");
    }
    $numRows = sqlsrv_num_rows($stmt);
    echo "num rows:" . $numRows;
    if ($numRows === FALSE)
        print_r(sqlsrv_errors());

    //$subFolder is either empty or wrapped in slashes, so join the file name with an explicit separator;
    //with an empty $subFolder the files used to be written next to the source folder ("...src123.json")
    $targetDir = rtrim($srcDir . $subFolder, "/\\");
    if (!is_dir($targetDir) && !mkdir($targetDir, 0777, true))
        die("failed to create $targetDir");

    while ($row = sqlsrv_fetch_object($stmt)) {
        $jsonStr = json_encode($row);
        if (!$jsonStr)
            die(json_last_error_msg());

        $target = $targetDir . "/" . $row->ArtifactID . ".json";
        if (file_put_contents($target, $jsonStr) === FALSE)
            die("failed to save $target");
    }
    //NULL means there are no more rows, FALSE means the fetch itself failed
    if ($row === FALSE)
        print_r(sqlsrv_errors());

    sqlsrv_free_stmt($stmt);
    sqlsrv_close($connection);
}

?>