﻿<?php
$logFNsuffix = "-cli";
require __DIR__ . '/vendor/autoload.php';
require_once __DIR__ . '/../../parseUtils.php'; //also includes demosettings
require __DIR__ . '/twitterCredentials.php';
require __DIR__ . '/cli_filter_application_consts.php'; //can be overrided from demosettings
require __DIR__ . '/../../parseTextAndMetadata.php'; // needs useParseTextAndMetadata == true
require __DIR__ . '/cli_filter_application_inits.php';
use Abraham\TwitterOAuth\TwitterOAuth;

use mpyw\Co\CURLException;
use mpyw\Cowitter\Client;
use mpyw\Cowitter\HttpException;

// Set the default timezone used by every date()/DateTime call in this script.
date_default_timezone_set('Asia/Jerusalem');

// Remove the execution time limit so long-running crawls are not interrupted.
set_time_limit(0);
// Discard and close every active output buffer so later output is not buffered.
while (ob_get_level()) {
    ob_end_clean();
}

/**
 * Stream handler that forwards only Arabic items to stream_handler_function_all().
 *
 * An item is forwarded when the status itself or its author is tagged "ar";
 * everything else is silently dropped.
 *
 * @param object $status Decoded stream element exposing ->lang and ->user->lang.
 */
function stream_handler_function_ar($status) {
    if ($status->lang == "ar" || $status->user->lang == "ar") {
        stream_handler_function_all($status);
    }
}

/**
 * Save one status as "<id_str>.json" under $cli_JSON_BASE_FOLDER/$subfolder.
 *
 * Does nothing when the global $cli_KEEP_JSON is falsy. Terminates the script
 * (die) when the folder cannot be created or the file cannot be written.
 *
 * $subfolder must not rely on tweet content b/c we can't rely on stream order.
 *
 * @param string $subfolder      Folder name relative to $cli_JSON_BASE_FOLDER.
 * @param object $status         Decoded status; only ->id_str is read.
 * @param string $status_in_json JSON text to store.
 * @return bool true when this call created the folder, false otherwise
 *              (including when JSON keeping is disabled).
 */
function keep_json($subfolder, $status, $status_in_json) {
    global $cli_KEEP_JSON, $cli_JSON_BASE_FOLDER;
    if (!$cli_KEEP_JSON) {
        // Explicit false instead of a bare return so the documented bool contract holds.
        return false;
    }

    $path = "$cli_JSON_BASE_FOLDER/$subfolder";
    $newfolder = false;
    if (!is_dir($path)) {
        // Recursive so a nested $subfolder or a missing base folder works; the
        // second is_dir() tolerates another process creating it at the same time.
        if (!mkdir($path, 0777, true) && !is_dir($path)) {
            die("Unable to create $path");
        }
        $newfolder = true;
    }

    if (FALSE === file_put_contents("$path/$status->id_str.json", $status_in_json)) {
        die("Unable to save $path/$status->id_str.json");
    }
    return $newfolder;
}

/**
 * Insert one element, in JSON form, into the raw_stream_data table.
 *
 * A fresh crawler connection is created and closed on every call. The insert
 * goes through sqlQueryUnique() with the "Duplicate" flag so a duplicate-key
 * error does not cause looping (per the original inline note).
 *
 * @param object $status         Decoded element; ->id_str becomes element_id.
 * @param string $status_in_json JSON text stored in the data column.
 */
function keep_json_db($status, $status_in_json) {
    // Presumably filled in by reference by createCrawlerConnection() — defined here
    // so the variable exists either way.
    $mysqli = null;
    createCrawlerConnection($mysqli);

    // Both interpolated values are escaped; id_str comes from remote data as well.
    $id = $mysqli->real_escape_string((string) $status->id_str);
    $value = $mysqli->real_escape_string($status_in_json);

    $query = "INSERT into raw_stream_data(element_id, source, data) VALUES('$id', 1, '$value')"; //source is 1 for twitter
    sqlQueryUnique($mysqli, $query,"Duplicate"); //will insert and prevent looping with this extra flag - the error of duplicated
    sqlClose($mysqli, 'cli_filter');
}

/**
 * Stream handler for every tweet: stores it, optionally parses it, and triggers
 * an upload when the active sub-folder changes.
 *
 * Elements without a ->text property are ignored. Sets the globals $textDir and
 * $metadataDir (read by parseMetaDataLine) and records the sub-folder in
 * $cli_filter_last_folder. In the non-parsing mode the script terminates once
 * $streamHandlerTimeout lies in the past.
 *
 * @param object $status Decoded stream element.
 */
function stream_handler_function_all($status) {
    global $metadataDir, $textDir; //parameters for parseMetaDataLine in parseTextAndMetadata.php
    global $cli_filter_last_folder; //a change of this folder triggers the upload
    global $cli_CALL_ParseTextAndMetadata, $cli_TMP_FOLDER;
    global $ivWorkerName;
    global $streamHandlerTimeout;

    // Treat only tweets.
    if (!isset($status->text)) {
        return;
    }

    $subfolder = get_filter_subfolder();
    $textDir = "$cli_TMP_FOLDER/text/$subfolder"; //this is where run_autosendfolder.bsh expects to find the folder
    $metadataDir = "$cli_TMP_FOLDER/extradata/$subfolder";

    $status_in_json = json_encode($status);

    $textFolderMissing = !is_dir($textDir);
    $hadPreviousFolder = !empty($cli_filter_last_folder);

    if ($textFolderMissing && $hadPreviousFolder && $cli_CALL_ParseTextAndMetadata) {
        upload_folder(1); //run_autosendfolder on text
    } elseif (keep_json($subfolder, $status, $status_in_json) && $hadPreviousFolder) {
        upload_folder_json($cli_filter_last_folder); //run_filterandparse on the json folder
    }

    if (!$cli_CALL_ParseTextAndMetadata) {
        printf("\r$status->id_str");
        if (isset($streamHandlerTimeout) && $streamHandlerTimeout < new DateTime()) {
            die("stream_handler_function_all terminated gracefully after reaching timeout!");
        }
    } else {
        // Re-decode as an associative array, the form parseMetaDataLine is given here.
        $astatus = json_decode($status_in_json, true);
        parseMetaDataLine($astatus);
    }

    $cli_filter_last_folder = $subfolder;
}
 
/**
 * Fetch the replies to one tweet and store the tweet plus its replies in the DB
 * when there are more than $comments_min of them.
 *
 * Replies are located by get_conversation(), which searches for tweets addressed
 * to the author and keeps those whose in_reply_to_status_id_str matches.
 *
 * @param object $status Decoded tweet; reads ->id_str and ->user->screen_name.
 */
function fetchThreadPerTweet($status){
    global $comments_min;

    $id_str = $status->id_str;
    $screen_name = $status->user->screen_name;
    $url = "https://twitter.com/$screen_name/status/$id_str"; // only used in the trace line
    write_to_log("TRACE", "Working on id: $id_str AND username: $screen_name AND URL: $url");

    $thread = get_conversation($id_str, $screen_name);

    // Guard against a non-array result before counting.
    if (!is_array($thread) || count($thread) <= $comments_min) {
        return;
    }

    keep_json_db($status, json_encode($status));
    foreach ($thread as $post) {
        keep_json_db($post, json_encode($post));
    }
}

/**
 * Return the tweets that directly reply to a given tweet.
 *
 * Runs a 'search/tweets' query for "to:<screen_name>" with since_id set to the
 * tweet id and keeps the results whose in_reply_to_status_id_str equals $id_str.
 * On a rate-limit error (code 88) it logs, sleeps 600 seconds and returns an
 * empty list; any other response without a statuses array also yields an empty list.
 *
 * @param string $id_str           Id of the tweet whose replies are wanted.
 * @param string $screen_name      Screen name of that tweet's author.
 * @param string $return_type      Unused; kept for interface compatibility.
 * @param int    $count            Passed through as the 'count' search parameter.
 * @param string $result_type      Passed through as 'result_type'.
 * @param bool   $include_entities Passed through as 'include_entities'.
 * @return array List of decoded reply objects (possibly empty).
 */
function get_conversation($id_str, $screen_name, $return_type = 'json', $count = 10000, $result_type = 'mixed', $include_entities = true) {
    // Anchored to this file's directory and loaded once, matching the require at the top of the file.
    require_once __DIR__ . '/vendor/autoload.php';

    $connection = new TwitterOAuth(tweeterCK, tweeterCS, tweeterAT, tweeterAS);

    $params = array(
        'q' => 'to:' . $screen_name, // no need to urlencode this!
        'count' => $count,
        'result_type' => $result_type,
        'include_entities' => $include_entities,
        'since_id' => $id_str
    );

    $feed = $connection->get('search/tweets', $params);

    //check if rate exceeded
    if (is_object($feed) && property_exists($feed, "errors")) {
        if (isset($feed->errors[0]) && $feed->errors[0]->code == 88) {
            write_to_log("ERROR", $feed->errors[0]->message);
            write_to_log("ERROR", "Sleeping for 600 seconds");
            sleep_with_prompt(600);
        }
    }

    // An error response carries no statuses; counting a missing property is a
    // TypeError on PHP 8, so bail out with an empty list instead.
    if (!is_object($feed) || !isset($feed->statuses) || !is_array($feed->statuses)) {
        return array();
    }

    $comments = array();
    foreach ($feed->statuses as $candidate) {
        if (($candidate->in_reply_to_status_id_str ?? null) == $id_str) {
            $comments[] = $candidate;
        }
    }

    return $comments;
}

/**
 * Twitter crawler handler that stores tweets in the DB.
 *
 * - Elements without ->text are ignored.
 * - When $comments_min > 0 only tweets whose reply_count exceeds it are handled,
 *   via fetchThreadPerTweet(); nothing else happens for that element.
 * - Otherwise the tweet is stored with keep_json_db(), then the crawler_settings
 *   isRunning flag and $streamHandlerTimeout are checked and the script dies
 *   when the service was stopped or the timeout has passed.
 *
 * @param object $status          Decoded tweet.
 * @param bool   $ignoreIsRunning When true, skip the progress output and do not
 *                                die on a cleared isRunning flag.
 */
function stream_handler_function_all_db($status, $ignoreIsRunning = false) {
    global $streamHandlerTimeout, $comments_min;

    // Treat only tweets.
    if (!isset($status->text)) {
        return;
    }

    //for min comments - fetch the thread of each tweet (or RT or QT of this tweet)
    if ($comments_min > 0) {
        if (isset($status->reply_count) && $status->reply_count > $comments_min) {
            fetchThreadPerTweet($status);
        }
        return;
    }

    keep_json_db($status, json_encode($status));

    if (!$ignoreIsRunning) {
        printf("\r$status->id_str");
    }

    // The connection is opened only now: opening it at the top of the function
    // leaked one connection for every element that left through an early return.
    $mysqli = sqlCreateConnection("stream_handler_function_all_db");
    $res = sqlQuery($mysqli, "SELECT isRunning FROM crawler_settings WHERE prop='crawler'");
    $obj = $res ? $res->fetch_object() : null;
    sqlClose($mysqli, "stream_handler_function_all_db");

    // A missing settings row is treated as "not running".
    $isRunning = $obj ? $obj->isRunning : false;
    if (!$isRunning && !$ignoreIsRunning) {
        die("Service stopped by DB");
    }

    if (isset($streamHandlerTimeout) && $streamHandlerTimeout < new DateTime()) {
        die("stream_handler_function_all terminated gracefully after reaching timeout!");
    }
}

/**
 * HTML-escape a string as UTF-8, converting both single and double quotes.
 *
 * @param string $str Raw text.
 * @return string Escaped text.
 */
function h($str) {
    $flags = ENT_QUOTES;
    $charset = 'UTF-8';
    return htmlspecialchars($str, $flags, $charset);
}

/**
 * Build a Cowitter client from the tweeterCK/CS/AT/AS credential constants.
 *
 * @param bool $avoid_print When false, the access token in use is printed first.
 * @return Client|null The client, or null when construction threw one of the
 *                     caught exceptions (the exception is printed).
 */
function mylogin($avoid_print=false) {
    if (!$avoid_print) {
        printf("Connecting using access token:%s\n", tweeterAT);
    }

    // Defined up front so the return below never reads an undefined variable
    // when one of the catch branches is taken.
    $client = null;
    try {
        // Create a client object
        $client = new Client([
            tweeterCK,
            tweeterCS,
            tweeterAT,
            tweeterAS,
        ]);
    } catch (HttpException $e) {
        echo "\ncURL communication successful but something went wrong with Twitter APIs.\n";
        print_r($e);
    } catch (CURLException $e) {
        echo "\ncURL communication failed.\n";
        print_r($e);
    }
    return $client;
}

/**
 * Demo stream handler: print "name(screen_name) - text" for each tweet.
 *
 * HTML entities in the text are decoded except for quotes (ENT_NOQUOTES).
 * Elements without ->text produce no output.
 *
 * @param object $status Decoded stream element.
 */
function demo_handler_function($status) {
    if (isset($status->text)) {
        $author = $status->user;
        $plainText = htmlspecialchars_decode($status->text, ENT_NOQUOTES);
        printf("%s(%s) - %s\n", $author->name, $author->screen_name, $plainText);
    }
}

/**
 * Prepare the filesystem and state for a streaming run.
 *
 * Resets $cli_filter_last_folder, creates $cli_JSON_BASE_FOLDER (recursively)
 * when JSON keeping is on, and creates the parent folder of $cli_SINK_FILE,
 * terminating the script when that last folder cannot be created.
 */
function filter_app_inits() {
    global $cli_filter_last_folder;
    global $cli_KEEP_JSON, $cli_JSON_BASE_FOLDER, $cli_SINK_FILE;

    $cli_filter_last_folder = "";

    if ($cli_KEEP_JSON) {
        if (!is_dir($cli_JSON_BASE_FOLDER)) {
            mkdir($cli_JSON_BASE_FOLDER, 0777, true);
        }
    }

    $parent = dirname($cli_SINK_FILE);
    if (!is_dir($parent) && !mkdir($parent)) {
        die("unable to create $parent!");
    }
}

/**
 * Connect to the streaming API and invoke $handler for every element, forever.
 *
 * Each pass logs in, streams until the connection ends or throws, reports how
 * long it lasted and then sleeps 61 seconds before reconnecting.
 * When using a SINK_FILE it can be gracefully stopped by changing the name of
 * the sink folder.
 *
 * @param string   $url_api Streaming endpoint handed to the client.
 * @param callable $handler Callback invoked per element.
 * @param array    $params  Request parameters.
 */
function filter_app_core($url_api, $handler, $params) {
    filter_app_inits();
    for (;;) {
        $client = mylogin();
        $startedAt = time();
        printf("\nstart stream\n");
        try {
            $client->streaming($url_api, $handler, $params);
        } catch (Exception $e) {
            echo "Stream unexpectedly broke with message: " . $e->getMessage();
        } finally {
            $end_time = time() - $startedAt;
            echo "\nfilter_app_core resigned after $end_time seconds! wait for a minute\n";
            if (sleep(61) !== 0) {
                die("\nCould not go to sleep\n");
            }
        }
    }
}


/**
 * DB-controlled variant of filter_app_core(): streams forever and mirrors the
 * run state into crawler_settings (isRunning / error).
 *
 * Before each stream isRunning is set to 1 and error is cleared. On an exception
 * the message is stored in the error column, echoed and logged; two specific
 * messages ("No filter parameters found..." and "Exceeded connection limit for
 * user") terminate the script. After every pass isRunning is reset to 0 and the
 * loop sleeps 61 seconds.
 *
 * @param string   $url_api Streaming endpoint handed to the client.
 * @param callable $handler Callback invoked per element.
 * @param array    $params  Request parameters.
 * @param mixed    $mysqli  Open connection accepted by sqlQuery().
 */
function filter_app_core_db($url_api, $handler, $params, $mysqli){
    filter_app_inits();
    while (true) {
        $client = mylogin();
        $start_time = time();
        printf("\nstart stream\n");
        try {
            sqlQuery($mysqli, "UPDATE crawler_settings SET isRunning=1,error='' WHERE prop='crawler'");
            $client->streaming($url_api, $handler, $params);
        } catch (Exception $e) {
            $error_msg = "Stream unexpectedly broke with message: " . $e->getMessage();

            // Exception text may contain quotes; escape it so the UPDATE neither
            // breaks nor becomes injectable. addslashes() is only a fallback for a
            // connection that is not a mysqli instance.
            $escaped_msg = ($mysqli instanceof mysqli)
                ? $mysqli->real_escape_string($error_msg)
                : addslashes($error_msg);
            sqlQuery($mysqli, "UPDATE crawler_settings SET isRunning=0,error='$escaped_msg' WHERE prop='crawler'");
            echo $error_msg;
            write_to_log("ERROR", $error_msg);

            // These two conditions cannot be cured by reconnecting.
            if ($e->getMessage() == "No filter parameters found. Expect at least one parameter: follow track locations" ||
                $e->getMessage() == "Exceeded connection limit for user" ) {
                die();
            }
        } finally {
            sqlQuery($mysqli, "UPDATE crawler_settings SET isRunning=0 WHERE prop='crawler'");
            $end_time = time() - $start_time;
            echo "\nfilter_app_core resigned after $end_time seconds! wait for a minute\n";
            sleep(61) === 0 or die("\nCould not go to sleep\n");
        }
    }
}




/**
 * Run the streaming loop with no request parameters.
 *
 * @param string   $url_api Streaming endpoint.
 * @param callable $handler Callback invoked per element.
 */
function filter_app($url_api, $handler) {
    filter_app_core($url_api, $handler, array());
}

/**
 * Run the streaming loop with parameters read from the "twitter_stream_defs"
 * JSON definition file.
 *
 * @param string   $url_api Streaming endpoint.
 * @param callable $handler Callback invoked per element.
 */
function filter_app_advance($url_api, $handler) {
    // check_def_json() returns the path used below (per the original note it
    // checks that the json file exists).
    $jsonfile = check_def_json("twitter_stream_defs");

    // Read the file once; the original read it twice and discarded the first copy.
    $data = file_get_contents($jsonfile);
    $params = json_decode($data, true);

    filter_app_core($url_api, $handler, $params);
}

/**
 * Run the DB-controlled streaming loop with parameters loaded from the
 * crawler_settings table.
 *
 * @param string   $url_api Streaming endpoint.
 * @param callable $handler Callback invoked per element.
 */
function filter_app_advance_db($url_api, $handler) {
    $connection = sqlCreateConnection("filter_app_advance_db");
    $streamParams = getCrawlerParmas($connection);

    filter_app_core_db($url_api, $handler, $streamParams, $connection);
}

/**
 * Load the streaming parameters from the 'crawler' row of crawler_settings.
 *
 * Only the non-empty columns among track, locations, follow and languages are
 * returned, keyed by column name.
 *
 * @param mixed $mysqli Open connection accepted by sqlQuery().
 * @return array Parameter map; empty when the row is missing or all columns are empty.
 */
function getCrawlerParmas($mysqli){
    $params = [];
    $sql = "SELECT track, locations, follow, languages FROM crawler_settings WHERE prop=\"crawler\" ";
    $res = sqlQuery($mysqli, $sql);

    // fetch_assoc() yields null when there is no row; iterating null would warn.
    $row = $res ? $res->fetch_assoc() : null;
    if (!is_array($row)) {
        return $params;
    }

    foreach ($row as $key => $element) {
        if (!empty($element)) {
            $params[$key] = $element;
        }
    }

    return $params;
}

/**
 * Sleep for $timetowait seconds, one second at a time, printing a countdown.
 *
 * When a connection is supplied, checkIsRunning() is called whenever the
 * remaining time is a multiple of 10, so a stop request can end the wait.
 *
 * @param int   $timetowait Seconds to wait; values <= 0 return immediately.
 * @param mixed $mysqli     Optional connection for the stop check.
 */
function sleep_with_prompt($timetowait, $mysqli = null) {
    $remaining = $timetowait;
    while ($remaining > 0) {
        $remaining--;
        if (sleep(1) !== 0) {
            die("\nCould not go to sleep\n");
        }
        if ($mysqli && $remaining % 10 == 0) {
            checkIsRunning($mysqli);
        }
        printf("\rWaiting %04s", $remaining);
    }
}

/**
 * Upload the pending text folders, then wait out the rest of the window.
 *
 * Calls perform_uploadFolder(), reports how many uploads ran and what they
 * returned, and finally sleeps for the remainder of $sectowait (at least one
 * second).
 *
 * @param int $sectowait Length of the window in seconds.
 */
function upload_folder($sectowait) {
    global $cli_BASH_Path;

    $startedAt = time();
    $errorlevels = [];
    echo "\nUploading folder to intuscan - wait $sectowait seconds until the next window\n";

    //TODO: the obsolete bash-based path (run_autosendfolder.bsh via $cli_BASH_Path) is no longer called here.
    perform_uploadFolder($errorlevels);

    $elapsed = time() - $startedAt;
    echo count($errorlevels) . " Auto Send Folder(s) returned " . implode(",", $errorlevels) . " after $elapsed seconds\n"; //should be 0 if all O.K.

    sleep_with_prompt(max(1, $sectowait - $elapsed));
    echo "\n";
}


/**
 * Move every sub-folder of "<tmp>/text" to $uploadedFilesPath and start
 * autoSendFolder.php for each one.
 *
 * For each sub-folder: copy it to the uploaded-files path, verify the copy
 * exists (log and skip otherwise), remove the source, then append the result of
 * the autoSendFolder request to $errorlevels.
 *
 * @param array       $errorlevels Receives one entry per processed sub-folder.
 * @param string|null $from        When non-empty, replaces the global $cli_TMP_FOLDER.
 */
function perform_uploadFolder(&$errorlevels,$from=null)
{
    global $cli_TMP_FOLDER;
    global $uploadedFilesPath;

    if (!empty($from)) {
        $cli_TMP_FOLDER = $from;
    }

    $textRoot = "$cli_TMP_FOLDER/text";
    if (!is_dir($textRoot)) {
        return;
    }

    if (!is_dir($uploadedFilesPath)) {
        mkdir($uploadedFilesPath, 0777, true);
    }

    foreach (scandir($textRoot) as $subDir) {
        if ($subDir == "." || $subDir == "..") {
            continue;
        }
        $txtDir = "$textRoot/$subDir";
        if (!is_dir($txtDir)) {
            continue;
        }

        //copy text from cli-tmp to uploaded files path
        OSSpecific::getInstance()->copyFolder($txtDir, $uploadedFilesPath.DIRECTORY_SEPARATOR.$subDir);
        if (!is_dir("$uploadedFilesPath/$subDir")) {
            write_to_log("ERROR", "Failed to copy $subDir from $cli_TMP_FOLDER to $uploadedFilesPath or from/to extradata folder");
            continue;
        }

        //remove text from tmp
        OSSpecific::getInstance()->rmDir($txtDir);

        $errorlevels[] = asyncRequst::getInstance()->runPhpPage("tools/cowitter/autoSendFolder.php", [$subDir], false);
    }
}

/**
 * Append a sub-folder name, followed by a newline, to the sink file.
 *
 * @param string $subfolder Folder name to queue.
 * @throws Exception When the sink file cannot be written.
 */
function upload_folder_json($subfolder) {
    global $cli_SINK_FILE;

    $written = file_put_contents($cli_SINK_FILE, "$subfolder\n", FILE_APPEND);
    if ($written === FALSE) {
        throw new Exception("Failed to add folder to upload list!");
    }
}

/**
 * Handle one timeline tweet: track the highest id seen, store the JSON and
 * either parse it or print its creation time.
 *
 * Sets the globals $textDir / $metadataDir (read by parseMetaDataLine) to an
 * hour-stamped sub-folder and updates the global $last_id.
 * Elements without ->text are ignored.
 *
 * @param object $status Decoded tweet.
 */
function follow_handler_function($status) {
    global $metadataDir, $textDir; //parameters for parseMetaDataLine in parseTextAndMetadata.php
    global $last_id;
    /* @var $cli_CALL_ParseTextAndMetadata bool */
    global $cli_CALL_ParseTextAndMetadata, $cli_TMP_FOLDER;

    // Treat only tweets.
    if (!isset($status->text)) {
        return;
    }

    //hour-stamped because once an hour the folder is moved to analysis
    $subfolder = date('Y-m-d-G');
    $textDir = "$cli_TMP_FOLDER/text/$subfolder"; //this is where run_autosendfolder.bsh expects to find the folder
    $metadataDir = "$cli_TMP_FOLDER/extradata/$subfolder";
    $last_id = max($status->id_str, $last_id);

    $status_in_json = json_encode($status);
    keep_json($subfolder, $status, $status_in_json);

    if (!$cli_CALL_ParseTextAndMetadata) {
        printf("\r$status->created_at");
        return;
    }

    $astatus = json_decode($status_in_json, true); //convert to associative array
    parseMetaDataLine($astatus);
}

/**
 * Feed every status of a list through follow_handler_function().
 *
 * @param iterable $statuses Decoded tweets.
 */
function save2text($statuses) {
    foreach ($statuses as $oneStatus) {
        follow_handler_function($oneStatus);
    }
}

/**
 * Load the de-duplicated list of users to follow from
 * "$projectConfigFolder/resources/user_screen_names.json".
 *
 * Terminates the script when the file is missing. Prints how many entries were
 * loaded.
 *
 * @return array Re-indexed list of unique entries.
 */
function load_users() {
    global $projectConfigFolder;
    $jsonfile = "$projectConfigFolder/resources/user_screen_names.json";

    if (!file_exists($jsonfile)) {
        die("File $jsonfile does not exists");
    }

    $decoded = json_decode(file_get_contents($jsonfile));
    $unique = array_unique($decoded, SORT_REGULAR);
    $users2follow = array_values($unique);

    echo "\nLoaded " . count($users2follow) . " users ids\n";
    return $users2follow;
}


/**
 * Get a user's tweets from the full-archive search (thin wrapper around
 * curlGetHistory()).
 *
 * @param string $user        User to search for.
 * @param string $lang        Language filter.
 * @param string $from_date   Oldest timestamp to include.
 * @param string $to_date     Accepted but not forwarded to the search.
 * @param int    $max_results Page size handed to the search.
 * @return mixed Whatever curlGetHistory() returns.
 */
function getUserHistory($user, $lang, $from_date, $to_date, $max_results = 500){
    return curlGetHistory($user, $lang, $max_results, $from_date);
}

/**
 * Query the full-archive search endpoint for "from:<user> lang:<lang>" and
 * return all result pages merged into one list.
 *
 * Pages are followed recursively through the response's "next" token; later
 * pages are placed before earlier ones, as in the original implementation.
 *
 * @param string      $user        User to search for.
 * @param string      $lang        Language filter.
 * @param int|string  $max_results Sent as "maxResults".
 * @param string      $from_date   Sent as "fromDate".
 * @param string|null $next_token  Pagination token of the page to fetch.
 * @return array|object|null List of results; the decoded response object when it
 *                           carries an "error" property (this is what
 *                           follow_history() tests for); null when the transfer
 *                           failed or the body was not a JSON object.
 */
function curlGetHistory($user, $lang, $max_results, $from_date, $next_token = null){
    // Build the body with json_encode so quotes or backslashes in any argument
    // cannot corrupt the JSON. Values stay strings, as in the original payload.
    $payload = array(
        'query' => "from:$user lang:$lang",
        'maxResults' => "$max_results",
        'fromDate' => "$from_date",
    );
    if (!empty($next_token)) {
        $payload['next'] = "$next_token";
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, 'https://api.twitter.com/1.1/tweets/search/fullarchive/prod.json');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($payload));
    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        'Authorization: Bearer '.tweeterBK,
        'Content-Type: application/json',
    ));

    $results = curl_exec($ch);
    if (curl_errno($ch)) {
        echo 'Error:' . curl_error($ch);
    }
    // Closed before recursing so only one handle is open at a time.
    curl_close($ch);

    $res = is_string($results) ? json_decode($results) : null;
    if (!is_object($res)) {
        return null;
    }
    if (property_exists($res, 'error')) {
        // Hand the error response to the caller so it can log and retry.
        return $res;
    }

    $page = (isset($res->results) && is_array($res->results)) ? $res->results : array();

    if (!empty($res->next)) {
        $next_res = curlGetHistory($user, $lang, $max_results, $from_date, $res->next);
        if (is_object($next_res)) {
            // A later page failed: propagate the error instead of a partial list.
            return $next_res;
        }
        if (!empty($next_res)) {
            $page = array_merge($next_res, $page);
        }
    }

    return $page;
}

/**
 * Load user ids from "list-of-followers-id.json" in the current directory.
 *
 * Terminates the script when the file is missing. Prints how many entries were
 * loaded.
 *
 * @return mixed Decoded JSON content of the file.
 */
function load_users_old() {
    $source = "list-of-followers-id.json";
    if (!file_exists($source)) {
        die("File \"list-of-followers-id.json\" does not exists");
    }

    $users2follow = json_decode(file_get_contents($source));
    echo "\nLoaded " . count($users2follow) . " users ids\n";
    return $users2follow;
}


/**
 * Poll the timeline of every followed user and push the tweets through
 * stream_handler_function_all_db().
 *
 * After $cli_RATE_LIMIT requests the pending folders are uploaded and the loop
 * waits for the rate-limit window. At the end the current $last_id is written to
 * cli_filter_application_inits.php and a final upload is triggered.
 */
function follow_app() {
    global $last_id, $cli_RATE_LIMIT, $cli_TWITTER_RATE_LIMIT_WINDOW;
    $client = mylogin();
    $users2follow = load_users();

    $parameters = array();
    $parameters['count'] = 10;
    if (!empty($last_id)) {
        $parameters['since_id'] = "$last_id"; //taken from cli_filter_application_inits.php
    }

    echo "\nGet with last_id=$last_id\n";
    $loop = $cli_RATE_LIMIT;
    foreach ($users2follow as $user) {
        if (--$loop < 1) {
            upload_folder(60 * $cli_TWITTER_RATE_LIMIT_WINDOW); //wait for the rate-limit window
            $loop = $cli_RATE_LIMIT;
        }
        $parameters['user_id'] = "$user";

        // Reset per user: after a failed request the original kept the previous
        // user's result (or an undefined variable) and processed it again.
        $statuses = array();
        try {
            // Get the tweets on this user's timeline
            $statuses = $client->get('statuses/user_timeline', $parameters);
        } catch (HttpException $e) {
            echo "\nException in user: $user - " . $e->getMessage() . "\n";
        } catch (CURLException $e) {
            echo "\ncURL communication failed.\n";
            print_r($e);
        }

        // fetchUserHistory() iterates the same endpoint's result as a plain list;
        // accept that shape as well as an object carrying ->results.
        if (is_array($statuses)) {
            $items = $statuses;
        } elseif (isset($statuses->results) && is_array($statuses->results)) {
            $items = $statuses->results;
        } else {
            $items = array();
        }

        //act as stream
        foreach ($items as $status) {
            stream_handler_function_all_db($status);
        }
    }

    // NOTE(review): this relative path is resolved against the working directory,
    // while the file is required from __DIR__ at the top of the script — confirm
    // both point at the same file.
    $NEW_INITS = "<?php \$last_id=\"$last_id\"; ?>";
    file_put_contents("cli_filter_application_inits.php", $NEW_INITS); //for later resurrection

    upload_folder(1); //run_autosendfolder
}

/**
 * Fetch one user's timeline and store each tweet through
 * stream_handler_function_all_db() with the isRunning check ignored.
 *
 * @param string|int $user User id.
 * @return int Number of statuses processed (0 when the request failed).
 */
function fetchUserHistory($user){
    $client = mylogin(true);

    $parameters = array('user_id' => "$user");

    // Defined before the request so a caught exception leaves an empty list
    // rather than an undefined variable (count(null) is a TypeError on PHP 8).
    $statuses = array();
    try {
        $statuses = $client->get('statuses/user_timeline', $parameters);
    } catch (HttpException $e) {
        echo "\nException in user: $user - " . $e->getMessage() . "\n";
    } catch (CURLException $e) {
        echo "\ncURL communication failed.\n";
        print_r($e);
    }

    if (!is_array($statuses)) {
        return 0;
    }

    foreach ($statuses as $status) {
        stream_handler_function_all_db($status, true);
    }
    return count($statuses);
}
/**
 * Fetch the last month of history for every followed user and push each tweet
 * through stream_handler_function_all_db().
 *
 * While the search returns an object carrying an "error" property, the error is
 * logged and the request is retried after a 1000-second wait.
 */
function follow_history() {
    global $comments_min, $pref_lang, $tweets_limit_history;

    // YYYYMMDDHHmm needs 'H' (24-hour) and 'i' (minutes); the original "Ymdhm"
    // produced a 12-hour clock followed by the month number. gmdate() is used
    // because the search API documents these timestamps as UTC.
    $to_date = gmdate("YmdHi");
    $from_date = gmdate("YmdHi", strtotime("-1 month")); //from one month ago and forward

    // Result is unused below; the call is kept because mylogin() prints the
    // connection banner.
    $client = mylogin();
    $users2follow = load_users();

    foreach ($users2follow as $user) {
        $statuses = array();
        try {
            $statuses = getUserHistory($user, $pref_lang ?? "en", $from_date, $to_date, $tweets_limit_history);
        } catch (HttpException $e) {
            echo "\nException in user: $user - " . $e->getMessage() . "\n";
            print_r($e);
        } catch (CURLException $e) {
            echo "\ncURL communication failed.\n";
            print_r($e);
        }

        //sleep and retry while an error is reported; property_exists() requires an
        //object, so lists and null must be excluded first
        while (is_object($statuses) && property_exists($statuses, 'error')) {
            $message = isset($statuses->error->message) ? $statuses->error->message : json_encode($statuses->error);
            write_to_log("ERROR", $message);

            sleep_with_prompt(1000);
            $statuses = getUserHistory($user, $pref_lang ?? "en", $from_date, $to_date, $tweets_limit_history);
        }

        if (!is_array($statuses)) {
            continue;
        }

        //act as stream
        foreach ($statuses as $status) {
            stream_handler_function_all_db($status);
        }
    }
}

/**
 * For every screen name listed in $jsonfile, download all follower ids and save
 * them to "list-of-followers-id-<screen_name>.json".
 *
 * Follower ids are fetched page by page through the cursor; between pages the
 * loop sleeps 61 seconds to stay within one request per minute. An API or
 * transport exception ends the paging for that screen name and whatever was
 * collected is still written. Terminates the script when the input file is
 * missing or an output file cannot be saved.
 *
 * @param string $jsonfile Path of a JSON file holding a list of screen names.
 */
function get_followers_app($jsonfile) {
    if (!file_exists("$jsonfile")) {
        die("File \"$jsonfile\" does not exists");
    }

    $client = mylogin();

    echo "Read from \"$jsonfile\"";
    $screenNames = json_decode(file_get_contents($jsonfile));
    foreach ($screenNames as $target_screen_name) {
        echo "\nGet followers of $target_screen_name\n";
        $followerIds = array();
        $next_cursor = -1;
        do {
            //prevent Rate limit exceeding Exception (one request per minute)
            if ($next_cursor >= 0 && sleep(61) !== 0) {
                die("\nCould not go to sleep\n");
            }

            try {
                $page = $client->get('followers/ids.json', [
                    'screen_name' => "$target_screen_name",
                    'count' => '5000',
                    'stringify_ids' => 'true',
                    'cursor' => "$next_cursor"
                ]);

                $followerIds = array_merge($followerIds, $page->ids);
                $next_cursor = $page->next_cursor_str;
                echo "\n$next_cursor";
            } catch (HttpException $e) {
                // cURL communication successful but something went wrong with Twitter APIs.
                $message = $e->getMessage();    // Message
                $code = $e->getCode();          // Error code (-1 if not available)
                $status = $e->getStatusCode();  // HTTP status code
                echo "\nmessage:$message", "\ncode:$code", "\nstatus:$status";
                break;
            } catch (Exception $e) {
                echo "\nCommunication stopped unexpectedly.\n";
                print_r($e);
                break;
            }
        } while ($next_cursor != 0);

        $saved = file_put_contents("list-of-followers-id-$target_screen_name.json", json_encode($followerIds, true));
        if ($saved === FALSE) {
            die("can't save file \"list-of-followers-id-$target_screen_name.json\"");
        }
    }
}

/**
 * Pick the first sub-folder of $cli_JSON_BASE_FOLDER to work on.
 *
 * It should not choose the same folder the crawler uses, so when the listing
 * fails or holds three entries or fewer (".", ".." and at most one folder) it
 * waits 1000 seconds and returns NULL.
 *
 * @return string|null Name of the first entry after "." and "..", or NULL.
 */
function select_current_folder() {
    global $cli_JSON_BASE_FOLDER;

    $entries = scandir($cli_JSON_BASE_FOLDER);
    $enoughFolders = ($entries !== FALSE) && (count($entries) > 3);
    if (!$enoughFolders) {
        sleep_with_prompt(1000); //the filter is ahead of the crawler so wait a bit
        return NULL;
    }

    //the first two entries are . and ..
    return $entries[2];
}

/**
 * Test whether a field value matches a rule pattern, case-insensitively.
 *
 * The field is passed through normalize_rule_string() and matched with
 * mb_eregi(). Empty fields never match.
 *
 * @param mixed  $field Field value taken from the element.
 * @param string $value Pattern handed to mb_eregi().
 * @return bool
 */
function compare_json_field($field, $value) {
    if (!empty($field)) {
        $normalized = normalize_rule_string($field);
        return mb_eregi("$value", "$normalized") > 0;
    }
    return false;
}


/**
 * Collect the names of the rule fields that an element matches.
 *
 * Walks the properties of $rules: 'min_rules' and 'mand_rules' are skipped,
 * nested rule objects are applied recursively to the corresponding sub-array of
 * $status, and every other rule is tested with compare_json_field().
 *
 * @param array|null $status Element (or sub-element) as an associative array.
 * @param object     $rules  Rule object.
 * @return array Names of the matching fields (names inside nested rules are
 *               merged into the same flat list).
 */
function is_according_to_rules($status, $rules) {
    $pass = array();
    foreach (get_object_vars($rules) as $field => $value) {
        if ($field == 'min_rules' || $field == 'mand_rules') {
            continue;
        }

        // Elements frequently lack a field named by a rule, and a missing parent
        // arrives here as null; ?? reads it as null without raising warnings.
        $actual = $status["$field"] ?? null;

        if (is_object($value)) {
            $pass = array_merge($pass, is_according_to_rules($actual, $value));
        } else if (compare_json_field($actual, $value)) {
            $pass[] = "$field";
        }
    }
    return $pass;
}

/**
 * Tag an element that passed the filter and hand it to parseMetaDataLine().
 *
 * Sets the global $metadataDir to "<tmp>/text/<subfolder>", stores the source
 * type under 'FullFolderPath' and adds a "filter_passed_rule_<name>" = "true"
 * entry for each passed rule.
 *
 * @param array  $astatus   Element as an associative array.
 * @param string $subfolder Target sub-folder name.
 * @param array  $pass      Names of the rules that matched.
 * @param string $sm_type   Source type label.
 */
function filter_and_parse_handler($astatus, $subfolder, $pass, $sm_type) {
    global $metadataDir, $textDir; //parameters for parseMetaDataLine in parseTextAndMetadata.php
    global $cli_TMP_FOLDER;

    $metadataDir = "$cli_TMP_FOLDER/text/$subfolder";

    $astatus['FullFolderPath'] = $sm_type;
    foreach ($pass as $ruleName) {
        $flagKey = "filter_passed_rule_$ruleName";
        $astatus[$flagKey] = "true";
    }

    parseMetaDataLine($astatus);
}

/**
 * Tell whether the quota consumed so far is below the share allowed for the
 * current hour of the day.
 *
 * $count starts at the daily quota and is decremented towards zero, so
 * (quota - $count) is the amount already used. The allowed share grows linearly
 * with the hour: quota * (hour + 1) / 24.
 *
 * @param int $count Remaining quota.
 * @return bool
 */
function is_below_quota($count) {
    global $cli_filter_and_parse_app_QUOTA;

    $used = $cli_filter_and_parse_app_QUOTA - $count;
    $allowedByNow = $cli_filter_and_parse_app_QUOTA * ((date('H') + 1.0) / 24.0);
    return $used < $allowedByNow;
}

/**
 * Run one element through the filter and, when accepted, parse it and charge
 * the quota.
 *
 * An element is accepted when it passed at least one rule or the hourly quota
 * still has room, and — unless $cli_FILTER_NOT_DEDUP is set —
 * tryInsertUnique() succeeds for its id. Accepted elements are handed to
 * filter_and_parse_handler(), the global counter is decremented and written to
 * $cli_QUOTA_FILE together with the day of the year.
 *
 * @param mixed        $connection    DB connection, passed by reference to tryInsertUnique().
 * @param array|string $json          Element as an associative array or JSON text.
 * @param object       $rules         Include rules.
 * @param object|false $exclude_rules Exclude rules, or false for none.
 * @param string       $label         Sub-folder label for the parsed output.
 */
function do_once_per_tweet(&$connection, $json, $rules, $exclude_rules, $label) {
    global $filter_and_parse_counter, $cli_QUOTA_FILE, $cli_FILTER_NOT_DEDUP;

    if (!is_array($json)) {
        $json = json_decode($json, true);
    }
    $passed_rules = passed_rules($json, $rules, $exclude_rules);

    $id_type_sm = getPostIdType($json);
    if (empty($id_type_sm)) {
        return; //no data tweet
    }
    $id_sm = $id_type_sm["id_sm"];
    $sm_type = $id_type_sm["sm_type"];

    if (count($passed_rules) <= 0 && !is_below_quota($filter_and_parse_counter)) {
        return;
    }
    if (!$cli_FILTER_NOT_DEDUP && !tryInsertUnique($connection, $sm_type, $id_sm)) {
        return;
    }

    write_to_log("TRACE", "the tweet passed the filter: ".$id_sm);
    write_to_log("TRACE", "the tweet passed for: ".json_encode($passed_rules,true));

    filter_and_parse_handler($json, $label, $passed_rules, $sm_type);
    --$filter_and_parse_counter;
    file_put_contents($cli_QUOTA_FILE, date("z") . " $filter_and_parse_counter");
}

/**
 * Work out which source an element came from and what its id is.
 *
 * The first matching marker key wins, in this order: 'uuid' (webhose),
 * 'facebook_page' (facebook), 'channel' (telegram), 'data' (twitter V2),
 * 'id_str' (Twitter, or the element's own 'type' when set).
 *
 * @param array $json Element as an associative array.
 * @return array|null ['sm_type' => ..., 'id_sm' => ...], or null when no marker
 *                    key is present.
 */
function getPostIdType($json){
    //webhose
    if (array_key_exists('uuid', $json)) {
        return array("sm_type" => "webhose", "id_sm" => $json["uuid"]);
    }
    if (array_key_exists('facebook_page', $json)) {
        return array("sm_type" => "facebook", "id_sm" => $json["t_id"]);
    }
    if (array_key_exists('channel', $json)) {
        return array("sm_type" => "telegram", "id_sm" => $json["id_str"]);
    }
    //twitter V2
    if (array_key_exists('data', $json)) {
        return array("sm_type" => "Twitter", "id_sm" => $json['data']['id']);
    }
    if (array_key_exists('id_str', $json)) {
        $type = isset($json['type']) ? $json['type'] : "Twitter";
        return array("sm_type" => $type, "id_sm" => $json["id_str"]);
    }

    return null;
}

/**
 * Decide whether an element passes the include rules without being rejected by
 * the exclude rules or by a missing mandatory rule.
 *
 * - When $rules->min_rules is absent or 0, every element passes with the single
 *   marker "zero_rules".
 * - Otherwise the matching rule names are collected; when $userAccountsTables is
 *   set, each table in which isUserExistsInTable() finds the element's id_str is
 *   added as well.
 * - The element is rejected when an exclude rule matches a field that no include
 *   rule matched, when a mandatory rule is not among the matches, or when fewer
 *   than min_rules rules matched.
 *
 * @param array        $json          Element as an associative array.
 * @param object       $rules         Include rules (may carry min_rules / mand_rules).
 * @param object|false $exclude_rules Exclude rules, or false for none.
 * @return array Names of the passed rules; empty when the element is rejected.
 */
function passed_rules($json, $rules, $exclude_rules){
    global $userAccountsTables;

    $min_rules = isset($rules->min_rules) ? $rules->min_rules : 0;
    if ($min_rules == 0) {
        return array("zero_rules");
    }
    $mand_rules = isset($rules->mand_rules) ? $rules->mand_rules : null;

    $pass = is_according_to_rules($json, $rules);

    //if userAccountTables is set - add this to the pass value
    if (isset($userAccountsTables)) {
        // No connection exists in this scope; it is defined as null here so the
        // call no longer reads an undefined variable.
        // NOTE(review): presumably isUserExistsInTable() opens one on demand — confirm.
        $connection = null;
        foreach ($userAccountsTables as $table) {
            if (isUserExistsInTable($connection, "Twitter", "accountId", $json['id_str'], $table)) {
                $pass[] = $table;
            }
        }
    }

    // Defined up front: the trace line below reads it even when there are no
    // exclude rules.
    $exclude_pass = array();
    $exclude_bool = false;
    if ($exclude_rules !== false) {
        //fetch the matches for the excluding rules
        $exclude_pass = is_according_to_rules($json, $exclude_rules);
        //an exclude match on a field that $pass does not contain filters the element out
        foreach ($exclude_pass as $exclude_element) {
            if (!in_array($exclude_element, $pass)) {
                $exclude_bool = true;
                break;
            }
        }
    }

    //check mand_rules: every mandatory rule must be among the matches
    $mand_bool = false;
    if (isset($mand_rules)) {
        foreach ($mand_rules as $mand_element) {
            if (!in_array($mand_element, $pass)) {
                $mand_bool = true;
                write_to_log("INFO", "the tweet didnt had the mandatory rule: ".$mand_element);
                break;
            }
        }
    }

    write_to_log("TRACE", "the tweet pass rules: ".json_encode($pass));
    write_to_log("TRACE", "the tweet exclude pass rules: ".json_encode($exclude_pass));

    if ((count($pass) >= $min_rules) && !$exclude_bool && !$mand_bool) {
        return $pass;
    }
    return [];
}


/**
 * Filter every JSON file below "$cli_JSON_BASE_FOLDER/$parent", descending into
 * sub-folders.
 *
 * Each file is decoded to an associative array and handed to
 * do_once_per_tweet(); a connection set during those calls is closed at the end.
 *
 * @param string       $parent        Folder path relative to $cli_JSON_BASE_FOLDER.
 * @param object       $rules         Include rules.
 * @param object|false $exclude_rules Exclude rules, or false for none.
 * @param string       $label         Sub-folder label for the parsed output.
 */
function filter_selected_subfolder($parent, $rules, $exclude_rules, $label) {
    global $cli_JSON_BASE_FOLDER;

    $entries = scandir("$cli_JSON_BASE_FOLDER/$parent");
    $connection = NULL;
    foreach ($entries as $entry) {
        if ("$entry" == "." || "$entry" == "..") {
            continue;
        }

        if (is_dir("$cli_JSON_BASE_FOLDER/$parent/$entry")) {
            filter_selected_subfolder("$parent/$entry", $rules, $exclude_rules, $label);
            continue;
        }

        $raw = file_get_contents("$cli_JSON_BASE_FOLDER/$parent/$entry");
        $json = json_decode($raw, true); //convert to associative array
        do_once_per_tweet($connection, $json, $rules, $exclude_rules, $label); //set connection if null
    }

    if ($connection) {
        sqlClose($connection, 'cli_filter');
    }
}
/**
 * Terminate the script when the isRunning flag in filter_settings is empty.
 *
 * @param mixed $mysql Open connection accepted by sqlQuery().
 */
function checkIsRunning($mysql){
    $result = sqlQuery($mysql, "SELECT isRunning FROM filter_settings");
    $row = $result->fetch_object();
    if (empty($row->isRunning)) {
        die("stopped by user");
    }
}

/**
 * Poll raw_stream_data and run every not-yet-filtered row through
 * do_once_per_tweet() for as long as get_filter_subfolder() equals $label.
 *
 * Each pass: check the isRunning flag, compare the newest raw id with the last
 * filtered id (sleeping when nothing is pending), advance the filter_id counter,
 * then load the next row and filter it.
 *
 * @param object       $rules         Include rules.
 * @param object|false $exclude_rules Exclude rules, or false for none.
 * @param string       $label         Sub-folder label; the loop ends when the
 *                                    current filter sub-folder differs from it.
 * @param mixed        $mysqli        Ignored: overwritten with a fresh crawler
 *                                    connection on the first line.
 */
function filter_selected_subfolder_new($rules, $exclude_rules, $label, $mysqli){
    // The caller-supplied value is discarded; a crawler connection is created instead.
    $mysqli = NULL;
    createCrawlerConnection($mysqli);
    
    // Second connection: used for the settings lookup and the isRunning checks.
    $mysqli_db = sqlCreateConnection("filter_selected_subfolder_db - get timeout");    
    $sleep_timeout = intval(getSystemSettingsProp($mysqli_db, 'crawler_timeout'));
    
    //beginning of iteration
    while (get_filter_subfolder() == $label) {
        
        // Dies inside checkIsRunning() when the filter was stopped.
        checkIsRunning($mysqli_db);
        //get the max id from the raw_stream_data
        $max_raw_id = get_max_stream_id($mysqli);
        $last_fetched = get_last_filter_id_new($mysqli);
    
        //nothing new to filter: the max raw id is not above the last filtered id
        if ($max_raw_id <= $last_fetched) {
            write_to_log("INFO", "max id of raw_stream_data is smaller than filter_id - stopping");
            echo "max id of raw_stream_data is smaller than filter_id - waiting $sleep_timeout\n";
            sleep_with_prompt($sleep_timeout, $mysqli_db);//must be over 10m b/c of get_filter_subfolder()
            continue;
        }
        
        // The counter is advanced before the row is read, so a row that fails to
        // load below is skipped rather than retried.
        $sql = "UPDATE filter_file_id SET filter_id = filter_id + 1";
        $filter_id_stats = sqlQuery($mysqli, $sql);
        if (!$filter_id_stats) {
            write_to_log("WARNING", "UPDATE filter_file_id - can't update. trying again.");
            continue;
        }
        
        // Mirror the DB-side increment locally to address the next row.
        $last_fetched++;
        
        $query = "SELECT data FROM raw_stream_data WHERE id='$last_fetched' ";
        $raw_stream_data_stats = sqlQuery($mysqli, $query);
        if (!$raw_stream_data_stats) {
            write_to_log("ERROR", "SELECT raw_stream_data - issue - with tweet '$last_fetched'");
            continue;
        }
        $json_element = $raw_stream_data_stats->fetch_object();
        // No row with that id: move on to the next id.
        if(!is_object($json_element)){
            continue;
        }
        
        //do not use utf8_encode here unless you see an issue - it breaks the regular flows.
        //$json = json_decode(utf8_encode($json_element->data), JSON_UNESCAPED_UNICODE);
        $json = json_decode($json_element->data, true);
        
        
        do_once_per_tweet($mysqli, $json, $rules, $exclude_rules, $label); //set connection if null
    }
    //End of iteration
    sqlClose($mysqli_db, "filter_selected_subfolder_db close");
    
    sqlClose($mysqli, 'cli_filter');
}

/**
 * Feeds rows of raw_stream_data (stream DB) one at a time into do_once_per_tweet()
 * for as long as get_filter_subfolder() keeps returning $label.
 *
 * The per-worker cursor lives in filter_file_id (crawler DB) and is advanced by one
 * BEFORE the row is read, so a row that fails to load is skipped rather than retried.
 *
 * @param mixed  $rules          include rules, passed through to do_once_per_tweet()
 * @param mixed  $exclude_rules  exclude rules (or false), passed through as well
 * @param string $label          subfolder label this run is bound to
 * @return string|null "stop" when the cursor reached a multiple of $stop_every during
 *                     this call, null when the label rolled over
 */
function filter_selected_subfolder_db($rules, $exclude_rules, $label) {
    global $ivWorkerName, $stop_every;
    $mysqli = NULL;
    $streamConn = NULL;
    createCrawlerConnection($mysqli);

    createStreamConnection($streamConn);
    $mysqli_db = sqlCreateConnection("filter_selected_subfolder_db - get timeout");
    $sleep_timeout = intval(getSystemSettingsProp($mysqli_db, 'crawler_timeout'));
    sqlClose($mysqli_db, "filter_selected_subfolder_db close");

    $result = null;
    // Becomes true once this call moved the cursor. Without it a cursor that already sits
    // on a multiple of $stop_every makes every subsequent call return "stop" immediately,
    // and the worker never progresses again.
    $advanced = false;

    //beginning of iteration
    while (get_filter_subfolder() == $label) {

        //get the max id from the raw_stream_data
        $max_raw_id = get_max_stream_id($streamConn);
        $last_fetched = get_last_filter_id($mysqli);

        //for large amount of fetched files - so the intuscan wont have large queue
        if ($advanced && $stop_every && ($last_fetched % $stop_every) == 0 && $last_fetched > 0) {
            $result = "stop";
            break; // fall through to the shared cleanup instead of leaking both connections
        }

        //nothing new in raw_stream_data yet - wait and poll again
        if ($max_raw_id <= $last_fetched) {
            write_to_log("INFO", "max id of raw_stream_data is smaller than filter_id - stopping");
            echo "max id of raw_stream_data is smaller than filter_id - waiting $sleep_timeout\n";
            sleep_with_prompt($sleep_timeout);//must be over 10m b/c of get_filter_subfolder()
            continue;
        }

        $sql = "UPDATE filter_file_id SET filter_id = filter_id + 1  WHERE filter_name='$ivWorkerName'";
        $filter_id_stats = sqlQuery($mysqli, $sql);
        if (!$filter_id_stats) {
            write_to_log("WARNING", "UPDATE filter_file_id - can't update. trying again.");
            continue;
        }
        $advanced = true;

        $last_fetched++;

        $query = "SELECT data FROM raw_stream_data WHERE id='$last_fetched' ";
        $raw_stream_data_stats = sqlQuery($streamConn, $query);
        if (!$raw_stream_data_stats) {
            write_to_log("ERROR", "SELECT raw_stream_data - issue - with tweet '$last_fetched'");
            continue;
        }
        $json_element = $raw_stream_data_stats->fetch_object();
        if (!is_object($json_element)) {
            continue; // id gap in raw_stream_data
        }

        //dont use the utf8_encode unless you see and issue - it defect the regular flows.
        $json = json_decode($json_element->data, true);

        do_once_per_tweet($mysqli, $json, $rules, $exclude_rules, $label); //set connection if null
    }
    //End of iteration
    sqlClose($mysqli, 'cli_filter');
    sqlClose($streamConn, 'cli_filter');
    return $result;
}

/**
 * Reads the current worker's cursor (filter_id) from filter_file_id.
 *
 * @param mixed $mysqli connection handed to sqlQuery()
 * @return int the stored filter_id, or 0 when the worker has no row or the value is NULL
 */
function get_last_filter_id($mysqli) : int {
    global $ivWorkerName;
    $result = sqlQuery(
        $mysqli,
        "SELECT filter_id as max_filter FROM filter_file_id WHERE filter_name=\"$ivWorkerName\""
    );
    $row = $result->fetch_object();
    if (!$row) {
        return 0;
    }
    return $row->max_filter ?? 0;
}

/**
 * Reads filter_id from the first row of filter_file_id, without filtering by worker name.
 *
 * @param mixed $mysqli connection handed to sqlQuery()
 * @return int the stored filter_id, or 0 when the table is empty or the value is NULL
 */
function get_last_filter_id_new($mysqli) : int {
    global $ivWorkerName;
    $result = sqlQuery($mysqli, "SELECT filter_id as max_filter FROM filter_file_id");
    $row = $result->fetch_object();
    if (!$row) {
        return 0;
    }
    return $row->max_filter ?? 0;
}

/**
 * Returns the highest id currently stored in raw_stream_data.
 *
 * @param mixed $mysqli connection handed to sqlQuery()
 * @return int max(id), or 0 when the aggregate is NULL (empty table)
 */
function get_max_stream_id($mysqli) {
    $result = sqlQuery($mysqli, "SELECT max(id) as max_raw FROM raw_stream_data");
    $row = $result->fetch_object();
    // A NULL aggregate casts to 0, so no separate null branch is needed.
    return (int) $row->max_raw;
}

/**
 * Loads the remaining quota persisted in $cli_QUOTA_FILE.
 *
 * The file is expected to hold "<day-of-year> <quota>". The stored quota is only
 * honoured when the day matches today (date("z")); in every other case - missing or
 * unreadable file, stale day, or a file without a quota field - the configured
 * default $cli_filter_and_parse_app_QUOTA is returned.
 *
 * @return mixed the stored quota (as read from the file, whitespace-trimmed) or the default
 */
function load_quota_from_file() {
    global $cli_QUOTA_FILE, $cli_filter_and_parse_app_QUOTA;

    $quota = file_exists($cli_QUOTA_FILE) ? file_get_contents($cli_QUOTA_FILE) : FALSE;
    if ($quota === FALSE) {
        return $cli_filter_and_parse_app_QUOTA;
    }

    // Trim first so a trailing newline does not end up inside the quota value.
    $aquota = preg_split('/\s+/', trim($quota));
    if (!is_array($aquota) || count($aquota) < 2) {
        // Truncated / malformed file: there is no quota field to read.
        return $cli_filter_and_parse_app_QUOTA;
    }

    return ($aquota[0] == date("z")) ? $aquota[1] : $cli_filter_and_parse_app_QUOTA;
}

/**
 * Start-up checks for the folder-based filter: validates inputs, makes sure the
 * "done" folder exists and loads the quota counter.
 *
 * Terminates the script (die) when the rules file or the source folder is missing.
 *
 * @param string      $jsonfile       path of the rules file
 * @param string|null $parent_folder  subfolder of $cli_JSON_BASE_FOLDER, or null to skip the check
 */
function filter_and_parse_app_inits($jsonfile, $parent_folder) {
    global $cli_filter_last_folder, $cli_JSON_BASE_FOLDER, $cli_JSON_DONE_FOLDER;
    global $filter_and_parse_counter;
    $cli_filter_last_folder = "";

    file_exists("$jsonfile") or die("File \"$jsonfile\" does not exists!");

    $sourceMissing = ($parent_folder !== null) && !is_dir("$cli_JSON_BASE_FOLDER/$parent_folder");
    if ($sourceMissing) {
        die("Source folder does not exists!");
    }

    is_dir($cli_JSON_DONE_FOLDER) or mkdir($cli_JSON_DONE_FOLDER, 0777, true);

    $filter_and_parse_counter = load_quota_from_file();
}

/**
 * Start-up checks for the DB-based filter: validates the rules file, loads the quota
 * counter and makes sure this worker has a cursor row in filter_file_id.
 *
 * Terminates the script (die) when the rules file is missing or the cursor table
 * cannot be read.
 *
 * @param string $jsonfile path of the rules file
 */
function filter_and_parse_app_inits_db($jsonfile) {
    global $cli_filter_last_folder, $cli_JSON_BASE_FOLDER, $ivWorkerName;
    global $filter_and_parse_counter;

    // Validate before opening a connection that would otherwise be abandoned by die().
    if (!file_exists("$jsonfile")) {
        die("File \"$jsonfile\" does not exists!");
    }

    $filter_and_parse_counter = load_quota_from_file();

    $mysqli = null;
    createCrawlerConnection($mysqli);

    $query = " SELECT EXISTS(SELECT * FROM filter_file_id WHERE filter_name='$ivWorkerName') AS count";
    $stats = sqlQuery($mysqli, $query);
    if (!$stats) {
        write_to_log("ERROR", "filter_and_parse_app_inits_db - can't read filter_file_id for '$ivWorkerName'");
        sqlClose($mysqli, 'cli_filter');
        die("Unable to read filter_file_id");
    }

    $obj = $stats->fetch_object();
    if ($obj->count == 0) {
        //initialize the filter tracker to start from zero
        $query = "INSERT INTO filter_file_id (filter_name, filter_id) VALUES('$ivWorkerName', 0)";
        sqlQuery($mysqli, $query);
    }

    // The connection is only needed for the check above; release it like the other helpers do.
    sqlClose($mysqli, 'cli_filter');
}

/**
 * Normalizes Arabic text used in tweet rules so that comparisons ignore decoration.
 * Case is left untouched on purpose: changing it would tamper with regex rules.
 *
 * - tatweel and the vowel/shadda/sukun marks are removed
 * - alif carrying hamza or madda is folded into the plain alif
 *
 * @param string|array $arules text (or array of texts) to normalize
 * @return string|array normalized copy
 */
function normalize_rule_string($arules) {
    // Characters that are dropped entirely.
    $dropped = ['ـ', 'ّ', 'َ', 'ً', 'ُ', 'ٌ', 'ِ', 'ٍ', 'ْ'];
    // Alif variants that collapse into the bare alif.
    $alifForms = ['أ', 'آ', 'إ'];

    // str_replace applies the pairs in order, so one call with parallel arrays is the
    // same as stripping first and folding afterwards.
    $search = array_merge($dropped, $alifForms);
    $replace = array_merge(
        array_fill(0, count($dropped), ""),
        array_fill(0, count($alifForms), 'ا')
    );
    return str_replace($search, $replace, $arules);
}

/**
 * Normalizes a rules object in place (it is passed by reference).
 * Nested objects are walked recursively; every other property value is run through
 * normalize_rule_string().
 *
 * @param object $rules rules tree to normalize
 */
function normalize_rules(&$rules) {
    // Snapshot the property names first so the loop is not affected by the writes below.
    $fields = array_keys(get_object_vars($rules));
    foreach ($fields as $name) {
        if (!is_object($rules->$name)) {
            $rules->$name = normalize_rule_string($rules->$name);
            continue;
        }
        normalize_rules($rules->$name);
    }
}


/**
 * Builds a normalized rules object from an already decoded settings structure.
 *
 * @param array  $data settings keyed by rule type
 * @param string $type key of the rule set to load (the commented-out caller uses
 *                     "include" / "exclude")
 * @return object rules object, normalized via normalize_rules()
 */
function load_rules_from_string($data, $type) {
    // A plain (object) cast is shallow: nested maps such as "user" would stay arrays and
    // $rules->user->location would fail. Round-tripping through JSON yields nested objects;
    // fall back to the shallow cast if the data cannot be encoded.
    $decoded = json_decode(json_encode($data[$type]));
    $rules = is_object($decoded) ? $decoded : (object) ($data[$type]);

    //inject user_locations if exists
    $user = $rules->user ?? null;
    if (is_object($user) && !empty($user->location)) {
        $location = $user->location;
        // "*" is kept as the wildcard; anything else is wrapped so it only matches whole words.
        $rules->user->location = ($location == "*") ? $location : "(^|\W)($location)($|\W)";
    }

    normalize_rules($rules);
    return $rules;
}


/**
 * Loads a JSON rules file and returns it as a normalized rules object.
 *
 * If a sidecar file exists next to it (same path with the last "json" replaced by
 * "user-location.txt"), its lines are joined into one alternation and injected into
 * $rules->user->location: appended to an existing pattern, otherwise set - with "*"
 * kept as-is and everything else wrapped in word boundaries.
 *
 * Terminates the script (die) when the JSON cannot be parsed.
 *
 * @param string $file path of the JSON rules file
 * @return object rules object, normalized via normalize_rules()
 */
function load_rules_from_file($file) {
    echo "Read from \"$file\"\n";
    $jsonfile = file_get_contents($file);
    $rules = json_decode($jsonfile);
    is_null($rules) and die("Unable to parse json!");

    // Only the LAST "json" (the extension) is swapped; replacing every occurrence would
    // also rewrite directory names that happen to contain "json".
    $loc_file = null;
    $extPos = strrpos($file, "json");
    if ($extPos !== false) {
        $loc_file = substr_replace($file, "user-location.txt", $extPos, strlen("json"));
    }

    //inject user_locations if exists
    $userlocs = [];
    if ($loc_file !== null && file_exists($loc_file)) {
        $lines = file($loc_file);
        foreach (($lines === false ? [] : $lines) as $line) {
            $line = str_replace(["\r", "\n"], "", $line); //get rid of \n\r
            // A blank line would become an empty alternative ("a||b"), which matches everything.
            if ($line !== "") {
                $userlocs[] = $line;
            }
        }
    }
    $userlocstr = implode("|", $userlocs);

    if (!empty($userlocstr)) {
        if (!property_exists($rules, "user")) {
            $rules->user = new stdClass();
        }
        if (property_exists($rules->user, "location")) {
            $rules->user->location = $rules->user->location . "|(^|\W)($userlocstr)($|\W)";
        } else {
            $rules->user->location = $userlocstr == "*" ? $userlocstr : "(^|\W)($userlocstr)($|\W)";
        }
    }

    normalize_rules($rules);
    return $rules;
}

//Should not be called directly! Pls. use run_filterandparse.bsh SINK_FILE
/**
 * Folder-based filter loop: filters one folder of stored tweets, uploads the results,
 * moves the folder under $cli_JSON_DONE_FOLDER and then picks the next folder via
 * select_current_folder(). The loop never ends on its own; the script dies if the
 * move fails.
 *
 * @param string      $jsonfile       include-rules file
 * @param string      $exclude_json   exclude-rules file
 * @param string|null $parent_folder  first folder to process, or null to let
 *                                    select_current_folder() choose
 */
function filter_and_parse_app($jsonfile, $exclude_json, $parent_folder) {
    global $cli_JSON_BASE_FOLDER, $cli_JSON_DONE_FOLDER;
    filter_and_parse_app_inits($jsonfile, $parent_folder);
    $rules = load_rules_from_file($jsonfile);
    $exclude_rules = load_rules_from_file($exclude_json);

    if ($parent_folder === null) {
        $parent_folder = select_current_folder();
    }

    for (;;) {
        if ($parent_folder === null) {
            // nothing to do right now - ask again
            $parent_folder = select_current_folder();
            continue;
        }

        filter_selected_subfolder($parent_folder, $rules, $exclude_rules, $parent_folder); //parseMetaDataLine
        upload_folder(0); //run_autosendfolder on the texts
        echo "move $parent_folder to done-json/$parent_folder\n";
        //mark folder as done
        if (!rename("$cli_JSON_BASE_FOLDER/$parent_folder", "$cli_JSON_DONE_FOLDER/$parent_folder")) {
            die("Move failed\n");
        }

        $parent_folder = select_current_folder(); //for next iteration
    }
    die(1); // not reachable: the loop above has no exit
}

/**
 * Builds the label of the current ten-minute bucket: the worker name followed by the
 * "yy-mm-dd-HH-i" timestamp with the last minute digit cut off.
 *
 * @return string the label; it changes every ten minutes
 */
function get_filter_subfolder() : string
{
    global $ivWorkerName;
    $stamp = date('y-m-d-H-i');
    // Keep everything but the final character (the units digit of the minutes).
    $bucket = substr($stamp, 0, strlen($stamp) - 1);
    return "{$ivWorkerName}{$bucket}";
}

/**
 * Resolves the per-worker definitions file
 * "<projectConfigFolder>/resources/<json_path>_<ivWorkerName>.json" (for filter or stream)
 * and makes sure it exists. Logs, prints and terminates the script when it does not.
 *
 * @param string $json_path base name of the definitions file, without worker suffix
 * @return string full path of the existing file
 */
function check_def_json($json_path){
    global $projectConfigFolder;
    global $ivWorkerName;
    $jsonfile = $projectConfigFolder . "/resources/" . $json_path . "_" . $ivWorkerName . ".json";

    if (file_exists($jsonfile)) {
        echo "Found the Json file for the defenitions - ".$jsonfile."\n";
        return $jsonfile;
    }

    $error = "File \"$jsonfile\" does not exists";
    write_to_log("ERROR", $error);
    echo "Didnt find the Json file - ".$jsonfile;
    die($error);
}

//Should not be called directly! Pls. use run_filterandparse.bsh SINK_FILE
/**
 * DB-based filter loop. Resolves the worker's rules file, runs the DB start-up checks
 * and then repeats forever: reload the include rules (and the exclude rules when their
 * file exists), filter the current ten-minute bucket, upload the produced texts.
 * The value returned by filter_selected_subfolder_db() is not used.
 */
function filter_and_parse_app_db() {
    global $ivWorkerName;
    //check if the json file exists (based on the ivWorkerName)
    $jsonfile = check_def_json("filter_and_parse-rules");

    $mysqli = null;
    createCrawlerConnection($mysqli);

    filter_and_parse_app_inits_db($jsonfile);
    //path of the exclude rules, derived from the include-rules path
    $exclude_path = str_replace("rules_$ivWorkerName.json", "exclude-rules_$ivWorkerName.json", $jsonfile);

    for (;;) {
        //rules are re-read on every pass
        $rules = load_rules_from_file($jsonfile);
        write_to_log("TRACE", "RULES ARE:" . json_encode($rules));

        $exclude_rules = false;
        if (file_exists($exclude_path)) {
            $exclude_rules = load_rules_from_file($exclude_path);
            write_to_log("TRACE", "EXCLUDE RULES ARE:" . json_encode($exclude_rules));
        }

        filter_selected_subfolder_db($rules, $exclude_rules, get_filter_subfolder()); //parseMetaDataLine

        upload_folder(0); //run_autosendfolder on the texts

        //TODO: consider deleting the filtered rows from $streamConn
    }
}


/**
 * UI-driven variant of the DB filter loop. Marks the filter as running in
 * filter_settings, makes sure the cursor row named '1' exists in filter_file_id and
 * then repeats forever: filter the current ten-minute bucket, upload the produced texts.
 * Loading real rules from the settings is currently disabled, so an include set with
 * min_rules = 0 and an empty exclude set are used.
 */
function filter_and_parse_app_db_ui() {
    $conn = sqlCreateConnection("filter_and_parse_app_db_new");
    sqlQuery($conn, "UPDATE filter_settings SET isRunning=1 WHERE prop='filter'");

    //placeholder rules (see getCrawlerSettingsProp / load_rules_from_string for the intended source)
    $include_rules = new stdClass();
    $include_rules->min_rules = 0;
    $exclude_rules = new stdClass();

    $mysqli = null;
    createCrawlerConnection($mysqli);

    $exists = sqlQuery(
        $mysqli,
        " SELECT EXISTS(SELECT * FROM filter_file_id WHERE filter_name='1') AS count"
    )->fetch_object()->count;
    if ($exists == 0) {
        //initialize the filter tracker to start from zero
        sqlQuery($mysqli, "INSERT INTO filter_file_id (filter_name, filter_id) VALUES('1', 0)");
    }

    for (;;) {
        filter_selected_subfolder_new($include_rules, $exclude_rules, get_filter_subfolder(), $mysqli); //parseMetaDataLine
        upload_folder(0); //run_autosendfolder on the texts
    }
}


/**
 * Runs the webHose import: configures the Webhose client with $accessToken, resolves
 * the per-state URL lists for the last $days days and converts every URL to posts.
 *
 * @param mixed $source_file passed through to fetch_states_urls()
 * @param int   $days        how many days back to start from (default 1)
 */
function webHose_Action($source_file, $days = 1){
    $webHoseDir = '..'.DIRECTORY_SEPARATOR.'webHose'.DIRECTORY_SEPARATOR;
    require_once $webHoseDir.'webhose.php';
    require_once $webHoseDir.'webHoseApp.php';

    global $accessToken;

    $since = strtotime(" -$days day");

    Webhose::config($accessToken);

    foreach (fetch_states_urls($source_file, $since) as $state => $stateUrls) {
        foreach ($stateUrls as $singleUrl) {
            convertUrlToPosts($singleUrl, $state);
        }
    }
}

/**
 * Picks the next unused dump file name "<projectLocalFolder>/dump-<project>/dump<N>.log".
 *
 * The last sequence number handed out is remembered between calls; once it is above
 * zero it overrides $seq. A number counts as taken when any file matching "dump<N>*"
 * exists.
 *
 * @param int $seq sequence number to start from on the first call
 * @return string path of the dump file to write
 */
function getDumpOutFilename($seq = 0) {
    global $projectLocalFolder, $ivProjectName;
    static $last = 0;

    if ($last > 0) {
        $seq = $last;
    }

    $prefix = "$projectLocalFolder/dump-$ivProjectName/dump";
    // Advance until no file starts with "dump<seq>".
    for (; count(glob("$prefix$seq*")); ++$seq) {
    }

    $last = $seq;
    return str_replace('*', '.log', "$prefix$seq*");
}

/**
 * Appends the raw JSON of one element (all rows with that element_id) to $outf,
 * one JSON document per line. Prints an error when the query fails or finds nothing.
 *
 * @param string $outf       file to append to
 * @param string $post_id    element_id to look up in raw_stream_data
 * @param mixed  $streamConn connection handed to sqlQuery()
 */
function dump_one_json($outf, $post_id, $streamConn) {
    $rows = sqlQuery($streamConn, "SELECT data FROM raw_stream_data WHERE element_id = '$post_id'");
    if (!$rows || !$rows->num_rows) {
        echo "Error reading from streamDB\n";
        return;
    }

    while ($row = $rows->fetch_object()) {
        file_put_contents("$outf", "$row->data\n", FILE_APPEND);
    }
}

/**
 * Writes the raw JSON of many rows of raw_stream_data to $outf, one document per line.
 *
 * Selection:
 *  - $post_id given (array of ids, or a comma-separated string as passed by
 *    dump_filtered_raw_tweets()): only rows whose element_id is in that list;
 *  - otherwise, numeric $limit: a page of 10000 rows starting at offset $limit;
 *  - otherwise: every row.
 *
 * @param string            $outf       file to (over)write
 * @param array|string|null $post_id    ids to dump, or null for paging / everything
 * @param mixed             $streamConn connection handed to sqlQuery()
 * @param int|false         $limit      row offset for paging, or false
 * @return int|null number of rows found, null when the query failed
 */
function dump_many_json($outf, $post_id, $streamConn, $limit) {
    $sql = "SELECT data FROM raw_stream_data ";

    if (is_array($post_id) || is_string($post_id)) {
        // The caller hands over an imploded string; testing is_array() alone silently
        // skipped the WHERE clause and dumped the whole table.
        $ids = is_array($post_id) ? $post_id : explode(",", $post_id);
        $quoted = [];
        foreach ($ids as $id) {
            $id = trim((string) $id);
            if ($id === "") {
                continue;
            }
            // element_id is compared as a string elsewhere in this file; quoting also keeps
            // 64-bit ids from being compared as floating point numbers.
            $escaped = ($streamConn instanceof mysqli) ? $streamConn->real_escape_string($id) : addslashes($id);
            $quoted[] = "'$escaped'";
        }
        if (!$quoted) {
            return 0; // an empty id list selects nothing
        }
        $sql .= "WHERE element_id IN (" . implode(",", $quoted) . ")";
    } elseif (is_numeric($limit)) {
        // Paging is only stable with a defined row order.
        $offset = max(0, (int) $limit);
        $sql .= "ORDER BY id LIMIT 10000 OFFSET $offset";
    }

    $res = sqlQuery($streamConn, $sql);
    if ($res && $res->num_rows) {
        $dump = "";
        while ($obj = $res->fetch_object()) {
            $dump .= "$obj->data\n";
        }
        file_put_contents("$outf", $dump);
    } elseif ($res === FALSE) {
        echo "Error reading from streamDB\n";
    }
    return $res ? ($res->num_rows) : null;
}

/**
 * Dumps one chunk (up to 10000) of the tweets that passed the filter: reads the next
 * Twitter post ids after $minid from parsed_post_id (crawler DB) and hands them, as a
 * comma-separated list, to dump_many_json() together with a fresh dump file name.
 *
 * @param string|int $minid      only post ids greater than this are considered
 * @param mixed      $mysqli     crawler DB connection
 * @param mixed      $streamConn stream DB connection
 * @return string|null the last post id of the chunk, or null when nothing was found
 *                     (or the query failed)
 */
function dump_filtered_raw_tweets($minid, $mysqli, $streamConn) {
    $sql = "SELECT post_id FROM parsed_post_id WHERE post_type='Twitter' AND post_id > '$minid' ORDER BY post_id limit 10000";
    $found = sqlQuery($mysqli, $sql);
    $summary = ($found !== FALSE) ? "$found->num_rows results\n" : "ERROR\n";
    echo "$sql\nQuery returned " . $summary;

    // The dump folder is prepared even when there turns out to be nothing to write.
    $outf = getDumpOutFilename();
    $outDir = dirname($outf);
    if (!is_dir($outDir)) {
        mkdir($outDir, 0777, true);
    }

    if ($found === FALSE) {
        echo "Error reading from crawlerDB\n";
        return null;
    }
    if (!$found || !($found->num_rows > 0)) {
        return null;
    }

    $last_id = null;
    $ids = array();
    while ($row = $found->fetch_object()) {
        $ids[] = $row->post_id;
        $last_id = $row->post_id;
    }
    dump_many_json($outf, implode(",", $ids), $streamConn, false);

    return $last_id;
}

/**
 * Dumps one page of raw_stream_data into a fresh dump file.
 *
 * @param int   $minid      row offset of the page (forwarded as dump_many_json()'s $limit)
 * @param mixed $streamConn stream DB connection
 * @return int|null whatever dump_many_json() returns: the number of rows found, or null on failure
 */
function dump_limited_raw_tweets($minid, $streamConn) {
    $target = getDumpOutFilename();
    $targetDir = dirname($target);
    if (!is_dir($targetDir)) {
        mkdir($targetDir, 0777, true);
    }
    return dump_many_json($target, null, $streamConn, $minid);
}

/**
 * Dumps every filtered tweet, chunk by chunk (dump_filtered_raw_tweets() reads up to
 * 10000 ids per chunk), until a chunk comes back empty.
 *
 * @param string|int $minid post id to start after (default 0)
 */
function dump_all_filtered_raw_tweets($minid = 0) {
    $mysqli = null;
    $streamConn = null;
    createCrawlerConnection($mysqli);
    createStreamConnection($streamConn);

    while (true) {
        // Each chunk reports its last post id, which becomes the next lower bound.
        $minid = dump_filtered_raw_tweets($minid, $mysqli, $streamConn);
        if ($minid === null) {
            break;
        }
    }

    sqlClose($streamConn, 'cli_filter');
    sqlClose($mysqli, 'cli_filter');
}

/**
 * Dumps all of raw_stream_data page by page: the offset is advanced by the number of
 * rows the previous page returned, and the loop stops on the first empty or failed page.
 *
 * @param int $minid row offset to start from (default 0)
 */
function dump_all_raw_tweets($minid = 0) {
    $streamConn = null;
    createStreamConnection($streamConn);

    $written = 0;
    while (true) {
        $minid += $written;
        $written = dump_limited_raw_tweets($minid, $streamConn);
        if (!(isset($written) && $written > 0)) {
            break;
        }
    }

    sqlClose($streamConn, 'cli_filter');
}


/**
 * getRetweets - walks the existing raw_stream_data table and, for every stored tweet
 * that embeds a retweeted or quoted status, stores that embedded tweet as a row of
 * its own through keep_json_db().
 */
function getRetweets(){
    $mysqli_crawler = null;
    createCrawlerConnection($mysqli_crawler);

    $res = sqlQuery($mysqli_crawler, "SELECT data from raw_stream_data");
    if (!$res) {
        write_to_log("ERROR", "getRetweets - unable to read raw_stream_data");
        sqlClose($mysqli_crawler, 'cli_filter');
        return;
    }

    while ($obj = $res->fetch_object()) {
        $tweet = json_decode($obj->data);
        if (!is_object($tweet)) {
            continue; // row does not hold a JSON object
        }
        // Both kinds of embedded tweet are stored the same way.
        foreach (['retweeted_status', 'quoted_status'] as $field) {
            if (empty($tweet->$field) || !is_object($tweet->$field)) {
                continue;
            }
            $embedded = $tweet->$field;
            keep_json_db($embedded, json_encode($embedded));
        }
    }

    sqlClose($mysqli_crawler, 'cli_filter');
}

/**
 * Exports Twitter threads (a parent tweet plus the tweets that reference it as their
 * parent - RTs and quotes) as one JSON file per parent, and tags the exported documents
 * in savedsearchid.
 *
 * Steps, as performed below:
 *  1. make sure $dir_path exists;
 *  2. look up (or create) the savedsearch rows named $label and 'no_tag' and delete
 *     their previous savedsearchid entries;
 *  3. select every parent whose number of distinct children (restricted to children
 *     listed in the `languages` table with value "arabic") exceeds $limit_children;
 *  4. for each parent: load its raw JSON and intuview data, do the same for each child,
 *     add parent and children to savedsearchid under the label's id and write
 *     "<dir_path>/<parent tweet id>.json";
 *  5. echo the number of parents written.
 *
 * @param mixed  $mysqli          connection used for savedsearch, savedsearchid, t_id,
 *                                t_parent_id and the intuview tables
 * @param string $username        owner stored on newly created savedsearch rows
 * @param int    $limit_children  a parent is exported only when it has more children than this
 * @param string $label           savedsearch name the exported documents are tagged with
 * @param string $dir_path        folder the JSON files are written to
 * @param mixed  $facebook        not used by the current implementation
 *
 * NOTE(review): $username has a default but is followed by required parameters, so the
 * default can never take effect - confirm whether callers rely on the current order.
 */
function fetchThreadDataTwitter($mysqli, $username = 'admin', $limit_children, $label, $dir_path, $facebook = null){
    global $projectLocalFolder; // not referenced below
    
    // The language filter is hard-coded; the commented alternatives hint at a facebook/twitter switch.
    $lang_table = "languages";//$facebook ? "languages" : "lang"; 
    $lang_value = "arabic";//$facebook ? "arabic" : "ar";
    // Separate crawler connection, used only to read raw_stream_data.
    // NOTE(review): it is not closed anywhere in this function.
    $crawlerMysql= null;
    createCrawlerConnection($crawlerMysql);
    $count = 0;
    $total_items = 0;
    /**
     * creates the output folder if it does not exist
     */
    write_to_log("TRACE", "Checking SSID");
    
    if (!is_dir($dir_path)) { 
         mkdir_full($dir_path, 0777, true); 
    }
  
    
    // Leftovers: neither variable is read below.
    $parents_id_arr = [];       
    $row = 1;

    /**
     * gets the ssid (savedsearch id) of this label, creating the row when it is missing
     */
    write_to_log("TRACE", "Checking SSID");
    
    $res_ssid = sqlQuery($mysqli,"SELECT id FROM savedsearch WHERE  name ='".$label."'");
    $obj_ssid = $res_ssid->fetch_object();
    
    // NOTE(review): when the label does not exist yet fetch_object() yields no row, so this
    // property read happens on a non-object before the empty() check - confirm.
    $ssid = $obj_ssid->id;
    
    if(empty($ssid)){
        //insert into savedsearch and read the generated id back
        $create_search = "INSERT INTO savedsearch  (name,username) VALUES ('$label','$username')  ON DUPLICATE KEY UPDATE id=id";
        
        sqlQuery($mysqli,$create_search);
        $res_ssid = sqlQuery($mysqli,"SELECT id FROM savedsearch WHERE  name ='".$label."'");
        $obj_ssid = $res_ssid->fetch_object();
        
        $ssid = $obj_ssid->id;
    }
    
    // Same lookup-or-create for the 'no_tag' saved search, which collects documents without a topic.
    $res_ssid = sqlQuery($mysqli,"SELECT id FROM savedsearch WHERE  name ='no_tag'");
    $obj_ssid = $res_ssid->fetch_object();
    
    $no_tag_ssid = $obj_ssid->id; 
     
    if(empty($no_tag_ssid)){
        //insert into savedsearch and read the generated id back
        $create_search = "INSERT INTO savedsearch  (name,username) VALUES ('no_tag','$username')  ON DUPLICATE KEY UPDATE id=id";
        
        sqlQuery($mysqli,$create_search);
        $res_ssid = sqlQuery($mysqli,"SELECT id FROM savedsearch WHERE  name ='no_tag'");
        $obj_ssid = $res_ssid->fetch_object();  
         
        $no_tag_ssid = $obj_ssid->id;
    }
    
    //clear the results of earlier runs for both saved searches
    write_to_log("TRACE", "Working with SSID $ssid - deleteing results from before");
    
    $res_ssid = sqlQuery($mysqli,"DELETE FROM savedsearchid WHERE id=$ssid");

    $res_no_tag_ssid = sqlQuery($mysqli,"DELETE FROM savedsearchid WHERE id=$no_tag_ssid");
    
    /**
     * fetches the hierarchy of parents and children from t_id / t_parent_id
     */

    //every tweet in t_id that is referenced as parent by more than $limit_children distinct
    //documents; only children present in the language table with the configured value are counted
    $sql = "SELECT 
                t_id.docid AS parent,
                t_id.value AS parent_str, 
                COUNT(DISTINCT t_parent_id.docid) AS c
            FROM 
                t_id,t_parent_id
            WHERE
                t_parent_id.value = t_id.value
            AND
                t_parent_id.docid IN (SELECT docid FROM $lang_table WHERE $lang_table.value=\"$lang_value\") 
       
            GROUP BY 
                parent  
            HAVING 
                c > $limit_children";
    // Disabled variant of the query above that additionally requires the parent itself to be
    // in the language table:
//     $sql = "SELECT
//         t_id.docid AS parent,
//         t_id.value AS parent_str,
//         COUNT(DISTINCT t_parent_id.docid) AS c
//         FROM
//         t_id,t_parent_id
//         WHERE
//         t_parent_id.value = t_id.value
//         AND
//         t_parent_id.docid IN (SELECT docid FROM $lang_table WHERE $lang_table.value=\"$lang_value\")
//         AND
//         t_id.docid IN (SELECT docid FROM $lang_table WHERE $lang_table.value=\"$lang_value\")
//         GROUP BY
//         parent
//         HAVING
//         c > $limit_children";
    
    $res = sqlQuery($mysqli, $sql);
    /**
     * iterates over the parents (first layer)
     */
    $no_tag = 0; // not read below
    
    if ($res && $res->num_rows){
        while($obj = $res->fetch_object()){
            
            $count++;
            // progress report every 50 parents
            if($count % 50 == 0){
                write_to_log("TRACE", "Passed $count of the parents");
                echo "\nPassed $count of the parents \n"; 
            }
            $total_topics = 0; // incremented below but never read
            
            //parent info: docid in the intuview tables and the tweet id string
            $parent = $obj->parent; 
            $parent_str =  $obj->parent_str;
            
        
            
            // Re-count the children, this time without the language restriction.
            // NOTE(review): $parent_str is interpolated unquoted here, while raw_stream_data
            // lookups below quote it - confirm the column type and whether large tweet ids
            // compare correctly as numbers.
            $sql_kids = "SELECT count(DISTINCT  docid) AS c from t_parent_id WHERE value = $parent_str";
            $res_kids = sqlQuery($mysqli, $sql_kids);
            $obj_kids = $res_kids->fetch_object();
            $kids_count = $obj_kids->c;
      
            // The HAVING clause above already requires more than $limit_children language-matching
            // children, so this looks like a safety net only.
            if($kids_count < $limit_children){
                write_to_log("TRACE", "parent has less than $limit_children children - skip");
                continue;
            }
            
            else{
                write_to_log("TRACE", "Started Working on parent $parent: found $obj_kids->c children");
                
            }
            
            //tag the parent with the label's saved search
            $sql = "INSERT INTO savedsearchid (id,docid) VALUES($ssid,$parent)   ON DUPLICATE KEY UPDATE id=id";
            sqlQuery($mysqli, $sql); 
            
          
            
            //fetch the parent's raw JSON from the crawler DB
            write_to_log("TRACE", "Fetching parent raw data and topic");
            $sql = " SELECT * from raw_stream_data where element_id='$parent_str'";
            $raw_parent_res = sqlQuery($crawlerMysql, $sql);
            $raw_parent_obj = $raw_parent_res->fetch_object();
            // NOTE(review): no check that a row was found before ->data is read - confirm that
            // every parent is guaranteed to exist in raw_stream_data.
            $parent_final['raw_data'] = json_decode(($raw_parent_obj->data), true);
            
            $parent_text = $parent_final['raw_data']['text'];
            if(empty($parent_text)){
                write_to_log("TRACE", "empty parent");    
            }
             
            
            //fetch the parent's intuview data (entities, sentiments..); no main topic is passed
            write_to_log("TRACE", "Fetching parent intuview-data");
            $intuviewInfo = fetchIntuviewData($mysqli, $parent, null, $no_tag_ssid);
            if(!empty($intuviewInfo["topic"][0])){
                $total_topics++;
            }
            $name = $parent_final['raw_data']['user']['name'] ?? $parent_final['raw_data']['user_name'] ?? null;
            $location = $parent_final['raw_data']['user']['location'] ?? null;
            $description = $parent_final['raw_data']['user']['description'] ?? null;
            
            // Collected for every parent but not used afterwards.
            $extra_data[] = array("id_str"=> $parent_str, "location"=>$location, "desc"=>$description, "name" => $name);
            
            
            $parent_final['intuview_info'] = $intuviewInfo;
            
            //children of this parent, keyed by the child's tweet id
            $children_res = [];
            $children_count = 0; 
            
            /**
             * iterates over the children (second layer)
             */
            $sql_kids = "SELECT docid from t_parent_id WHERE value = $parent_str";
            $res_kids = sqlQuery($mysqli, $sql_kids);
            $comments_with_text = 0; // never incremented, so it always contributes 0 below
            while ($row_kids = $res_kids->fetch_object()) {
                $child = $row_kids->docid;
                            
                $skip_ssid = false; // not read below
                write_to_log("TRACE", "Fetching child: $child. Count: $children_count out of $kids_count");
                $children_count++;
                
                // The parent's topics are passed as main topic, so the child's topic list is
                // extended with them inside fetchIntuviewData().
                $intuviewInfoChild = fetchIntuviewData($mysqli, $child, $intuviewInfo["topic"], $no_tag_ssid);
              
                if(!empty($intuviewInfoChild["topic"][0])){
                   $total_topics++;
                }
                //translate the child's docid into its tweet id
                $sql = "SELECT value from t_id where docid=$child";
                
                $child_res = sqlQuery($mysqli, $sql);
                $child_obj = $child_res->fetch_object();
                $child_str = $child_obj->value;
                $children_res[$child_str] = array();
                
                //fetch the child's raw JSON from the crawler DB
                $sql = " SELECT data from raw_stream_data where element_id='$child_str'";
                $raw_child_res = sqlQuery($crawlerMysql, $sql);
                $raw_child_obj = $raw_child_res->fetch_object();
                $raw_child = json_decode($raw_child_obj->data,true); 
                
                $child_text = $raw_child['text'];
                if(empty($child_text)){
                    write_to_log("TRACE", "Empty Text for child");
                }
                /** 
                 * The original comment describes a special case - an RT without extra text should
                 * not be inserted into the SSID because it needs no tag - but no such check is
                 * implemented: every child is inserted below.
                 */   
      
            $children_res[$child_str]['raw_data'] = $raw_child;
            
            
            $children_res[$child_str]["intuview_info"] = $intuviewInfoChild;
            
            /**
             * tag the child with the label's saved search (unconditionally, see above)
             */
            sqlQuery($mysqli, "INSERT INTO savedsearchid (id,docid) VALUES($ssid,$child)  ON DUPLICATE KEY UPDATE id=id");

            }
            // Children sharing a tweet id collapse into one entry, so this can be lower than $children_count.
            $count_kids = count($children_res);
            
          
            //attach the children to the parent document
            $parent_final['intuview_info']["children_count"] = $count_kids;
            
            $parent_final["children"] = $children_res;
            
            // Effectively counts one item per parent (see $comments_with_text above).
            $total_items += $comments_with_text+1;
            //write the full JSON to the FS
            // NOTE(review): the second argument of json_encode() is a flags bitmask, so `true`
            // is presumably coerced to 1 rather than meaning "associative" - confirm intent.
            // NOTE(review): the fopen() result is not checked before fwrite()/fclose().
            $fp = fopen($dir_path. DIRECTORY_SEPARATOR. $parent_str.".json","wb");
            fwrite($fp,json_encode($parent_final,true));
            fclose($fp);
    
        }       
    }
    echo $total_items;
    
}




/**
 * fetchIntuviewData - collects the analysis results stored for one document: tags,
 * topics, the entity lists and the emotion row.
 *
 * Topic handling: when $main_topic is given (a child of a thread) the parent's topics
 * are merged into the document's own. A document that has no topic of its own is
 * registered under the 'no_tag' saved search.
 *
 * List values come from GROUP_CONCAT and are split on ",", so "nothing found" is
 * represented as [""] and a name that itself contains a comma is split as well.
 * NOTE(review): GROUP_CONCAT output may be truncated by the server's
 * group_concat_max_len setting - confirm for documents with many entities.
 *
 * @param mixed      $mysqli       connection handed to sqlQuery()
 * @param int|string $docid        document id; an empty value short-circuits to null
 * @param array|null $main_topic   topics inherited from the thread's parent, or null
 * @param int|string $no_tag_ssid  saved-search id that collects topic-less documents
 * @return array|null map with the keys "tag", "topic", one key per entity table,
 *                    "pro", "anti" and "emotion"; null when $docid is empty
 */
function fetchIntuviewData($mysqli, $docid, $main_topic = null, $no_tag_ssid){
    if(empty($docid)){
        return null;
    }
    $intuview_info = [];
    $entities = array("person_object", "place_object", "organizational_identity", "abstract_idea_entity", "event");
    $no_tag_sql = "INSERT INTO savedsearchid (id,docid) VALUES($no_tag_ssid,$docid)   ON DUPLICATE KEY UPDATE id=id";

    //tags attached to the document, without the two bookkeeping tags
    $tag_sql = "SELECT GROUP_CONCAT(DISTINCT NAME) AS tag FROM filetags WHERE docid=$docid AND NAME != 'Reviewed' AND NAME != 'upload_text'";
    $res = sqlQuery($mysqli, $tag_sql);
    $objTagArr = null;
    if(!empty($res)){
        $objTag = $res->fetch_object();
        // The column alias is "tag"; reading ->topic here always produced an empty list.
        $objTagArr = explode(',', (string) ($objTag->tag ?? ''));
    }
    $intuview_info["tag"] = $objTagArr;

    //topics assigned to the document
    $sql = "SELECT GROUP_CONCAT(DISTINCT topic.value) AS topic from topic WHERE docid=$docid";
    $res = sqlQuery($mysqli, $sql);
    $objTopicArr = null;
    if(!empty($res)){
        $objTopic = $res->fetch_object();
        // GROUP_CONCAT yields NULL when there are no rows; cast so explode() gets a string.
        $objTopicArr = explode(',', (string) ($objTopic->topic ?? ''));
    }

    if(!empty($main_topic)){
        if(empty($objTopicArr) || empty($objTopicArr[0])){
            // No topic of its own: register under 'no_tag' before inheriting the parent's topics.
            $objTopicArr = [];
            sqlQuery($mysqli, $no_tag_sql);
        }
        foreach($main_topic as $main_t){
            if(!in_array($main_t, $objTopicArr)){
                $objTopicArr[] = $main_t;
            }
        }
    }
    //still without a topic - register under 'no_tag'
    if(empty($objTopicArr) || empty($objTopicArr[0])){
        sqlQuery($mysqli, $no_tag_sql);
    }
    $intuview_info["topic"] = $objTopicArr;

    //one list of names per entity table
    foreach($entities as $entity){
        $obj_arr = [];
        $sql = "SELECT GROUP_CONCAT(DISTINCT $entity.name) AS entity_info from $entity WHERE docid=$docid";

        $res = sqlQuery($mysqli, $sql);
        if(!empty($res)){
            $obj = $res->fetch_object();
            $obj_arr = explode(',', (string) ($obj->entity_info ?? ''));
        }

        $intuview_info["$entity"] = $obj_arr;
    }

    //sentiment / emotion row; empty strings when there is none or the query failed
    $sql = "SELECT pro, anti, e_concept AS emotion FROM emotion WHERE docid=$docid";
    $res = sqlQuery($mysqli, $sql);
    $obj = !empty($res) ? $res->fetch_object() : null;
    if(!empty($obj)){
        $intuview_info["pro"] = $obj->pro;
        $intuview_info["anti"] = $obj->anti;
        $intuview_info["emotion"] = $obj->emotion;
    }
    else{
        $intuview_info["pro"] = "";
        $intuview_info["anti"] = "";
        $intuview_info["emotion"] = "";
    }

    return $intuview_info;
}

