<?php
/*
//sample values in config.php:
$mainIP = "192.168.10.154";
$sharedFolder = '//ivqa2/idf-river3';
//defaults for userCake		
$db_user="root";
$db_pass="intuview";
$db_name = "cnm_idf_river3";
$db_host="192.168.10.4";
$db_port="3307";
$isMemSQL = true;
$SOLR_HOME             = "D:/solr-5.4.1/bin";
$shouldDumpDbg = true;
$skipDuplicate = true; //important for the test!!!!
*/




// Machine-local projects folder. It must not be shared between the machines of a cluster.
if (empty($localProjectsFolder)) {
    $localProjectsFolder = "$CURRENT_SOURCE_FOLDER/ivLocalProjects";
}

// Project-specific configs live under the local projects folder unless told otherwise.
// Pointing $projectsConfigFolder somewhere else lets cluster machines share the same configs.
if (empty($projectsConfigFolder)) {
    $projectsConfigFolder = $localProjectsFolder;
}

// When no project name was given, take it from the root folder name.
if (empty($ivProjectName)) {
    global $CURRENT_SOURCE_FOLDER;
    // Parent of the source folder, i.e. the folder above AllInOneWeb, which is the UI folder name.
    $ivProjectName = basename(dirname($CURRENT_SOURCE_FOLDER));
}

// Local folder of this project.
if (empty($projectLocalFolder)) {
    $projectLocalFolder = "$localProjectsFolder/$ivProjectName";
}

// Config folder of this project.
if (empty($projectConfigFolder)) {
    $projectConfigFolder = "$projectsConfigFolder/$ivProjectName";
}

// Defaults that could be set here before the local config gets a chance to overwrite them:
//$useSolr = true;
//$isFolderService = true;

// Pull in the project-specific config file, when there is one.
if (file_exists("$projectConfigFolder/config.php")) {
    require_once "$projectConfigFolder/config.php";
}

// Address of the main machine. Defaults to the address this host name resolves to.
// Per the original note, gethostbyname(gethostname()) does not work well in WSL
// (Windows Subsystem for Linux); set $mainIP explicitly there (e.g. "127.0.0.1").
if (empty($mainIP)) {
    $mainIP = gethostbyname(gethostname());
}

// Worker name, for cluster environments where multiple crawlers work.
// Resolution order: a preset $ivWorkerName, the IV_WORKER_NAME environment variable, the host name.
if (empty($ivWorkerName)) {
    $ivWorkerName = getenv("IV_WORKER_NAME"); // false when the variable is not set
    if (empty($ivWorkerName)) {
        $ivWorkerName = gethostname();
    }
}

// Worker-specific settings, applied only when $ivWorkersSettings has an entry for this worker.
if (!empty($ivWorkerName) && isset($ivWorkersSettings) && array_key_exists($ivWorkerName, $ivWorkersSettings)) {
    // $logFNsuffix is not initialised anywhere in this file, so start from an empty string
    // when the includer did not define it (a bare ".=" raises an "Undefined variable" warning).
    $logFNsuffix = ($logFNsuffix ?? '') . $ivWorkerName;

    // Twitter credentials are exposed as constants; the "tweeter" spelling is part of the constant names.
    if (array_key_exists("twitterCredentials", $ivWorkersSettings[$ivWorkerName])) {
        define("tweeterCK", $ivWorkersSettings[$ivWorkerName]["twitterCredentials"]["tweeterCK"]);
        define("tweeterCS", $ivWorkersSettings[$ivWorkerName]["twitterCredentials"]["tweeterCS"]);
        define("tweeterAT", $ivWorkersSettings[$ivWorkerName]["twitterCredentials"]["tweeterAT"]);
        define("tweeterAS", $ivWorkersSettings[$ivWorkerName]["twitterCredentials"]["tweeterAS"]);
    }
}

// A cluster machine is expected to have an "ivProjects" shared folder holding all projects.
if (!isset($ivProjectsDir)) {
    $ivProjectsDir = "ivProjects/";
}

// Shared folder of this project on the main machine.
if (empty($sharedFolder)) {
    $sharedFolder = "//$mainIP/$ivProjectsDir$ivProjectName";
}

// Main DB name: derived from the project name when unset or equal to 'cnm'.
if (empty($db_name) || $db_name == 'cnm') {
    $db_name = str_replace("-", "_", $ivProjectName) . "_db";
}

// The DB host and the Solr host both fall back to the main machine.
if (empty($db_host)) {
    $db_host = $mainIP;
}

if (empty($SOLR_SERVER_HOSTNAME)) {
    $SOLR_SERVER_HOSTNAME = $mainIP;
}

// The text DB is named after the main DB and falls back to the main DB's host.
if (empty($textDataBase)) {
    $textDataBase = $db_name . "_text";
}

if (empty($text_db_host)) {
    $text_db_host = $db_host;
}

// Locations inside the shared folder.
if (empty($uploadedFilesPath)) {
    $uploadedFilesPath = "$sharedFolder/docs/text";
}

// Unlike the paths above, this one is always assigned, even when a value was already set.
$UPLOADS_PATH = "$sharedFolder/uploads";
//$extraFieldsPath = "$sharedFolder/docs/extradata";

if (empty($metadata_path)) {
    $metadata_path = $CURRENT_SOURCE_FOLDER . "/metadata";
}

$redactedDocsPath = "$sharedFolder/redacted_docs";
//$UI_FOLDER = "$sharedFolder/ui_components";

$limitResult = 10000000;

$separateTextQuery = true;

// Map from a DB name to its connection settings: host / user / pwd / db / port.
// Entries already supplied (e.g. by the project config) are never overwritten here.
if (!isset($connectionOverrides)) {
    $connectionOverrides = [];
}

// Main DB.
if (!array_key_exists($db_name, $connectionOverrides)) {
    $connectionOverrides[$db_name] = [
        "host"     => $db_host,
        "user"     => $db_user,
        "pwd"      => $db_pass,
        "db"       => $db_name,
        "port"     => $db_port,
        "isMemSQL" => $isMemSQL,
    ];
}

// Text DB: same credentials as the main DB, its own host, a null port and no "isMemSQL" key.
if (!array_key_exists($textDataBase, $connectionOverrides)) {
    $connectionOverrides[$textDataBase] = [
        "host" => $text_db_host,
        "user" => $db_user,
        "pwd"  => $db_pass,
        "db"   => $textDataBase,
        "port" => null,
    ];
}

/*
Disabled example of a hand-written override:
if (!array_key_exists("EnronRelativitySample", $connectionOverrides))
	$connectionOverrides["EnronRelativitySample"] = [
				"host" => "TOPLEX\SQLEXPRESS",
				"user" => "root",
				"pwd" => "intuview",
				"db" => "EDDS1015258",
				"port" => NULL
		];
*/

// Crawler DB: by default a copy of the main DB settings, with "_crawler" appended to the
// DB name and MemSQL switched off.
if (!array_key_exists("crawlerDB", $connectionOverrides)) {
    $connectionOverrides["crawlerDB"] = array_replace(
        $connectionOverrides[$db_name],
        [
            "db"       => $connectionOverrides[$db_name]["db"] . "_crawler",
            "isMemSQL" => false,
        ]
    );
}

// Stream DB: by default a copy of the main DB settings, with "_stream" appended to the
// DB name and MemSQL switched off.
if (!array_key_exists("streamDB", $connectionOverrides)) {
    $connectionOverrides["streamDB"] = array_replace(
        $connectionOverrides[$db_name],
        [
            "db"       => $connectionOverrides[$db_name]["db"] . "_stream",
            "isMemSQL" => false,
        ]
    );
}

//for crawling twitter users
//if (!isset($userAccountsTables))
//	$userAccountsTables = ["egyptian_twitter"];


//$xml_results_path = $metadata_path . '/tmp_results'; 

// First part of the extra columns; the second part, $tracxExtradata, is merged in further down.
// Each entry maps "table-name" => "Label in UI".
$extraColumns = [
    "lang" => "SM Language",
    "user_screen_name" => "user screen name",
    "user_id_str" => "twitter_user_id",
    "twitter_place" => "place full name",
    "user_location" => "user location",
    "user_lang" => "user language",
    "user_url" => "user homepage",
    "place_full_name" => "Twitter Place",
    "place_country" => "Twitter Country",
    "tv_parent_author" => "Parent Author",
    "user_time_zone" => "User timezone",
    "place_country_code" => "Twitter country code",
    "grandparent_id" => "grandparent_id",

    // counters, retweet details, hashtags
    "quote_count" => "Quote Count",
    "reply_count" => "Reply Count",
    "retweet_count" => "Retweet Count",
    "favorite_count" => "Favorite Count",
    "retweeted_id" => "Retweeted id",
    "retweeted_quote_count" => "Retweeted Quote Count",
    "retweeted_reply_count" => "Retweeted Reply Count",
    "retweeted_retweet_count" => "Retweeted Retweet Count",
    "retweeted_favorite_count" => "Retweeted Favorite Count",
    "retweeted_user_screen_name" => "Retweeted Users",
    "t_hashtag" => "Hashtag",

    // user profile details
    "user_description" => "User Description",
    "user_translator_type" => "User Translator Type",
    "user_followers_count" => "Followers Count",
    "user_friends_count" => "User Friends Count",
    "user_listed_count" => "User Listed Count",
    "user_favourites_count" => "User Favourites Count",
    "user_statuses_count" => "User Statuses Count",
    "user_created_at" => "User Created At",
    "user_utc_offset" => "User Utc Offset",
    "user_geo_enabled" => "User Geo Enabled",

    "startdatetime" => "date_field",
    "finchpageid" => "t_id",
    "additionalcontext" => "additionalcontext",

    "tweet_likes" => "tweet_likes",
    "tweet_shares" => "tweet_shares",
    "tweet_user_url" => "tweet_user_url",
    "network" => "network",
    "photos" => "photos",

    "component" => "component",
    "reporter" => "file_author",
    "priority" => "Priority",
    "notice_number" => "notice_number",
    "order_number" => "order_number",
    "main_work_number" => "main_work_number",
    "functional_location" => "functional_location",

    // webhose
    "orig_key" => "place_country",

    // FB
    "t_username" => "user_name",
    "date_field" => "date",
    "fb_page" => "facebook_page",
    "fb_sad" => "sad",
    "fb_wow" => "wow",
    "fb_happy" => "haha",
    "fb_like" => "like",
    "fb_love" => "love",
    "fb_angry" => "angry",
    "fb_no_comment" => "no_comment",
    "fb_shared" => "number_of_shared",

    // linkedin (the whole section is disabled)
    // //     "public_identifier" => "",
    // //     "l_profile_pic_url" => "",
    // //     "l_background_cover_image_url" => "",
    //     "l_first_name" => "l_first_name",
    //     "l_last_name" => "l_last_name",
    //     "l_full_name" => "user_screen_name",
    //     "l_occupation" => "l_occupation",
    //     //     "l_headline" => "",
    //     "l_summary" => "user_description",
    //     "l_country" => "t_country",
    //     //     "l_country_full_name" => "",
    //     "l_city" => "l_city",
    //     "l_state" => "l_state",
    //     "l_experiences" => "l_experiences",
    //     "l_experiences_companies" => "l_experiences_companies",
    //     "l_experiences_titles" => "l_experiences_titles",
    //
    //     "l_education" => "l_education",
    //     "l_education_degree_name" => "l_education_degree_name",
    //     "l_education_school" => "l_education_school",
    //     "l_education_field_of_study" => "l_education_field_of_study",
    //
    //     "l_languages" => "l_languages",
    //     "l_accomplishment_organisations" => "l_accomplishment_organisations",
    //     "l_accomplishment_publications" => "l_accomplishment_publications",
    //     "l_accomplishment_honors_awards" => "l_accomplishment_honors_awards",
    //     "l_accomplishment_patents" => "l_accomplishment_patents",
    //     "l_accomplishment_courses" => "l_accomplishment_courses",
    //     "l_accomplishment_projects" => "l_accomplishment_projects",
    //     "l_accomplishment_test_scores" => "l_accomplishment_test_scores",
    //     "l_volunteer_work" => "l_volunteer_work",
    //     "l_certifications" => "l_certifications",
    //     "l_connections" => "",
    //     //     "l_people_also_viewed" => "",
    //     "l_recommendations" => "l_recommendations",
    //     "l_activities" => "l_activities",
    //     //     "l_similarly_named_profiles" => "",
    //     "l_articles" => "l_articles",
    //     "l_groups" => "l_groups",
    //     "l_skills" => "l_skills",
    //     "l_inferred_salary" => "l_inferred_salary",
    //     "l_gender" => "t_gender",
    //     "l_birth_date" => "l_birth_date",
    //     "l_industry" => "l_industry",
    //     "l_interests" => "l_interests",
    //     "l_extra" => "l_extra",
    //     "l_personal_emails" => "l_personal_emails",
    //     "l_personal_numbers" => "l_personal_numbers",

    // air
    "sug_sotz" => "sug_sotz",
    "mashov_miluli" => "mashov_miluli",
    "rama_irgunit" => "rama_irgunit",
    "teur_sheela" => "teur_sheela",
    "sheela" => "sheela",
    "sheela_num" => "sheela_num",
    "army_rank" => "army_rank",

    // matr
    "post_url" => "post_url",
    "publisher_id" => "publisher_id",
    "reactions_count" => "reactions_count",
    "reactors" => "reactors",
    "relation_id" => "relation_id",
    "relation_type" => "relation_type",
    "reshares" => "reshares",
    "share_count" => "share_count",
    "social_network" => "social_network",
    "website_attachments" => "website_attachments",
    "harvesting_task_id" => "harvesting_task_id",
    "post_id" => "t_id",
    "post_creation_time" => "created_at",
    "harvesting_time" => "harvesting_time",
    "harvesting_type" => "harvesting_type",
    "post_attachments" => "post_attachments",
    "commentators" => "commentators",
    "comments_amount" => "comments_amount",
];


// Metadata keys paired with a destination table ('dest_table') and a type ('type').
// Only "user_location" is active; the remaining entries are kept disabled for reference.
$meta_to_ont_keys = [
    "user_location" => [
        'dest_table' => "user_location_ont",
        'type'       => 'Place-object',
    ],
//     "l_experiences_titles" =>  ['dest_table'=>"title", 'type' => 'Organizational-title'],
//     "l_experiences_companies" => ['dest_table'=>"organizational_identity", 'type' => 'Organizational-identity'],
//     "l_education_school" => ['dest_table'=>"organizational_identity", 'type' => 'Organizational-identity'],
//     "l_experiences" => ['dest_table'=>"professional_person_type", 'type' => 'Professional-person-type'],
//     "l_experiences_titles" => ['dest_table'=>"professional_person_type", 'type' => 'Professional-person-type'],
//     "l_experiences_companies" => ['dest_table'=>"organizational_identity", 'type' => 'Organizational-identity'],
//     "l_industry" => ['dest_table'=>"general_commercial_sectors", 'type' => 'General-commercial-sectors'],
//     "l_personal_emails" => ['dest_table'=>"email", 'type' => 'Email'],
//     "l_languages" => ['dest_table'=>"language_object", 'type' => 'Language-object'],
//     "l_skills" => ['dest_table'=>"human_resource_professional_proficiency", 'type' => 'Human-resource-professional-proficiency'],
//     "l_city" => ['dest_table'=>"city", 'type' => 'City'],
//     "t_country" => ['dest_table'=>"national_identity", 'type' => 'National-identity'],
//     "l_education_degree_name" => ['dest_table'=>"diploma_cerificate_degree", 'type' => 'Diploma-cerificate-degree']
];

// For the crawler parser: one extra column per filter table, indicating which rules passed the filter.
// The three fixed tables are followed by $userAccountsTables when that variable is set.
foreach (array_merge(["location", "time_zone", "country_code"], $userAccountsTables ?? []) as $filterTable) {
    $extraColumns["filter_passed_rule_" . $filterTable] = "Filter by " . $filterTable;
}

// Maps "field-name-in-raw-json-received-from-crawler" => "table-name".
//
// The linkedin, air and matr sections used "<=" (a comparison) where "=>" was meant, so each
// of those lines added a boolean under an integer key instead of a mapping. They are real
// key => table pairs now.
// NOTE(review): each of those lines is read as raw key (left) => table (right), matching
// entries such as "finchpageid" => "t_id" - confirm the direction for the few pairs whose
// two sides differ (l_full_name, l_summary, l_country, l_gender, post_id, post_creation_time).
// NOTE(review): the linkedin section of $extraColumns is commented out in this file - confirm
// that the l_* tables targeted below exist.
$extraDataKey2Table = array(
    "twitter_lang" => "lang",
    "user_displayName" => "file_author",
    // NOTE(review): "user_name" appears again in the FB section below; in a PHP array literal
    // the later value wins, so the effective mapping is "user_name" => "t_username".
    "user_name" => "file_author",
    "created_at" => "date_field", //reuse emails' date_field from twitter created_at - for overwriting file's date
    "created_time" => "date_field", //for facebook
    "user_screen_name" => "user_screen_name",
    "twitter_place" => "place_full_name",
    "user_location" => "user_location",
    "user_lang" => "user_lang",
    "hashtag" => "t_hashtag",
    "id_str" => "t_id",
    "in_reply_to_status_id_str" => "t_parent_id",
    "quoted_status_id_str" => "t_parent_id",
    "FullFolderPath" => "t_type",
    "#type" => "t_type",
    "user_description" => "user_description",
    "date_field" => "date_field",
    "article_date" => "date_field",
    "article_id" => "t_id",
    "author" => "file_author",
    "suggested_keywords" => "t_hashtag",
    "title" => "user_description",
    "url" => "user_url",
    "timestamp" => "date_field",
    "Timestamp" => "date_field",
    "Body" => "text",

    "tweet_likes" => "tweet_likes",
    "tweet_shares" => "tweet_shares",
    "tweet_user_url" => "tweet_user_url",
    "network" => "network",
    "photos" => "photos",

    //USA
    "startdatetime" => "date_field",
    "finchpageid" => "t_id",
    "additionalcontext" => "additionalcontext",

    //CIL
    "dovra" => "dovra",
    "component" => "Component",
    "reporter" => "file_author",
    "priority" => "priority",
    "notice_number" => "notice_number",
    "order_number" => "order_number",
    "main_work_number" => "main_work_number",
    "functional_location" => "functional_location",

    "Matnr" => "t_id",
    "Manufacture" => "manufacture",
    "MPN" => "mpn",

    //webhose
    "place_country" => "orig_key",

    //linkedin
//     "public_identifier" => "",
//     "l_profile_pic_url" => "",
//     "l_background_cover_image_url" => "",
    "l_first_name" => "l_first_name",
    "l_last_name" => "l_last_name",
    "l_full_name" => "user_screen_name",
    "l_occupation" => "l_occupation",
//     "l_headline" => "",
    "l_summary" => "user_description",
    "l_country" => "t_country",
//     "l_country_full_name" => "",
    "l_city" => "l_city",
    "l_state" => "l_state",
    "l_experiences" => "l_experiences",
    "l_experiences_companies" => "l_experiences_companies",
    "l_experiences_titles" => "l_experiences_titles",
    "l_education" => "l_education",
    "l_education_degree_name" => "l_education_degree_name",
    "l_education_school" => "l_education_school",
    "l_education_field_of_study" => "l_education_field_of_study",

    "l_languages" => "l_languages",
    "l_accomplishment_organisations" => "l_accomplishment_organisations",
    "l_accomplishment_publications" => "l_accomplishment_publications",
    "l_accomplishment_honors_awards" => "l_accomplishment_honors_awards",
    "l_accomplishment_patents" => "l_accomplishment_patents",
    "l_accomplishment_courses" => "l_accomplishment_courses",
    "l_accomplishment_projects" => "l_accomplishment_projects",
    "l_accomplishment_test_scores" => "l_accomplishment_test_scores",
    "l_volunteer_work" => "l_volunteer_work",
    "l_certifications" => "l_certifications",
    // Left unmapped, like the other linkedin fields whose target table is empty
    // (it never produced a mapping while it was written with "<=").
//     "l_connections" => "",
//     "l_people_also_viewed" => "",
    "l_recommendations" => "l_recommendations",
    "l_activities" => "l_activities",
//     "l_similarly_named_profiles" => "",
    "l_articles" => "l_articles",
    "l_groups" => "l_groups",
    "l_skills" => "l_skills",
    "l_inferred_salary" => "l_inferred_salary",
    "l_gender" => "t_gender",
    "l_birth_date" => "l_birth_date",
    "l_industry" => "l_industry",
    "l_interests" => "l_interests",
    "l_extra" => "l_extra",
    "l_personal_emails" => "l_personal_emails",
    "l_personal_numbers" => "l_personal_numbers",

    //FB
    "user_name" => "t_username", // overrides the "user_name" => "file_author" entry above
    "date" => "date_field",
    "facebook_page" => "fb_page",
    "sad" => "fb_sad",
    "wow" => "fb_wow",
    "haha" => "fb_happy",
    "like" => "fb_like",
    "love" => "fb_love",
    "angry" => "fb_angry",
    "no_comment" => "fb_no_comment",
    "number_of_shared" => "fb_shared",

    //air
    "sug_sotz" => "sug_sotz",
    "mashov_miluli" => "mashov_miluli",
    "rama_irgunit" => "rama_irgunit",
    "teur_sheela" => "teur_sheela",
    "sheela" => "sheela",
    "sheela_num" => "sheela_num",
    "army_rank" => "army_rank",

    //matr
    "post_url" => "post_url",
    "publisher_id" => "publisher_id",
    "reactions_count" => "reactions_count",
    "reactors" => "reactors",
    "relation_id" => "relation_id",
    "relation_type" => "relation_type",
    "reshares" => "reshares",
    "share_count" => "share_count",
    "social_network" => "social_network",
    "website_attachments" => "website_attachments",
    "harvesting_task_id" => "harvesting_task_id",
    "post_id" => "t_id",
    "post_creation_time" => "created_at",
    "harvesting_time" => "harvesting_time",
    "harvesting_type" => "harvesting_type",
    "post_attachments" => "post_attachments",
    "commentators" => "commentators",
    "comments_amount" => "comments_amount"
);

// Names listed as timestamp fields.
$timestamp_fields = [
    "b_date_of_request",
    "b_date_of_update",
    "b_date_of_closing",
];

// Names listed as Solr fields to skip; "_version_" is listed both bare and wrapped in double quotes.
$solr_skip_fields = [
    "_version_",
    '"_version_"',
    "username",
    "id",
];

// Insertion callbacks for extra fields. The list is currently empty; the disabled example
// below suggests entries are keyed by table name (presumably - confirm against the caller).
$extraFiledsInsertionFunctions = [
		/*
		"t_parent_id" => function($mysqli, $tableName, $dcid, $value){
			//insert a link between docId of the post, and the docId of its interactions (meaning, documents whose parentID is equal to the ID of the post)
			sqlQuery($mysqli, "INSERT INTO t_post_interactions (docId,value)
			SELECT docId, \"$dcid\" FROM t_id
			WHERE t_id.value = \"$value\"");
		}
		*/
];

// Callbacks taking ($mysqli, $bulkIds); presumably run after a bulk insertion - confirm against the caller.
$postBulkInsertionFunctions = [
	// Fills t_post_interactions for the given doc ids: every t_parent_id row whose docId is in
	// $bulkIds is joined to the t_id rows holding the same value. Does nothing for an empty list.
	// NOTE(review): the ids are interpolated straight into the SQL text; presumably they are
	// numeric doc ids produced internally - confirm.
	function ($mysqli, $bulkIds) {
		if (empty($bulkIds)) {
			return;
		}

		$bulkIdsStr = implode(", ", $bulkIds);
		sqlQuery($mysqli, "INSERT INTO t_post_interactions (docId,value)
		SELECT t_id.docId, t_parent_id.docId FROM t_id, t_parent_id
		WHERE t_parent_id.docId IN ($bulkIdsStr) AND t_id.value = t_parent_id.value");
	},
];

// Per-table SQL scripts. For a table listed here, these statements are executed INSTEAD of
// the normal code for $dropdownArr in createDB.
$specialTableCreationScript = [
	// tv_parent_author is a view: for each t_post_interactions row it exposes the row's value
	// as docId, together with the file_author value of the row's own docId.
	"tv_parent_author" => [
		"DROP VIEW IF EXISTS tv_parent_author",
		"CREATE VIEW tv_parent_author AS
		SELECT t_post_interactions.value AS docId, file_author.value
		FROM t_post_interactions INNER JOIN file_author
		ON t_post_interactions.docId = file_author.docId",
		// Disabled alternative for the body of the view:
		//SELECT p.docId, a.value
		//FROM file_author a, t_parent_id p, t_id i
		//WHERE p.value = i.value AND i.docId = a.docId"
	],
];


// Tracx extra data: "table-name" => "Label in UI".
$tracxExtradata = [
    "t_type" => "#Type",
    "t_conversation" => "Conversation",
    "t_relevancy_reason" => "Relevancy Reason",
    "t_mention" => "Mention",
    "t_publish_date" => "Publish Date",
    "t_network" => "Network",
    "t_language" => "Tracx Language",
    "t_total_interactions" => "Total Interactions",
    "t_sentiment" => "Tracx Sentiment",
    "t_tags" => "Tracx Tags",
    "t_workflow_stage" => "Workflow Stage",
    "t_ethnicities" => "Entities",
    "t_country" => "Tracx Country",
    "t_state" => "Tracx State",
    "t_city" => "Tracx City",
    "t_age" => "Author's Age",
    "t_gender" => "Author's Gender",
    "t_members" => "Author's Members",
    "t_score" => "Author's Score",
    "t_author_tags" => "Author's Tags",
    "t_author_id" => "Author's ID",
    "t_id" => "ID num",
    "t_parent_id" => "ParentID",

    // Not part of the metadata itself: calculated from the parent id.
    "t_post_interactions" => "Post Interactions",
];

// Tracx keys paired with the table they go to.
$tracxExtradataKey2table = [
    "Sentiment" => "t_sentiment",
    "Language" => "t_language",
    "Tags" => "t_tags",
    "Country" => "t_country",
    "State" => "t_state",
    "City" => "t_city",
];

$scale_fields = [
    "currency",
    "mood",
    "positive",
    "negative",
];

$phones_accounts = [
    "phone",
    "bankAccount",
    "address",
];

// Combine the Tracx maps with the generic ones. The Tracx entries come first; for a string key
// present in both arrays, array_merge keeps the value from the second (generic) array.
$extraColumns = array_merge($tracxExtradata, $extraColumns);
$extraDataKey2Table = array_merge($tracxExtradataKey2table, $extraDataKey2Table);


// Optional hook: a project config that wants to adjust the defaults of clusterConfig.php can
// define postClusterConfigStep(), which is called here when it exists.
if (function_exists('postClusterConfigStep')) {
    postClusterConfigStep();
}


	
// Type names that are not aggregated but are treated as if they were (as the variable name says).
$nonAggregatedTreatedAsAggregated = [
    "Currency",
    "Email",
    "Telephone-number",
    "Full-address",
    "Swift-code",
    "Twitter-hashtag",
    "Twitter-user-name",
    "Url",
    "Iban-account",
    "Credit-card-number",
    "Distance-quantity-number-object",
    "Percentage-amount-object",
    "Weight-amount-object",
];

// Columns to leave out when fetching the columns of a table without extra data.
$ignore_extra_cols = [
    "id",
    "docId",
    "conceptUrl",
    "conceptName",
    "translation",
    "source",
    "caption",
];
    