/*
 * Copyright (C) 2008, Nokia
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA  02110-1301, USA.
 */

#include "config.h"

/* NOTE(review): the names of the ten headers below were missing from the
 * text under review (only bare `#include` tokens were left). They have
 * been restored on a best-effort basis from the symbols this file uses
 * (strrchr/strcmp, _exit, GModule, _(), GFile/GTask, seccomp and extract
 * info helpers) -- confirm the exact list against the build.
 */
#include <string.h>
#include <unistd.h>

#include <gmodule.h>
#include <glib/gi18n.h>
#include <gio/gio.h>

#include <gio/gunixoutputstream.h>
#include <gio/gunixinputstream.h>
#include <gio/gunixfdlist.h>

#include <libtracker-common/tracker-common.h>

#include <libtracker-extract/tracker-extract.h>

#include "tracker-extract.h"
#include "tracker-main.h"

#ifdef THREAD_ENABLE_TRACE
#warning Main thread traces enabled
#endif /* THREAD_ENABLE_TRACE */

#define TRACKER_EXTRACT_GET_PRIVATE(obj) (G_TYPE_INSTANCE_GET_PRIVATE ((obj), TRACKER_TYPE_EXTRACT, TrackerExtractPrivate))

G_DEFINE_QUARK (TrackerExtractError, tracker_extract_error)

extern gboolean debug;

/* Per-module counters, stored in TrackerExtractPrivate.statistics_data
 * keyed by the GModule pointer.
 */
typedef struct {
	gint extracted_count;
	gint failed_count;
} StatisticsData;

typedef struct {
	GHashTable *statistics_data;
	GList *running_tasks;

	/* used to maintain the running tasks
	 * and stats from different threads
	 */
	GMutex task_mutex;

	/* Thread pool for multi-threaded extractors */
	GThreadPool *thread_pool;

	/* module -> async queue hashtable
	 * for single-threaded extractors
	 */
	GHashTable *single_thread_extractors;

	gboolean disable_shutdown;
	gboolean disable_summary_on_finalize;

	gchar *force_module;

	gint unhandled_count;
} TrackerExtractPrivate;

/* State of one extraction request while it travels between the main
 * loop (dispatch_task_cb) and the extractor threads (get_metadata).
 */
typedef struct {
	TrackerExtract *extract;
	GCancellable *cancellable;
	GAsyncResult *res;
	gchar *file;
	gchar *mimetype;

	TrackerMimetypeInfo *mimetype_handlers;

	/* to be fed from mimetype_handlers */
	TrackerExtractMetadataFunc cur_func;
	GModule *cur_module;

	guint signal_id;
	guint success : 1;
} TrackerExtractTask;

static void     tracker_extract_finalize (GObject            *object);
static void     report_statistics        (GObject            *object);
static gboolean get_metadata             (TrackerExtractTask *task);
static gboolean dispatch_task_cb         (TrackerExtractTask *task);

G_DEFINE_TYPE(TrackerExtract, tracker_extract, G_TYPE_OBJECT)

/* GObject class initializer: installs the finalizer and registers the
 * private instance structure.
 */
static void
tracker_extract_class_init (TrackerExtractClass *klass)
{
	GObjectClass *object_class;

	object_class = G_OBJECT_CLASS (klass);

	object_class->finalize = tracker_extract_finalize;

	g_type_class_add_private (object_class, sizeof (TrackerExtractPrivate));
}

/* Value destroy notifier for the statistics hash table. */
static void
statistics_data_free (StatisticsData *data)
{
	g_slice_free (StatisticsData, data);
}

/* Instance initializer: creates the statistics table, the
 * module -> queue table, the thread pool running get_metadata() and
 * the mutex guarding tasks and statistics.
 */
static void
tracker_extract_init (TrackerExtract *object)
{
	TrackerExtractPrivate *priv;

	priv = TRACKER_EXTRACT_GET_PRIVATE (object);

	/* Keys are GModule pointers (direct hash), values are owned */
	priv->statistics_data = g_hash_table_new_full (NULL, NULL, NULL,
	                                               (GDestroyNotify) statistics_data_free);
	priv->single_thread_extractors = g_hash_table_new (NULL, NULL);
	priv->thread_pool = g_thread_pool_new ((GFunc) get_metadata,
	                                       NULL, 10, TRUE, NULL);

	g_mutex_init (&priv->task_mutex);
}

/* Finalizer: tears down the containers created in tracker_extract_init(),
 * optionally prints the statistics summary, and releases the forced
 * module name copied in tracker_extract_new().
 */
static void
tracker_extract_finalize (GObject *object)
{
	TrackerExtractPrivate *priv;

	priv = TRACKER_EXTRACT_GET_PRIVATE (object);

	/* FIXME: Shutdown modules? */

	g_hash_table_destroy (priv->single_thread_extractors);
	g_thread_pool_free (priv->thread_pool, TRUE, FALSE);

	/* Must run before statistics_data and the mutex go away, as
	 * report_statistics() uses both.
	 */
	if (!priv->disable_summary_on_finalize) {
		report_statistics (object);
	}

	g_hash_table_destroy (priv->statistics_data);

	/* Owned copy made by g_strdup() in tracker_extract_new() */
	g_free (priv->force_module);

	g_mutex_clear (&priv->task_mutex);

	G_OBJECT_CLASS (tracker_extract_parent_class)->finalize (object);
}

/* Logs, under task_mutex, the extracted/failed counters of every module
 * that handled at least one file, plus the number of unhandled files.
 */
static void
report_statistics (GObject *object)
{
	TrackerExtractPrivate *priv;
	GHashTableIter iter;
	gpointer key, value;

	priv = TRACKER_EXTRACT_GET_PRIVATE (object);

	g_mutex_lock (&priv->task_mutex);

	g_message ("--------------------------------------------------");
	g_message ("Statistics:");

	g_hash_table_iter_init (&iter, priv->statistics_data);

	while (g_hash_table_iter_next (&iter, &key, &value)) {
		GModule *module = key;
		StatisticsData *data = value;

		if (data->extracted_count > 0 || data->failed_count > 0) {
			const gchar *name, *separator, *name_without_path;

			name = g_module_name (module);

			/* strrchr() returns NULL when the name carries no
			 * directory part; fall back to the whole name rather
			 * than doing arithmetic on a NULL pointer.
			 */
			separator = strrchr (name, G_DIR_SEPARATOR);
			name_without_path = separator ? separator + 1 : name;

			g_message (" Module:'%s', extracted:%d, failures:%d",
			           name_without_path,
			           data->extracted_count,
			           data->failed_count);
		}
	}

	g_message ("Unhandled files: %d", priv->unhandled_count);

	if (priv->unhandled_count == 0 &&
	    g_hash_table_size (priv->statistics_data) < 1) {
		g_message (" No files handled");
	}

	g_message ("--------------------------------------------------");

	g_mutex_unlock (&priv->task_mutex);
}

/* Creates a new TrackerExtract.
 *
 * @disable_shutdown: stored in the private data as-is.
 * @force_module: optional module name used by filter_module() to
 *   restrict extraction to a single module; copied.
 *
 * Returns NULL if the extract module manager fails to initialize.
 */
TrackerExtract *
tracker_extract_new (gboolean     disable_shutdown,
                     const gchar *force_module)
{
	TrackerExtract *object;
	TrackerExtractPrivate *priv;

	if (!tracker_extract_module_manager_init ()) {
		return NULL;
	}

	/* Set extractors */
	object = g_object_new (TRACKER_TYPE_EXTRACT, NULL);

	priv = TRACKER_EXTRACT_GET_PRIVATE (object);

	priv->disable_shutdown = disable_shutdown;
	priv->force_module = g_strdup (force_module);

	return object;
}

/* Updates the statistics for a finished task and removes it from the
 * list of running tasks. Tasks without a current module are counted as
 * unhandled.
 */
static void
notify_task_finish (TrackerExtractTask *task,
                    gboolean            success)
{
	TrackerExtract *extract;
	TrackerExtractPrivate *priv;
	StatisticsData *stats_data;

	extract = task->extract;
	priv = TRACKER_EXTRACT_GET_PRIVATE (extract);

	/* Reports and ongoing tasks may be
	 * accessed from other threads.
	 */
	g_mutex_lock (&priv->task_mutex);

	if (task->cur_module) {
		stats_data = g_hash_table_lookup (priv->statistics_data,
		                                  task->cur_module);

		if (!stats_data) {
			stats_data = g_slice_new0 (StatisticsData);
			g_hash_table_insert (priv->statistics_data,
			                     task->cur_module,
			                     stats_data);
		}

		stats_data->extracted_count++;

		if (!success) {
			stats_data->failed_count++;
		}
	} else {
		priv->unhandled_count++;
	}

	priv->running_tasks = g_list_remove (priv->running_tasks, task);

	g_mutex_unlock (&priv->task_mutex);
}

/* Runs the task's current extractor function on the task's file.
 *
 * On success, *info_out receives a TrackerExtractInfo the caller must
 * unref and TRUE is returned. Otherwise *info_out is NULL and FALSE is
 * returned. A task with a NULL or empty MIME type fails immediately.
 */
static gboolean
get_file_metadata (TrackerExtractTask  *task,
                   TrackerExtractInfo **info_out)
{
	TrackerExtractInfo *info;
	GFile *file;

	*info_out = NULL;

	/* Without a MIME type there is nothing to extract; bail out
	 * before building any object for the file.
	 */
	if (!task->mimetype || !*task->mimetype) {
		return FALSE;
	}

	file = g_file_new_for_uri (task->file);
	info = tracker_extract_info_new (file, task->mimetype);
	g_object_unref (file);

	/* Now we have sanity checked everything, actually get the
	 * data we need from the extractors.
	 */
	if (task->cur_func) {
		g_debug ("Using %s...",
		         task->cur_module ?
		         g_module_name (task->cur_module) :
		         "Dummy extraction");

		task->success = (task->cur_func) (info);
	}

	if (!task->success) {
		tracker_extract_info_unref (info);
		info = NULL;
	}

	*info_out = info;

	return task->success;
}

/* This function is called on the thread calling g_cancellable_cancel() */
static void
task_cancellable_cancelled_cb (GCancellable       *cancellable,
                               TrackerExtractTask *task)
{
	TrackerExtractPrivate *priv;
	TrackerExtract *extract;

	extract = task->extract;
	priv = TRACKER_EXTRACT_GET_PRIVATE (extract);

	g_mutex_lock (&priv->task_mutex);

	/* A task still listed as running cannot be interrupted here,
	 * so the whole process is terminated instead.
	 */
	if (g_list_find (priv->running_tasks, task)) {
		g_message ("Cancelled task for '%s' was currently being "
		           "processed, _exit()ing immediately",
		           task->file);
		_exit (0);
	}

	g_mutex_unlock (&priv->task_mutex);
}

/* Allocates a task for @uri.
 *
 * When @mimetype is NULL or empty, the content type is queried through
 * GIO; a failure of that query is propagated through @error and NULL is
 * returned. @cancellable and @res are optional and referenced when set.
 */
static TrackerExtractTask *
extract_task_new (TrackerExtract *extract,
                  const gchar    *uri,
                  const gchar    *mimetype,
                  GCancellable   *cancellable,
                  GAsyncResult   *res,
                  GError        **error)
{
	TrackerExtractTask *task;
	gchar *mimetype_used;

	if (!mimetype || !*mimetype) {
		GFile *file;
		GFileInfo *info;
		GError *internal_error = NULL;

		file = g_file_new_for_uri (uri);
		info = g_file_query_info (file,
		                          G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE,
		                          G_FILE_QUERY_INFO_NONE,
		                          NULL,
		                          &internal_error);

		g_object_unref (file);

		if (internal_error) {
			g_propagate_error (error, internal_error);
			return NULL;
		}

		mimetype_used = g_strdup (g_file_info_get_content_type (info));
		g_object_unref (info);
		g_message ("MIME type guessed as '%s' (from GIO)", mimetype_used);
	} else {
		mimetype_used = g_strdup (mimetype);
		g_message ("MIME type passed to us as '%s'", mimetype_used);
	}

	task = g_slice_new0 (TrackerExtractTask);
	task->cancellable = (cancellable) ? g_object_ref (cancellable) : NULL;
	task->res = (res) ? g_object_ref (res) : NULL;
	task->file = g_strdup (uri);
	task->mimetype = mimetype_used;
	task->extract = extract;

	if (task->cancellable) {
		task->signal_id = g_cancellable_connect (cancellable,
		                                         G_CALLBACK (task_cancellable_cancelled_cb),
		                                         task, NULL);
	}

	return task;
}

/* Disconnects the cancellation handler, records the task outcome via
 * notify_task_finish() and releases everything the task owns.
 */
static void
extract_task_free (TrackerExtractTask *task)
{
	if (task->cancellable && task->signal_id != 0) {
		g_cancellable_disconnect (task->cancellable, task->signal_id);
	}

	notify_task_finish (task, task->success);

	if (task->res) {
		g_object_unref (task->res);
	}

	if (task->cancellable) {
		g_object_unref (task->cancellable);
	}

	if (task->mimetype_handlers) {
		tracker_mimetype_info_free (task->mimetype_handlers);
	}

	g_free (task->mimetype);
	g_free (task->file);

	g_slice_free (TrackerExtractTask, task);
}

/* Tells whether @module must be skipped because a different module was
 * forced at construction time.
 *
 * Returns FALSE when @module is NULL or no module is forced. The forced
 * name is used verbatim when it already looks like a library file name
 * ("lib" prefix and platform module suffix); otherwise it is expanded to
 * "libextract-<name>.<suffix>".
 */
static gboolean
filter_module (TrackerExtract *extract,
               GModule        *module)
{
	TrackerExtractPrivate *priv;
	gchar *module_basename, *filter_name;
	gboolean filter;

	if (!module) {
		return FALSE;
	}

	priv = TRACKER_EXTRACT_GET_PRIVATE (extract);

	if (!priv->force_module) {
		return FALSE;
	}

	/* Module name is the full path to it */
	module_basename = g_path_get_basename (g_module_name (module));

	if (g_str_has_prefix (priv->force_module, "lib") &&
	    g_str_has_suffix (priv->force_module, "." G_MODULE_SUFFIX)) {
		filter_name = g_strdup (priv->force_module);
	} else {
		/* Use the platform suffix here too, so both branches
		 * agree on what a module file name looks like.
		 */
		filter_name = g_strdup_printf ("libextract-%s." G_MODULE_SUFFIX,
		                               priv->force_module);
	}

	filter = strcmp (module_basename, filter_name) != 0;

	if (filter) {
		g_debug ("Module filtered out '%s' (due to --force-module='%s')",
		         module_basename,
		         filter_name);
	} else {
		g_debug ("Module used '%s' (due to --force-module='%s')",
		         module_basename,
		         filter_name);
	}

	g_free (module_basename);
	g_free (filter_name);

	return filter;
}

/* Runs the task's current module. On success the result is handed to
 * the task's GTask and the task is freed; if the module is filtered out
 * or extraction fails, the task is sent back to dispatch_task_cb() so
 * the next module can be tried. Always returns FALSE.
 */
static gboolean
get_metadata (TrackerExtractTask *task)
{
	TrackerExtractInfo *info;

#ifdef THREAD_ENABLE_TRACE
	g_debug ("Thread:%p --> '%s': Collected metadata",
	         g_thread_self (),
	         task->file);
#endif /* THREAD_ENABLE_TRACE */

	if (g_task_return_error_if_cancelled (G_TASK (task->res))) {
		extract_task_free (task);
		return FALSE;
	}

	if (!filter_module (task->extract, task->cur_module) &&
	    get_file_metadata (task, &info)) {
		g_task_return_pointer (G_TASK (task->res), info,
		                       (GDestroyNotify) tracker_extract_info_unref);
		extract_task_free (task);
	} else {
		/* Reinject the task into the main thread
		 * queue, so the next module kicks in.
		 */
		g_idle_add ((GSourceFunc) dispatch_task_cb, task);
	}

	return FALSE;
}

/* Thread body of a dedicated extractor thread: after initializing
 * seccomp, pops tasks from @queue forever and runs them.
 */
static gpointer
single_thread_get_metadata (GAsyncQueue *queue)
{
	if (!tracker_seccomp_init ())
		g_assert_not_reached ();

	while (TRUE) {
		TrackerExtractTask *task;

		task = g_async_queue_pop (queue);
#ifdef THREAD_ENABLE_TRACE
		g_debug ("Thread:%p --> '%s': Dispatching in dedicated thread",
		         g_thread_self(), task->file);
#endif /* THREAD_ENABLE_TRACE */
		get_metadata (task);
	}

	return NULL;
}

/* This function is executed in the main thread, decides the
 * module that's going to be run for a given task, and dispatches
 * the task according to the threading strategy of that module.
 *
 * Every failure path completes the task's GTask with an error before
 * the task is freed. Always returns FALSE so the idle source that
 * invoked it is removed.
 */
static gboolean
dispatch_task_cb (TrackerExtractTask *task)
{
	TrackerExtractPrivate *priv;
	GError *error = NULL;
	GAsyncQueue *async_queue;
	GModule *module;

#ifdef THREAD_ENABLE_TRACE
	g_debug ("Thread:%p (Main) <-- '%s': Handling task...\n",
	         g_thread_self (),
	         task->file);
#endif /* THREAD_ENABLE_TRACE */

	priv = TRACKER_EXTRACT_GET_PRIVATE (task->extract);

	if (!task->mimetype) {
		error = g_error_new (tracker_extract_error_quark (),
		                     TRACKER_EXTRACT_ERROR_NO_MIMETYPE,
		                     "No mimetype for '%s'",
		                     task->file);
	} else if (!task->mimetype_handlers) {
		/* First iteration for task, get the mimetype handlers */
		task->mimetype_handlers = tracker_extract_module_manager_get_mimetype_handlers (task->mimetype);

		if (!task->mimetype_handlers) {
			error = g_error_new (tracker_extract_error_quark (),
			                     TRACKER_EXTRACT_ERROR_NO_EXTRACTOR,
			                     "No mimetype extractor handlers for uri:'%s' and mime:'%s'",
			                     task->file, task->mimetype);
		}
	} else if (!tracker_mimetype_info_iter_next (task->mimetype_handlers)) {
		/* Any further iteration, should happen rarely if
		 * most specific handlers know nothing about the file
		 */
		g_message ("There's no next extractor");

		error = g_error_new (tracker_extract_error_quark (),
		                     TRACKER_EXTRACT_ERROR_NO_EXTRACTOR,
		                     "Could not get any metadata for uri:'%s' and mime:'%s'",
		                     task->file, task->mimetype);
	} else {
		g_message ("Trying next extractor for '%s'", task->file);
	}

	if (error) {
		g_task_return_error (G_TASK (task->res), error);
		extract_task_free (task);

		return FALSE;
	}

	task->cur_module = module = tracker_mimetype_info_get_module (task->mimetype_handlers,
	                                                              &task->cur_func);

	if (!task->cur_func) {
		g_warning ("Discarding task, no module able to handle '%s'", task->file);

		/* Statistics are also updated from extractor threads
		 * (see notify_task_finish), so take the same lock.
		 */
		g_mutex_lock (&priv->task_mutex);
		priv->unhandled_count++;
		g_mutex_unlock (&priv->task_mutex);

		/* Complete the async operation, otherwise the caller's
		 * callback would never be invoked for this file.
		 */
		error = g_error_new (tracker_extract_error_quark (),
		                     TRACKER_EXTRACT_ERROR_NO_EXTRACTOR,
		                     "No module able to handle uri:'%s' and mime:'%s'",
		                     task->file, task->mimetype);
		g_task_return_error (G_TASK (task->res), error);
		extract_task_free (task);

		return FALSE;
	}

	async_queue = g_hash_table_lookup (priv->single_thread_extractors, module);

	if (!async_queue) {
		GThread *thread;

		/* No thread created yet for this module, create it
		 * together with the async queue used to pass data to it
		 */
		async_queue = g_async_queue_new ();

		thread = g_thread_try_new ("single",
		                           (GThreadFunc) single_thread_get_metadata,
		                           g_async_queue_ref (async_queue),
		                           &error);
		if (!thread) {
			/* The thread never started: drop both the
			 * reference taken on its behalf and our own.
			 */
			g_async_queue_unref (async_queue);
			g_async_queue_unref (async_queue);

			g_task_return_error (G_TASK (task->res), error);
			extract_task_free (task);

			return FALSE;
		}

		/* We won't join the thread, so just unref it here */
		g_thread_unref (thread);

		g_hash_table_insert (priv->single_thread_extractors, module, async_queue);
	}

	g_async_queue_push (async_queue, task);

	return FALSE;
}

/* This function can be called in any thread
 *
 * Starts an asynchronous extraction of @file. @mimetype may be NULL or
 * empty, in which case it is guessed. @cb is invoked with the result,
 * to be collected with tracker_extract_file_finish().
 */
void
tracker_extract_file (TrackerExtract      *extract,
                      const gchar         *file,
                      const gchar         *mimetype,
                      GCancellable        *cancellable,
                      GAsyncReadyCallback  cb,
                      gpointer             user_data)
{
	GError *error = NULL;
	TrackerExtractTask *task;
	GTask *async_task;

	g_return_if_fail (TRACKER_IS_EXTRACT (extract));
	g_return_if_fail (file != NULL);
	g_return_if_fail (cb != NULL);

#ifdef THREAD_ENABLE_TRACE
	g_debug ("Thread:%p <-- '%s': Processing file\n",
	         g_thread_self (),
	         file);
#endif /* THREAD_ENABLE_TRACE */

	async_task = g_task_new (extract, cancellable, cb, user_data);

	task = extract_task_new (extract, file, mimetype, cancellable,
	                         G_ASYNC_RESULT (async_task), &error);

	if (error) {
		g_warning ("Could not get mimetype, %s", error->message);
		g_task_return_error (async_task, error);
	} else {
		TrackerExtractPrivate *priv;

		priv = TRACKER_EXTRACT_GET_PRIVATE (task->extract);

		g_mutex_lock (&priv->task_mutex);
		priv->running_tasks = g_list_prepend (priv->running_tasks, task);
		g_mutex_unlock (&priv->task_mutex);

		g_idle_add ((GSourceFunc) dispatch_task_cb, task);
	}

	/* Task takes a ref and if this fails, we want to unref anyway */
	g_object_unref (async_task);
}

/* Synchronously extracts metadata for @uri and prints it to stdout in
 * @output_format (SPARQL update or Turtle). Modules are tried in turn
 * until one yields data; if none does, a message is printed to stderr.
 * Also disables the statistics summary printed at finalization.
 */
void
tracker_extract_get_metadata_by_cmdline (TrackerExtract             *object,
                                         const gchar                *uri,
                                         const gchar                *mime,
                                         TrackerSerializationFormat  output_format)
{
	GError *error = NULL;
	TrackerExtractPrivate *priv;
	TrackerExtractTask *task;
	TrackerExtractInfo *info;
	gboolean no_data_or_modules = TRUE;

	/* Validate arguments before touching any state */
	g_return_if_fail (uri != NULL);

	priv = TRACKER_EXTRACT_GET_PRIVATE (object);
	priv->disable_summary_on_finalize = TRUE;

	task = extract_task_new (object, uri, mime, NULL, NULL, &error);

	if (error) {
		g_printerr ("%s, %s\n",
		            _("Metadata extraction failed"),
		            error->message);
		g_error_free (error);

		return;
	}

	task->mimetype_handlers = tracker_extract_module_manager_get_mimetype_handlers (task->mimetype);

	if (task->mimetype_handlers) {
		task->cur_module = tracker_mimetype_info_get_module (task->mimetype_handlers,
		                                                     &task->cur_func);
	}

	while (task->cur_func) {
		if (!filter_module (object, task->cur_module) &&
		    get_file_metadata (task, &info)) {
			TrackerResource *resource = tracker_extract_info_get_resource (info);

			if (resource == NULL) {
				/* Nothing to print, but the info obtained
				 * from get_file_metadata() is still ours.
				 */
				tracker_extract_info_unref (info);
				break;
			}

			no_data_or_modules = FALSE;

			if (output_format == TRACKER_SERIALIZATION_FORMAT_SPARQL) {
				char *text;

				/* If this was going into the tracker-store we'd generate a unique ID
				 * here, so that the data persisted across file renames.
				 */
				tracker_resource_set_identifier (resource, uri);

				text = tracker_resource_print_sparql_update (resource, NULL, NULL);

				g_print ("%s\n", text);

				g_free (text);
			} else if (output_format == TRACKER_SERIALIZATION_FORMAT_TURTLE) {
				char *turtle;

				/* If this was going into the tracker-store we'd generate a unique ID
				 * here, so that the data persisted across file renames.
				 */
				tracker_resource_set_identifier (resource, uri);

				turtle = tracker_resource_print_turtle (resource, NULL);

				if (turtle) {
					g_print ("%s\n", turtle);

					g_free (turtle);
				}
			}

			tracker_extract_info_unref (info);
			break;
		} else {
			if (!tracker_mimetype_info_iter_next (task->mimetype_handlers)) {
				break;
			}

			task->cur_module = tracker_mimetype_info_get_module (task->mimetype_handlers,
			                                                     &task->cur_func);
		}
	}

	if (no_data_or_modules) {
		g_printerr ("%s: %s\n",
		            uri,
		            _("No metadata or extractor modules found to handle this file"));
	}

	extract_task_free (task);
}

/* Collects the result of tracker_extract_file().
 *
 * Returns the pointer stored in @res by the extraction, or NULL with
 * @error set when the operation failed.
 */
TrackerExtractInfo *
tracker_extract_file_finish (TrackerExtract  *extract,
                             GAsyncResult    *res,
                             GError         **error)
{
	g_return_val_if_fail (TRACKER_IS_EXTRACT (extract), NULL);
	g_return_val_if_fail (G_IS_ASYNC_RESULT (res), NULL);
	g_return_val_if_fail (!error || !*error, NULL);

	return g_task_propagate_pointer (G_TASK (res), error);
}

-