From 1759d38a97c8dd27d3b4082d393a44406f22503d Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Fri, 16 Feb 2024 08:57:21 -0500 Subject: [PATCH 01/44] small updates to url-checker yml file exclusions --- .github/workflows/urls-checker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/urls-checker.yml b/.github/workflows/urls-checker.yml index 0f48e8452..1176abd16 100644 --- a/.github/workflows/urls-checker.yml +++ b/.github/workflows/urls-checker.yml @@ -37,7 +37,7 @@ jobs: retry_count: 3 # A comma separated patterns to exclude during URL checks - exclude_patterns: localhost,api,apis,rss,etc,xx,googleapis,hostname,graph.microsoft.com,login.microsoftonline.com + exclude_patterns: localhost,api,apis,rss,etc,xx,googleapis,hostname,snowflake,graph.microsoft.com,login.microsoftonline.com,my-host.com, # Exclude these files from the checker exclude_files: Swirl.postman_collection.json,docs/googlec95caf0bd4a8c5df.html,docs/Gemfile,docs/Gemfile.lock,docs/_config.yml,tests/,SearchProviders/ From 5d5d2bd9f448bea415a7a995afce0b25d5970730 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Fri, 16 Feb 2024 08:58:12 -0500 Subject: [PATCH 02/44] remove extra comma --- .github/workflows/urls-checker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/urls-checker.yml b/.github/workflows/urls-checker.yml index 1176abd16..57c946dc3 100644 --- a/.github/workflows/urls-checker.yml +++ b/.github/workflows/urls-checker.yml @@ -37,7 +37,7 @@ jobs: retry_count: 3 # A comma separated patterns to exclude during URL checks - exclude_patterns: localhost,api,apis,rss,etc,xx,googleapis,hostname,snowflake,graph.microsoft.com,login.microsoftonline.com,my-host.com, + exclude_patterns: localhost,api,apis,rss,etc,xx,googleapis,hostname,snowflake,graph.microsoft.com,login.microsoftonline.com,my-host.com # Exclude these files from the checker exclude_files: 
Swirl.postman_collection.json,docs/googlec95caf0bd4a8c5df.html,docs/Gemfile,docs/Gemfile.lock,docs/_config.yml,tests/,SearchProviders/ From b75a0cbe20bdc71060081a61525eb0b8d41ff4c9 Mon Sep 17 00:00:00 2001 From: dnicodemus-la Date: Tue, 20 Feb 2024 20:26:45 -0500 Subject: [PATCH 03/44] fix from dev to main --- swirl_server/settings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/swirl_server/settings.py b/swirl_server/settings.py index b70f21af2..de539d793 100644 --- a/swirl_server/settings.py +++ b/swirl_server/settings.py @@ -292,7 +292,7 @@ MICROSOFT_CLIENT_SECRET = env('MICROSOFT_CLIENT_SECRET') MICROSOFT_REDIRECT_URI = env('MICROSOFT_REDIRECT_URI') -CSRF_TRUSTED_ORIGINS = ['http://localhost:4200'] +CSRF_TRUSTED_ORIGINS = list(env('CSRF_TRUSTED_ORIGINS').split(',')) SWIRL_WRITE_PATH_DEF = 'stored_results' SWIRL_WRITE_PATH = env('SWIRL_WRITE_PATH', default=SWIRL_WRITE_PATH_DEF) @@ -308,4 +308,4 @@ }, }, } -ASGI_THREADS = 1000 \ No newline at end of file +ASGI_THREADS = 1000 From c61cc3ed4815a7f9986946ab7ce1ea359fe54c41 Mon Sep 17 00:00:00 2001 From: dnicodemus-la Date: Wed, 21 Feb 2024 11:26:29 -0500 Subject: [PATCH 04/44] add a default for CSRF trust --- .env.dist | 1 + 1 file changed, 1 insertion(+) diff --git a/.env.dist b/.env.dist index acf6c79ee..e10448687 100644 --- a/.env.dist +++ b/.env.dist @@ -11,3 +11,4 @@ SQL_PORT=5432 MICROSOFT_CLIENT_ID='' MICROSOFT_CLIENT_SECRET='' MICROSOFT_REDIRECT_URI='' +CSRF_TRUSTED_ORIGINS='http://localhost:8000' From 8f52c3165d766d518984111ce8273e107ffa9ad0 Mon Sep 17 00:00:00 2001 From: dnicodemus-la Date: Wed, 21 Feb 2024 17:31:41 -0500 Subject: [PATCH 05/44] clean CVE-2024-24680 and CVE-2023-50782 --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index fbfee0411..2a94e265c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,12 +28,12 @@ confection==0.1.4 constantly==23.10.4 coreapi==2.3.3 coreschema==0.0.4 
-cryptography==41.0.7 +cryptography==42.0.2 cssselect==1.2.0 cymem==2.0.8 daphne==4.0.0 distro==1.9.0 -Django==5.0.1 +Django==5.0.2 django-celery-beat==2.1.0 django-environ==0.11.2 django-rest-swagger==2.2.0 From 8b1f46691aa3a6b8f2dc6b16b16a0ed3bde8f735 Mon Sep 17 00:00:00 2001 From: dnicodemus <94639381+dnicodemus@users.noreply.github.com> Date: Wed, 21 Feb 2024 17:38:17 -0500 Subject: [PATCH 06/44] Revert "clean CVE-2024-24680 and CVE-2023-50782" --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2a94e265c..fbfee0411 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,12 +28,12 @@ confection==0.1.4 constantly==23.10.4 coreapi==2.3.3 coreschema==0.0.4 -cryptography==42.0.2 +cryptography==41.0.7 cssselect==1.2.0 cymem==2.0.8 daphne==4.0.0 distro==1.9.0 -Django==5.0.2 +Django==5.0.1 django-celery-beat==2.1.0 django-environ==0.11.2 django-rest-swagger==2.2.0 From d6ce4242260b9bbcd571fa9ef4cb60d44d03f953 Mon Sep 17 00:00:00 2001 From: dnicodemus-la Date: Thu, 22 Feb 2024 11:51:11 -0500 Subject: [PATCH 07/44] bump cryptography and Django, remove snowflake. 
--- .github/workflows/urls-checker.yml | 2 +- SearchProviders/company_snowflake.json | 30 ----- SearchProviders/preloaded.json | 30 ----- docs/Developer-Reference.md | 165 ++++++++++--------------- docs/User-Guide.md | 65 +++++----- requirements.txt | 65 +++++----- swirl/connectors/__init__.py | 1 - swirl/connectors/db_connector.py | 7 +- swirl/connectors/snowflake.py | 92 -------------- swirl/models.py | 3 +- 10 files changed, 128 insertions(+), 332 deletions(-) delete mode 100644 SearchProviders/company_snowflake.json delete mode 100644 swirl/connectors/snowflake.py diff --git a/.github/workflows/urls-checker.yml b/.github/workflows/urls-checker.yml index 57c946dc3..8d5ba78ab 100644 --- a/.github/workflows/urls-checker.yml +++ b/.github/workflows/urls-checker.yml @@ -37,7 +37,7 @@ jobs: retry_count: 3 # A comma separated patterns to exclude during URL checks - exclude_patterns: localhost,api,apis,rss,etc,xx,googleapis,hostname,snowflake,graph.microsoft.com,login.microsoftonline.com,my-host.com + exclude_patterns: localhost,api,apis,rss,etc,xx,googleapis,hostname,graph.microsoft.com,login.microsoftonline.com,my-host.com # Exclude these files from the checker exclude_files: Swirl.postman_collection.json,docs/googlec95caf0bd4a8c5df.html,docs/Gemfile,docs/Gemfile.lock,docs/_config.yml,tests/,SearchProviders/ diff --git a/SearchProviders/company_snowflake.json b/SearchProviders/company_snowflake.json deleted file mode 100644 index 804ce3016..000000000 --- a/SearchProviders/company_snowflake.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "name": "Free Company Records - Snowflake", - "active": false, - "default": false, - "authenticator": "", - "connector": "Snowflake", - "url": "", - "query_template": "SELECT {fields} FROM {table} WHERE {field1} ILIKE '%{query_string}%' AND NULLIF(TRIM(founded), '') IS NOT NULL ORDER BY TRY_TO_NUMBER(REGEXP_REPLACE(SPLIT_PART(size, '-', 1), '[^0-9]', '')) DESC;", - "post_query_template": {}, - "http_request_headers": {}, - 
"page_fetch_config_json": {}, - "query_processors": [ - "AdaptiveQueryProcessor" - ], - "query_mappings": "fields=*,sort_by_date=founded,table=FREECOMPANYDATASET,field1=name", - "result_grouping_field": "", - "result_processors": [ - "MappingResultProcessor", - "CosineRelevancyResultProcessor" - ], - "response_mappings": "", - "result_mappings": "title='{name} ({founded})',body='{name} was founded in {founded} in {country}. It has {size} employees and operates in the {industry} industry.',url='https://{linkedin_url}',date_published=founded,NO_PAYLOAD", - "results_per_query": 10, - "credentials": "::FREE_COMPANY_DATASET:COMPUTE_WH", - "eval_credentials": "", - "tags": [ - "Company", - "Snowflake" - ] -} \ No newline at end of file diff --git a/SearchProviders/preloaded.json b/SearchProviders/preloaded.json index 4392a1b1f..ac8df5317 100644 --- a/SearchProviders/preloaded.json +++ b/SearchProviders/preloaded.json @@ -1390,36 +1390,6 @@ "MongoDB" ] }, - { - "name": "Free Company Records - Snowflake", - "active": false, - "default": false, - "authenticator": "", - "connector": "Snowflake", - "url": "", - "query_template": "SELECT {fields} FROM {table} WHERE {field1} ILIKE '%{query_string}%' AND NULLIF(TRIM(founded), '') IS NOT NULL ORDER BY TRY_TO_NUMBER(REGEXP_REPLACE(SPLIT_PART(size, '-', 1), '[^0-9]', '')) DESC;", - "post_query_template": {}, - "http_request_headers": {}, - "page_fetch_config_json": {}, - "query_processors": [ - "AdaptiveQueryProcessor" - ], - "query_mappings": "fields=*,sort_by_date=founded,table=FREECOMPANYDATASET,field1=name", - "result_grouping_field": "", - "result_processors": [ - "MappingResultProcessor", - "CosineRelevancyResultProcessor" - ], - "response_mappings": "", - "result_mappings": "title='{name} ({founded})',body='{name} was founded in {founded} in {country}. 
It has {size} employees and operates in the {industry} industry.',url='https://{linkedin_url}',date_published=founded,NO_PAYLOAD", - "results_per_query": 10, - "credentials": "::FREE_COMPANY_DATASET:COMPUTE_WH", - "eval_credentials": "", - "tags": [ - "Company", - "Snowflake" - ] - }, { "name": "Entities - LittleSis.org", "active": false, diff --git a/docs/Developer-Reference.md b/docs/Developer-Reference.md index b2dda6461..c4c4785d4 100644 --- a/docs/Developer-Reference.md +++ b/docs/Developer-Reference.md @@ -22,28 +22,28 @@ This guide is intended to provide developers with detailed reference information The following table describes in more detail all the steps in the federation process, with the associated `status` and other important state information. -| Action | Module | Status | Notes | -| ---------- | ---------- | ---------- | ---------- | +| Action | Module | Status | Notes | +| ---------- | ---------- | ---------- | ---------- | | Search object created | views.py SearchViewSet.list() | Search.status:
NEW_SEARCH
UPDATE_SEARCH | Required:
`Search.query_string` | | Pre-processing | search.py search() | Search.status:
PRE_PROCESSING | Checks permissions
Loads the Search object | | Pre-query processing | search.py search() | Search.status:
PRE_QUERY_PROCESSING | Processes `Search.query_string` and updates `Search.query_string_processed` | | Federation | search.py search() | Search.status:
FEDERATING
FEDERATING_WAIT_*
FULL_RESULTS | Creates one Connector for each SearchProvider in the Search | -| Connector Init | connectors/connector.py
connectors/db_connector.py | Connector.status:
INIT
READY | Loads the Search and SearchProvider | +| Connector Init | connectors/connector.py
connectors/db_connector.py | Connector.status:
INIT
READY | Loads the Search and SearchProvider | | Connector Federate | federate() | Connector.status:
FEDERATING | | | Connector Query Processing| process_query() | FEDERATING | Process `Search.query_string_processed` and store in `Connector.query_string_to_provider` | | Connector Construct Query | construct_query() | FEDERATING | Take `Connector.query_string_to_provider` and create `Connector.query_to_provider` | | Connector Validate Query | validate_query() | FEDERATING | Returns "False" if `Connector.query_to_provider` is empty | -| Connector Execute Search | execute_search () | FEDERATING | Connect to the SearchProvider
Execute the search using `Search.query_to_provider`
Store the response in `Connector.response` | +| Connector Execute Search | execute_search () | FEDERATING | Connect to the SearchProvider
Execute the search using `Search.query_to_provider`
Store the response in `Connector.response` | | Connector Normalize Response | normalize_response() | FEDERATING | Transform `Connector.response` into JSON list of dicts
Store it in `Connector.results` | | Connector Process Results | process_results() | Connector.status:
FEDERATING
READY | Process `Connector.results` | | Connector Save Results | save_results() | Connector.status:
READY | Returns "True" | -| Post-result processing | search.py search() | Search.status:
POST_RESULT_PROCESSING
FULL_RESULTS_READY
FULL_UPDATE_READY | Runs the `post_result_processors`
Updates Result objects | +| Post-result processing | search.py search() | Search.status:
POST_RESULT_PROCESSING
FULL_RESULTS_READY
FULL_UPDATE_READY | Runs the `post_result_processors`
Updates Result objects | # `Search.Status` ## Normal States -| Status | Meaning | +| Status | Meaning | | ---------- | ---------- | | NEW_SEARCH | The search object is to be executed immediately | | UPDATE_SEARCH | The search object is to be updated immediately | @@ -56,21 +56,21 @@ The following table describes in more detail all the steps in the federation pro | PARTIAL_RESULTS | Swirl has received results from some providers, but not all | | POST_RESULT_PROCESSING | Swirl is performing post-result processing | | PARTIAL_RESULTS_READY | Swirl has processed results from responding providers | -| PARTIAL_UPDATE_READY | Swirl has processed updated results from responding providers | -| FULL_RESULTS_READY | Swirl has processed results for all specified providers | -| FULL_UPDATE_READY | Swirl has processed updated results for all specified providers | +| PARTIAL_UPDATE_READY | Swirl has processed updated results from responding providers | +| FULL_RESULTS_READY | Swirl has processed results for all specified providers | +| FULL_UPDATE_READY | Swirl has processed updated results for all specified providers | ## Error States -| Status | Meaning | +| Status | Meaning | | ---------- | ---------- | | ERR_DUPLICATE_RESULT_OBJECTS | More than one Result object was found; [contact support](#support) for assistance. | -| ERR_NEED_PERMISSION | The Django User did not have sufficient permissions to perform the requested operation. More: [Permissioning Normal Users](Admin-Guide.md#permissioning-normal-users) | +| ERR_NEED_PERMISSION | The Django User did not have sufficient permissions to perform the requested operation. 
More: [Permissioning Normal Users](Admin-Guide.md#permissioning-normal-users) | | ERR_NO_ACTIVE_SEARCHPROVIDERS | Search failed because no specified SearchProviders were active | | ERR_NO_RESULTS | Swirl has not received results from any source | | ERR_NO_SEARCHPROVIDERS | Search failed because no SearchProviders were specified | | ERR_RESULT_NOT_FOUND | A Result object that was expected to be found, was not; [contact support](#support) for assistance. | -| ERR_RESULT_PROCESSING | An error occurred during Result processing - check the `logs/celery-worker.log` for details | +| ERR_RESULT_PROCESSING | An error occurred during Result processing - check the `logs/celery-worker.log` for details | | ERR_SUBSCRIBE_PERMISSIONS | The user who created the Search object lacks permission to enable `subscribe` mode | # Key Module List @@ -88,7 +88,7 @@ A SearchProvider defines some searchable source. It includes metadata identifyin ## Properties -| Property | Description | Default Value (`Example Value`) | +| Property | Description | Default Value (`Example Value`) | | ---------- | ---------- | ---------- | | id | Unique identifier for the SearchProvider | Automatic (`1`) | | name | Human-readable name for the source | "" (`"Enterprise Search PSE"`) | @@ -98,7 +98,7 @@ A SearchProvider defines some searchable source. 
It includes metadata identifyin | date_updated | The time and date at which the SearchProvdier was updated | Automatic (`2022-02-29T18:03:02.716456Z`) | | active | Boolean setting: if `true` the SearchProvider is used, if `false` it is ignored when federating | false (`true`) | | default | Boolean setting: if `true` the SearchProvider will be queried for searches that don't specify a `searchprovider_list`; if `false`, the SearchProvider must be specified in the `searchprovider_list` | false (`true`) | -| connector | Name of the Connector to use for this source | "" (`"RequestsGet"`) | +| connector | Name of the Connector to use for this source | "" (`"RequestsGet"`) | | url | The URL or other string including file path needed by the Connector for this source; not validated | "" (`"https://www.googleapis.com/customsearch/v1"`) | | query_template | A string with optional variables in form `{variable}`; the Connector will bind the `query_template` with required data including the `url` and `query_string`, as well as any `query_mappings` or `credentials`, at runtime. Note this format is not yet used by the [Sqlite3 Connector](#sqlite3). 
| "" (`"{url}?q={query_string}"`) | | post_query_template | For the RequestsPost Connector: valid JSON and a marker for the query text which then sent as the POST body | "" (`"query": "{query_string}","limit": "100"`) | @@ -132,27 +132,27 @@ The only required property is a `query_string` with the actual text to be search ## Properties -| Property | Description | Default Value (`Example Value`) | +| Property | Description | Default Value (`Example Value`) | | ---------- | ---------- | ---------- | | id | Unique identifier for the Search | Automatic (`search_id=1`) | | owner | The username of the Django user that owns the object | logged-in user (`admin`) | | date_created | The time and date at which the Search was created | Automatic (`2022-02-28T17:55:04.811262Z`) | | date_updated | The time and date at which the Search was updated | Automatic (`2022-02-28T17:55:07.811262Z`) | -| query_string | The query to be federated - the only required field! | "" (`knowledge management`) | +| query_string | The query to be federated - the only required field! 
| "" (`knowledge management`) | | query_string_processed | The Search query, modified by any pre-query processing | "" ("") | -| sort | The type of search to be run | relevancy (`date`) | -| results_requested | The number of results, overall, the user has requested | 10 (`25`) | +| sort | The type of search to be run | relevancy (`date`) | +| results_requested | The number of results, overall, the user has requested | 10 (`25`) | | searchprovider_list | A list of the SearchProviders to search for this query; an empty list, the default, searches all sources | [] (`[ "Enterprise Search Engines - Google PSE" ]`) | | subscribe | If `True`, Swirl will update this Search as per the Celery-Beats schedule | False (`True`) | | status | The execution status of this search (see below) | NEW_SEARCH (`FULL_RESULTS_READY`) | | pre_query_processors | A list of processors to apply to the query before federation starts | "" (`[ "SpellcheckQueryProcessor" ]`) | | post_result_processors | A list of result processors to apply to the results after federation is complete | "" (`[ "DedupeByFieldPostResultProcessor", "CosineRelevancyPostResultProcessor" ]`) | | result_url | Link to the initial Result object for the Search which uses the `RelevancyMixer` | Automatic (`"http://localhost:8000/swirl/results?search_id=17&result_mixer=RelevancyMixer"`) | -| new_result_url | Link to the updated Result object for the search which uses the `RelevancyNewItemsMixer` | Automatic (`"http://localhost:8000/swirl/results?search_id=17&result_mixer=RelevancyNewItemsMixer"`) | -| messages | Messages from SearchProviders | "" (`Retrieved 1 of 1 results from: Document DB Search`) | -| result_mixer | The name of the Mixer object (see below) to use for ordering results | RoundRobinMixer (`Stack2Mixer`) | -| retention | The retention setting for this object; `0` = retain indefinitely; see [Search Expiration Service](Admin-Guide.md#search-expiration-service) for details | 0 (`2` for daily deletion) | -| tags | 
Parameter (string) that can be passed into a search and will be attached to the Search object that is stored in Swirl | "" (`{ "query_string": "knowledge management", "tags": ["max_length:50"] }`) | +| new_result_url | Link to the updated Result object for the search which uses the `RelevancyNewItemsMixer` | Automatic (`"http://localhost:8000/swirl/results?search_id=17&result_mixer=RelevancyNewItemsMixer"`) | +| messages | Messages from SearchProviders | "" (`Retrieved 1 of 1 results from: Document DB Search`) | +| result_mixer | The name of the Mixer object (see below) to use for ordering results | RoundRobinMixer (`Stack2Mixer`) | +| retention | The retention setting for this object; `0` = retain indefinitely; see [Search Expiration Service](Admin-Guide.md#search-expiration-service) for details | 0 (`2` for daily deletion) | +| tags | Parameter (string) that can be passed into a search and will be attached to the Search object that is stored in Swirl | "" (`{ "query_string": "knowledge management", "tags": ["max_length:50"] }`) | {: .highlight } There are some special Search tags that control query processing. For example, the `SW_RESULT_PROCESSOR_SKIP` Search tag can be used to skip a processor for the Search it is specified for: `SW_RESULT_PROCESSOR_SKIP:DedupeByFieldResultProcessor` @@ -172,21 +172,21 @@ There are some special Search tags that control query processing. For example, A Result object is the normalized, re-ranked result for a single Search, from a single SearchProvider. They are created at the end of the federated search process in response to the creation of a Search object. They are the only Swirl object that has a foreign key (`search.id`). -Only Connectors should create Result objects. +Only Connectors should create Result objects. -Developers are free to operate on individual Results as needed for their application. +Developers are free to operate on individual Results as needed for their application. 
However, the [goal of Swirl](index.md) (and federated search in general) is to provide unified results from all sources. Swirl uses Mixers to make this quick and easy. ## Properties -| Property | Description | `Example Value` | -| ---------- | ---------- | ---------- | +| Property | Description | `Example Value` | +| ---------- | ---------- | ---------- | | id | Unique identifier for the Result | `1` | | owner | The username of the Django user that owns the object | `admin` | | date_created | The time and date at which the Result was created. | `2022-02-28T17:55:04.811262Z` | | date_updated | The time and date at which the Result was updated | `2022-02-28T19:55:02.752962Z` | -| search_id | The `id` of the associated Search; there may be many Result objects with this `id` | `18` | +| search_id | The `id` of the associated Search; there may be many Result objects with this `id` | `18` | | searchprovider | The name value of the SearchProvider that provided this result list | `"OneDrive Files - Microsoft 365"` | | query_to_provider | The exact query sent to the SearchProvider | `https://www.googleapis.com/customsearch/v1?cx=google-search-engine-id&key=google-json-api-key&q=strategy` | | query_processors | The names of the Processors, specified in the SearchProvider, that processed the query | `"AdaptiveQueryProcessor"` | @@ -197,7 +197,7 @@ However, the [goal of Swirl](index.md) (and federated search in general) is to p | retrieved | The number of results Swirl retrieved from this SearchProvider for this query | `10` | | found | The total number of results reported by the SearchProvider for this query | `2309` | | time | The time it took for the SearchProvider to create this result set, in seconds | `1.9` | -| json_results | The normalized JSON results from this SearchProvider | (*See below*) | +| json_results | The normalized JSON results from this SearchProvider | (*See below*) | ## `json_results` @@ -253,14 +253,13 @@ The following table describes the included 
source Connectors: | PostgreSQL | Searches PostgreSQL database | `url` (connection parameters), `query_template`, `credentials` | | RequestsGet | Searches any web endpoint using HTTP/GET with JSON response, including Google PSE, SOLR, Northern Light and more (see below) | `url`, `credentials` | | RequestsPost | Searches any web endpoint using HTTP/POST with JSON response, including M365 | `url`, `credentials` | -| Snowflake | Searches Snowflake datasets | `credentials`, `database`, `warehouse` | | Sqlite3 | Searches SQLite3 databases | `url` (database file path), `query_template` | -Connectors are specified in, and configured by, SearchProvider objects. +Connectors are specified in, and configured by, SearchProvider objects. ## BigQuery -The [BigQuery connector](https://github.com/swirlai/swirl-search/blob/main/swirl/connectors/bigquery.py) uses the Google Cloud Python package. +The [BigQuery connector](https://github.com/swirlai/swirl-search/blob/main/swirl/connectors/bigquery.py) uses the Google Cloud Python package. The included [BigQuery SearchProvider](https://github.com/swirlai/swirl-search/blob/main/SearchProviders/funding_db_bigquery.json) is intended for use with the [Funding Data Set](#funding-data-set) but can be adapted to most any configuration. @@ -276,7 +275,7 @@ The included [BigQuery SearchProvider](https://github.com/swirlai/swirl-search/b ], "query_mappings": "fields=*,sort_by_date=fundedDate,table=funding.funding,field1=company,field2=city", "result_processors": [ - "MappingResultProcessor", + "MappingResultProcessor", "CosineRelevancyResultProcessor" ], "result_mappings": "title='{company}',body='{company} raised ${raisedamt} series {round} on {fundeddate}. 
The company is located in {city} {state} and has {numemps} employees.',url=id,date_published=fundeddate,NO_PAYLOAD", @@ -359,7 +358,7 @@ The following three Tags are available for use in the ChatGPT SearchProvider to "CHAT_QUERY_REWRITE_GUIDE:You are a helpful assistant that responds like a pirate captain" ``` -* `CHAT_QUERY_DO_FILTER`: Turn on or off the default internal filter of ChatGPT responses. +* `CHAT_QUERY_DO_FILTER`: Turn on or off the default internal filter of ChatGPT responses. ``` json "CHAT_QUERY_DO_FILTER:false" ``` @@ -532,13 +531,13 @@ As of Swirl 3.1.0, the included [Free Public DB](https://github.com/swirlai/swir ## PostgreSQL -The [PostgreSQL connector](https://github.com/swirlai/swirl-search/blob/main/swirl/connectors/postgresql.py) uses the [psycopg2](https://pypi.org/project/psycopg2/) driver. +The [PostgreSQL connector](https://github.com/swirlai/swirl-search/blob/main/swirl/connectors/postgresql.py) uses the [psycopg2](https://pypi.org/project/psycopg2/) driver. ### Installing the PostgreSQL Driver To use PostgreSQL with Swirl: -* Install [PostgreSQL](https://www.postgresql.org/) +* Install [PostgreSQL](https://www.postgresql.org/) * Modify the system PATH so that `pg_config` from the PostgreSQL distribution runs from the command line * Install `psycopg2` using `pip`: @@ -867,46 +866,6 @@ The [RequestsPost connector](https://github.com/swirlai/swirl-search/blob/main/s [Contact Support](#support) for help getting started. -## Snowflake - -The [Snowflake connector](https://github.com/swirlai/swirl-search/blob/main/swirl/connectors/snowflake.py) uses the `snowflake-connector-python` package to connect to a Snowflake instance. - -The included [Free Company Records](https://github.com/swirlai/swirl-search/blob/main/SearchProviders/company_snowflake.json) SearchProvider is configured to search the `FreeCompanyResearch` dataset available in the Snowflake Marketplace. 
- -``` json -{ - "name": "Free Company Records - Snowflake", - "active": false, - "default": false, - "authenticator": "", - "connector": "Snowflake", - "url": "", - "query_template": "SELECT {fields} FROM {table} WHERE {field1} ILIKE '%{query_string}%' AND NULLIF(TRIM(founded), '') IS NOT NULL ORDER BY TRY_TO_NUMBER(REGEXP_REPLACE(SPLIT_PART(size, '-', 1), '[^0-9]', '')) DESC;", - "post_query_template": {}, - "http_request_headers": {}, - "page_fetch_config_json": {}, - "query_processors": [ - "AdaptiveQueryProcessor" - ], - "query_mappings": "fields=*,sort_by_date=founded,table=FREECOMPANYDATASET,field1=name", - "result_grouping_field": "", - "result_processors": [ - "MappingResultProcessor", - "CosineRelevancyResultProcessor" - ], - "response_mappings": "", - "result_mappings": "title='{name} ({founded})',body='{name} was founded in {founded} in {country}. It has {size} employees and operates in the {industry} industry.',url='https://{linkedin_url}',date_published=founded,NO_PAYLOAD", - "results_per_query": 10, - "credentials": "::FREE_COMPANY_DATASET:COMPUTE_WH", - "eval_credentials": "", - "tags": [ - "Company", - "Snowflake" - ] -} -``` - -Note: Putting a fixed SQL query in the `query_template` is perfectly acceptable. Anything that doesn't change in the URL can be stored here. ## SQLite3 @@ -967,18 +926,18 @@ This pipeline removes duplicates from the result set prior to relevancy ranking. Query Processors operate queries. The exact field they operate on depends on how they are deployed. 
| Pipeline | Reads | Updates | -| ---------- | ---------- | ---------- | -| Search.pre_query_processors | `Search.query_string` | `Search.query_string_processed` | -| SearchProvider.query_processors | `Search.query_string_processed` | `.query_string_to_provider` | +| ---------- | ---------- | ---------- | +| Search.pre_query_processors | `Search.query_string` | `Search.query_string_processed` | +| SearchProvider.query_processors | `Search.query_string_processed` | `.query_string_to_provider` | This table describes the query processors included in Swirl: -| Processor | Description | Notes | -| ---------- | ---------- | ---------- | +| Processor | Description | Notes | +| ---------- | ---------- | ---------- | | AdaptiveQueryProcessor | Rewrites queries based on the `query_mappings` for a given SearchProvider | Should not be used as `pre_query_processor` | | ChatGPTQueryProcessor | This query processor asks ChatGPT to rewrite queries based on a configurable prompt. For example it can rewrite queries to be fuzzier, broader, more specific, boolean, or in another language. | Experimental | | GenericQueryProcessor | Removes special characters from the query | | -| SpellcheckQueryProcessor | Uses [TextBlob](https://textblob.readthedocs.io/en/dev/quickstart.html#spelling-correction) to predict and fix spelling errors in `query_string` | Best deployed in a `SearchProvider.query_processor` for sources that need it; not recommended with Google PSEs | +| SpellcheckQueryProcessor | Uses [TextBlob](https://textblob.readthedocs.io/en/dev/quickstart.html#spelling-correction) to predict and fix spelling errors in `query_string` | Best deployed in a `SearchProvider.query_processor` for sources that need it; not recommended with Google PSEs | | NoModQueryProcessor | Only removes leading SearchProvider Tags and does not modify the query terms in any way. | It is intended for repositories that allow non-search characters (such as brackets). 
| ## Result Processors @@ -987,15 +946,15 @@ Result Processors transform source results into the Swirl format defined in [swi The following table lists the Result Processors included with Swirl: -| Processor | Description | Notes | -| ---------- | ---------- | ---------- | +| Processor | Description | Notes | +| ---------- | ---------- | ---------- | | GenericResultProcessor | Copies results from source format to Swirl format by exact match on name | Recommended for sources that don't need mapping | | MappingResultProcessor | Transforms results from source format to Swirl format, using `SearchProvider.result_mappings` | Default | | LenLimitingResultProcessor | Checks if the `title` and `body` responses from a source exceed a configurable length (set in `swirl_server/settings.py`: `SWIRL_MAX_FIELD_LEN = 512`), truncates anything after that value, and adds an ellipsis ("..."). If the `body` field has been truncated, the processor reports the entire response in a new `body_full` field in the Payload. The default truncation length for can be overridden for a specific SearchProvider using a new Tag value (e.g. `max_length:256`). | Recommended for sources that consistently return lengthy title or body fields; should follow the `MappingResultProcessor`. | | CleanTextResultProcessor | Removes non-alphanumeric characters from the source response. It should be considered for lengthy responses where URLs or other HTML or Markdown syntax appear in results. | Should be installed before the `LenLimitingResultProcessor` when both are used. | | DateFinderResultProcessor | Looks for a date in any a number of formats in the body field of each result item. Should it find one, and the `date_published` for that item is `'unknown'`, it replaces `date_published` with the date extracted from the body, and notes this in the `result.messages`. | This processor can detect the following date formats:
`06/01/23`
`06/01/2023`
`06-01-23`
`06-01-2023`
`jun 1, 2023`
`june 1, 2023` |
-| AutomaticPayloadMapperResultProcessor | Profiles response data to find good strings for Swirl's `title`, `body`, and `date_published` fields. It is intended for SearchProviders that would otherwise have few (or no) good `result_mappings` options. | It should be place after the `MappingResultProcessor`. The `result_mappings` field should be blank, except for the optional DATASET directive, which will return only a single Swirl response for each provider response, with the original response in the `payload` field under the `dataset` key. |
-| RequireQueryStringInTitleResultProcessor | Drops results that do not contain the `query_string_to_provider` in the result `title` field. | It should be added after the `MappingResultProcessor` and is now included by default in the "LinkedIn - Google PSE" SearchProvider. |
+| AutomaticPayloadMapperResultProcessor | Profiles response data to find good strings for Swirl's `title`, `body`, and `date_published` fields. It is intended for SearchProviders that would otherwise have few (or no) good `result_mappings` options. | It should be placed after the `MappingResultProcessor`. The `result_mappings` field should be blank, except for the optional DATASET directive, which will return only a single Swirl response for each provider response, with the original response in the `payload` field under the `dataset` key. |
+| RequireQueryStringInTitleResultProcessor | Drops results that do not contain the `query_string_to_provider` in the result `title` field. | It should be added after the `MappingResultProcessor` and is now included by default in the "LinkedIn - Google PSE" SearchProvider. 
| ## Post Result Processors @@ -1013,8 +972,8 @@ The relevancy model is as follows: * Aggregates the similarity of: - * the entire query and the field, with the score being the highest in any single sentence (if any), - + * the entire query and the field, with the score being the highest in any single sentence (if any), + * the entire query and a window of text around the field match * the 1 and 2 word combinations in the query (if long enough), and a window of text around the field match @@ -1029,7 +988,7 @@ The relevancy model is as follows: * Normalizes the query executed by this SearchProvider vs. all the other queries in the set - this is reflected in the `query_length_adjust` in the `explain` structure. -The Swirl score is just that: a score. The higher a score is, the more contextually relevant the result is. Scores aren't comparable between queries or results. +The Swirl score is just that: a score. The higher a score is, the more contextually relevant the result is. Scores aren't comparable between queries or results. 
*Tip: to translate a result score to a confidence score, take the #1 result as 1.0, and then divide subsequent results by the score for that result to calculate the confidence.* @@ -1060,16 +1019,16 @@ The Galaxy UI will not display the correct number of results if this ResultProce The following table details the Result Mixers included with Swirl: | Mixer | Description | Notes | -| ---------- | ---------- | ---------- | +| ---------- | ---------- | ---------- | | RelevancyMixer | Organizes results by [relevancy](User-Guide.md#relevancy-ranking) score (descending), then source rank (ascending) | The default; depends on `relevancy_processor` being installed as the `search.post_result_processors` (also the default) | | RelevancyNewItemsMixer | Organizes results as above, but hiding results that don't have the `new` field as [created during Search updates](Developer-Guide.md#update-a-search) | This is the default for `search.new_result_url`| -| DateMixer | Organizes results by `date_published`. Results with "unknown" for `date_published` are omitted | Use when you want date sorted results | -| DateNewItemsMixer | Organizes results as above, but hiding results that don't have the `new` field as [created during Search updates](Developer-Guide.md#update-a-search) | This is the default for `search.new_result_url` when `search.date` is set to `sort` | -| RoundRobinMixer | Organizes results by taking 1 result from each responding SearchProvider, alternating; actually calls `Stack1Mixer` (see below) | Good for searches with `search.sort` set to "date" or anytime you want a cross-section of results instead of just the ones with the most evidence | +| DateMixer | Organizes results by `date_published`. 
Results with "unknown" for `date_published` are omitted | Use when you want date sorted results | +| DateNewItemsMixer | Organizes results as above, but hiding results that don't have the `new` field as [created during Search updates](Developer-Guide.md#update-a-search) | This is the default for `search.new_result_url` when `search.date` is set to `sort` | +| RoundRobinMixer | Organizes results by taking 1 result from each responding SearchProvider, alternating; actually calls `Stack1Mixer` (see below) | Good for searches with `search.sort` set to "date" or anytime you want a cross-section of results instead of just the ones with the most evidence | | Stack1Mixer | Organizes results by taking 1 result from each responding SearchProvider, alternating | Good for cross-sections of data | | Stack2Mixer | Organizes results by taking 2 from each responding SearchProvider, alternating | Good for cross-sections of data with 4-6 sources | -| Stack3Mixer | Organizes results by taking 3 from each responding SearchProvider, alternating | Good for cross-sections of data with few sources | -| StackNMixer | Organizes results by taking `N` from each responding source, where `N` if not specified is the number of results requested divided by the number of SearchProviders reporting at least 1 result | Good for cross-sections of data with few providers | +| Stack3Mixer | Organizes results by taking 3 from each responding SearchProvider, alternating | Good for cross-sections of data with few sources | +| StackNMixer | Organizes results by taking `N` from each responding source, where `N` if not specified is the number of results requested divided by the number of SearchProviders reporting at least 1 result | Good for cross-sections of data with few providers | ## Date Mixer @@ -1083,7 +1042,7 @@ For example: [http://localhost:8000/swirl/results?search_id=1&result_mixer=DateM ## NewItems Mixers -The two NewItems mixers automatically filter results to items with the `new` field present. 
Both will report the number of results hidden because they do not have this field. +The two NewItems mixers automatically filter results to items with the `new` field present. Both will report the number of results hidden because they do not have this field. To remove the `new` field from all results in a search, add `&mark_all_as_read=1` to the `result_mixer` URL property. For example: @@ -1101,7 +1060,7 @@ To invoke the mixer specified using the `result_mixer` property of the Search ob http://localhost:8000/swirl/results/?search_id=1 ``` -If you use the Swirl defaults, a search will produce a JSON result that is relevancy ranked. +If you use the Swirl defaults, a search will produce a JSON result that is relevancy ranked. To specify a different Mixer, add `&result_mixer=mixer-name` to the URL. @@ -1114,7 +1073,7 @@ The following table describes the Mixer wrapper in more detail: | Field | Description | | ---------- | ---------- | | messages | All messages from the Search and all SearchProviders | -| info | A dictionary of Found and Retrieved counts from each SearchProvider | +| info | A dictionary of Found and Retrieved counts from each SearchProvider | | info - search | Information about the Search, including the processed query, and links to re-run and re-score Searches | | info - results | Information about the Results, including the number retrieved and the URL of the next (and previous) pages of results | | results | Mixed Results from the specified Search | @@ -1123,10 +1082,10 @@ The following table describes the Mixer wrapper in more detail: ## Funding Data Set -The TechCrunch Continental USA funding data set was taken from [Insurity SpatialKey](https://support.spatialkey.com/spatialkey-sample-csv-data/). 
It is included with Swirl in [Data/funding_db.csv](https://github.com/swirlai/swirl-search/blob/main/Data/funding_db.csv) -This file was processed with [scripts/fix_csv.py](https://github.com/swirlai/swirl-search/blob/main/scripts/fix_csv.py) prior to loading into SQLite3. +The TechCrunch Continental USA funding data set was taken from [Insurity SpatialKey](https://support.spatialkey.com/spatialkey-sample-csv-data/). It is included with Swirl in [Data/funding_db.csv](https://github.com/swirlai/swirl-search/blob/main/Data/funding_db.csv) +This file was processed with [scripts/fix_csv.py](https://github.com/swirlai/swirl-search/blob/main/scripts/fix_csv.py) prior to loading into SQLite3. -### Loading into SQLite3 +### Loading into SQLite3 1. Activate [sqlite_web](Admin-Guide.md#sqlite-web) Then, from the swirl-home directory: @@ -1273,7 +1232,7 @@ Results should appear in the right-hand pane: "author" : "Phillip K Allen", "to" : "pallen70@hotmail.com", "subject" : "Investment Structure", - "content" : """---------------------- Forwarded by - + "content" : """---------------------- Forwarded by + ... ``` diff --git a/docs/User-Guide.md b/docs/User-Guide.md index 5f5fa4510..3748432f6 100644 --- a/docs/User-Guide.md +++ b/docs/User-Guide.md @@ -16,20 +16,20 @@ nav_order: 3 ## Intended Audience -This guide is intended for developers, data scientists, program managers, or anyone who wants to use Swirl, including searching and customizing SearchProviders. +This guide is intended for developers, data scientists, program managers, or anyone who wants to use Swirl, including searching and customizing SearchProviders. For background information on Swirl, please review the [Swirl Overview](index.md). # Terminology -| Word | Explanation | +| Word | Explanation | | ---------- | ---------- | | SearchProvider | An object defining a searchable source. It includes metadata identifying the type of connector used to search the source and more. 
| | Search | An object defining a query that a user or system desires to run. It includes the `query_string` with the actual text and metadata. Most of the metadata is optional.| | Query | Search engines distinguish between the act of searching and the terms used for searching, which are usually referred to as a query. Swirl follows this convention whenever possible but may refer to a search as a query at times. | | Subscribe | An important property of Search objects. When set to `true`, Swirl periodically reruns the search, specifying a date sort to get newer data, and removing duplicates from results.| | Connector | A Swirl module that can connect to, and query, a particular type of data source. Connectors are a wrapper around some existing Python package such as `request.get` or `elasticsearch`.| -| Relevancy Ranking | An estimation of the relative value of a given search engine result to the user's query, as compared to all others - to put it simply. For more information: [https://en.wikipedia.org/wiki/Relevance_(information_retrieval)](https://en.wikipedia.org/wiki/Relevance_(information_retrieval)) | +| Relevancy Ranking | An estimation of the relative value of a given search engine result to the user's query, as compared to all others - to put it simply. For more information: [https://en.wikipedia.org/wiki/Relevance_(information_retrieval)](https://en.wikipedia.org/wiki/Relevance_(information_retrieval)) | # Running a Search @@ -49,7 +49,7 @@ If the search page appears, click `Log Out` at the top, right. The Swirl login p ![Swirl Results Source Facet](images/swirl_results_source-galaxy_dark.png) -Swirl returns the best results from all available sources by default. To filter results by one or more sources, check one or more of the `Source` boxes as shown above. Results are instantly filtered to just those sources. +Swirl returns the best results from all available sources by default. 
To filter results by one or more sources, check one or more of the `Source` boxes as shown above. Results are instantly filtered to just those sources. Click `Clear All` to return to viewing all results. @@ -79,9 +79,9 @@ Click the Swirl logo (top left of the page) at any time to reset the Galaxy sear The following table summarizes the current Swirl search syntax options: -| Syntax | Handling | Notes | +| Syntax | Handling | Notes | | ---------- | ---------- | ---------- | -| AND, OR | Passed down to all SearchProviders | Swirl does not verify compliance | +| AND, OR | Passed down to all SearchProviders | Swirl does not verify compliance | | NOT, -term | Passed down to configured SearchProviders and rewritten if necessary; removed from the query for providers that don't support `NOT` or `-term` | Swirl verifies compliance; and also down-weights and flags responses that included NOT-ed terms | | tag:term | Passes `term` to the SearchProviders configured with it in their `tags` field. The untagged portion of the query is discarded. If `tag:` begins the query, then only providers with that Tag are searched. | Example: `electric vehicle company:tesla`
Only the term `tesla` will go to SearchProviders with the `company` Tag, so long as they are active.
Example: `company:facebook`
The query `facebook` will only go to SearchProviders with the `company` Tag. | @@ -212,17 +212,16 @@ Swirl includes five (5) Google Programmable Search Engines (PSEs) to get you up [SearchProvider Example JSON](https://github.com/swirlai/swirl-search/tree/main/SearchProviders) | SearchProvider | Description | Notes | -| ---------- | ---------- | ---------- | +| ---------- | ---------- | ---------- | | arxiv.json | Searches the [arXiv.org](https://arxiv.org/) repository of scientific papers | No authorization required | | asana.json | Searches Tasks in [Asana](https://asana.com/) | Requires an Asana personal access token | | atlassian.json | Searches Atlassian [Confluence Cloud](https://www.atlassian.com/software/confluence), [Jira Cloud](https://www.atlassian.com/software/jira), and [Trello](https://trello.com/) Cards. | Requires a bearer token and/or Trello API key; Confluence searches the [CQL `text~` content](https://developer.atlassian.com/server/confluence/performing-text-searches-using-cql/) and Jira searches the [JQL `text~` content](https://support.atlassian.com/jira-software-cloud/docs/what-is-advanced-searching-in-jira-cloud/#Advancedsearching-textPerformingtextsearches) | | blockchain-bitcoin.json | Searches [Blockchain.com](https://www.blockchain.com/) for specific Bitcoin Addresses (wallets) and Transactions IDs (hashes) | Requires a Blockchain.com API key | | chatgpt.json | ChatGPT AI chatbot | Requires an OpenAI API key | -| company_snowflake.json | Searches the [Snowflake](https://www.snowflake.com/en/) `FreeCompanyResearch` dataset | Requires a Snowflake username and password | | crunchbase.json | Searches organizations via the [Crunchbase](https://www.crunchbase.com/) basic API | Requires a Crunchbase.com API key | | document_db.json | SQLite3 document database | [documents_db.csv](https://github.com/swirlai/swirl-search/tree/main/Data/documents_db.csv) | | elastic_cloud.json | elasticsearch, cloud version | [Enron Email 
Dataset](Developer-Reference.md#enron-email-data-set) Requires cloud_id, credentials | -| elasticsearch.json | elasticsearch, local install | [Enron Email Dataset](Developer-Reference.md#enron-email-data-set) Requires host, port, credentials | +| elasticsearch.json | elasticsearch, local install | [Enron Email Dataset](Developer-Reference.md#enron-email-data-set) Requires host, port, credentials | | europe_pmc.json | Searches the [EuropePMC.org](https://europepmc.org/) repository of life-sciences literature | No authorization required | | funding_db_bigquery.json | BigQuery funding database | [Funding Dataset](Developer-Reference.md#funding-data-set) | | funding_db_postgres.json | PostgreSQL funding database | [Funding Dataset](Developer-Reference.md#funding-data-set) | @@ -231,17 +230,17 @@ Swirl includes five (5) Google Programmable Search Engines (PSEs) to get you up | google_news.json | Searches the [Google News](https://news.google.com/) feed | No authorization required | | google_pse.json | Five Google Programmable Search Engines (PSE) | Includes shared Swirl credentials; may return a 429 error if overused | | hacker_news.json | Queries a [searchable version](https://hn.algolia.com/) of the Hacker News feeds | No authorization required | -| http_get_with_auth.json | Generic HTTP GET query with basic authentication | Requires url, credentials | +| http_get_with_auth.json | Generic HTTP GET query with basic authentication | Requires url, credentials | | http_post_with_auth.json | Generic HTTP POST query with basic authentication | Requires url, credentials | -| hubspot.json | Searches the HubSpot CRM for Companies, Contacts, and Deals | Requires a bearer token | +| hubspot.json | Searches the HubSpot CRM for Companies, Contacts, and Deals | Requires a bearer token | | internet_archive.json | Searches the [Internet Archive Library](https://archive.org/) of items | No authorization required | -| littlesis.json | Searches the free 
[LittleSis.org](https://littlesis.org/) database of "who-knows-who at the heights of business and government" | No authorization required | +| littlesis.json | Searches the free [LittleSis.org](https://littlesis.org/) database of "who-knows-who at the heights of business and government" | No authorization required | | microsoft.json | Searches M365 Outlook Messages, Calendar Events, OneDrive Files, SharePoint Sites, and Teams Chat | See the [M365 Guide](M365-Guide.md) for details | | miro.json | [Miro.com](https://miro.com) drawing service | Requires a bearer token | | movies_mongodb.json | Searches the [Mongodb Atlas](https://www.mongodb.com/) `sample_mflix` collection, `movies` sample table | Requires database username and password, plus Atlas cluster URL | -| newsdata_io.json | Newsdata.io internet news source | Requires username and password
archive provider also included | +| newsdata_io.json | Newsdata.io internet news source | Requires username and password
archive provider also included |
| nlresearch.json | NLResearch.com is a premium and internet content search engine from [Northern Light](https://northernlight.com/) | Requires username and password |
-| open_sanctions.json | Searches the [OpenSanctions.org](https://www.opensanctions.org/) database of sanctions targets and persons of interest | Requires and OpenSanctions API key |
+| open_sanctions.json | Searches the [OpenSanctions.org](https://www.opensanctions.org/) database of sanctions targets and persons of interest | Requires an OpenSanctions API key |
| opensearch.json | OpenSearch 2.x | [Developer Guide](Developer-Reference.md#elastic--opensearch) |
| oracle.json | Tested against [Oracle](https://www.oracle.com/) 23c Free (and presumably supporting earlier versions) | Requires Oracle username and password |
| preloaded.json | All preloaded SearchProviders | Defaults in the Swirl distribution |
@@ -277,7 +276,7 @@ Swirl includes five (5) Google Programmable Search Engines (PSEs) to get you up

* A new Google PSE SearchProvider that targets the [new Swirl documentation website](https://docs.swirl.today/) is included and enabled by default.
* The EuropePMC SearchProvider is preloaded, set to active status, and configured to participate in Retrieval Augmented Generation (RAG) by default.
-* As of Release 3.1.0, Swirl includes SearchProviders for [Asana](https://asana.com/) Tasks, [Atlassian Trello](https://trello.com/) Cards, [Internet Archive Library](https://archive.org/) items, [Mongodb Atlas](https://www.mongodb.com/), [Oracle](https://www.oracle.com/) (WIP), and [Snowflake](https://www.snowflake.com/en/).
+* As of Release 3.1.0, Swirl includes SearchProviders for [Asana](https://asana.com/) Tasks, [Atlassian Trello](https://trello.com/) Cards, [Internet Archive Library](https://archive.org/) items, [Mongodb Atlas](https://www.mongodb.com/), [Oracle](https://www.oracle.com/) (WIP). 
* As of Release 3.2.0, Swirl includes SearchProviders for [LittleSis.org](https://littlesis.org/) and [OpenSanctions.org](https://www.opensanctions.org/) entity searching. @@ -308,7 +307,7 @@ If you have the raw JSON of SearchProvider, install it by copying/pasting into t 3. Paste one SearchProvider's JSON at a time into the form and press the `POST` button 4. Swirl will respond with the finished SearchProvider -As of Swirl 3.2.0, you can copy/paste lists of SearchProviders into the endpoint, and Swirl will load them all. +As of Swirl 3.2.0, you can copy/paste lists of SearchProviders into the endpoint, and Swirl will load them all. ## Bulk Loading @@ -327,7 +326,7 @@ python swirl_load.py SearchProviders/provider-name.json -u admin -p your-admin-p ## Editing -Edit any SearchProvider by adding the `id` to the end of the `/swirl/searchproviders` URL. +Edit any SearchProvider by adding the `id` to the end of the `/swirl/searchproviders` URL. For example: `http://localhost:8000/swirl/searchproviders/1/` @@ -336,7 +335,7 @@ For example: `http://localhost:8000/swirl/searchproviders/1/` From here, you can use the form at the bottom of the page to: * DELETE this SearchProvider, forever -* Edit the configuration of the SearchProvider and `PUT` the changes +* Edit the configuration of the SearchProvider and `PUT` the changes ## Query Templating @@ -346,7 +345,7 @@ Most SearchProviders require a `query_template`. This is usually bound to `query "query_template": "{'$text': {'$search': '{query_string}'}}", ``` -This format is not actually JSON, but rather a string. The single quotes are required, so that the JSON can use double quotes. +This format is not actually JSON, but rather a string. The single quotes are required, so that the JSON can use double quotes. As of Swirl 3.2.0, MongoDB all use the new `query_template_json` field, which stores the template as JSON. 
For example, here is the new MongoDB `query_template_json`: @@ -372,18 +371,18 @@ The suggestion is that SearchProviders who are good for most any search be left ## Query Mappings -SearchProvider `query_mappings` are key/value pairs that define how to query a given SearchProvider. +SearchProvider `query_mappings` are key/value pairs that define how to query a given SearchProvider. They include field mappings and configurations that Swirl's processors (like the `AdaptiveQueryProcessor`) use to align the query with each SearchProvider's capabilities. The following table summarizes the current `query_mappings` options: -| Mapping Format | Meaning | Example | +| Mapping Format | Meaning | Example | | ---------- | ---------- | ---------- | | key = value | Replace `key` with `value` if the `key` is enclosed in braces in the `provider.query_template`. | ```"query_template": "{url}?cx={cx}&key={key}&q={query_string}","query_mappings": "cx=google-pse-key"``` | -| DATE_SORT=url-snippet | This identifies the string to insert into the URL for this SearchProvider if date sorting is specified in the search object. | `"query_mappings": "DATE_SORT=sort=date"` | -| RELEVANCY_SORT=url-snippet | This identifies the string to insert into the URL for this SearchProvider if relevancy sorting is specified in the search object. | `"query_mappings": "RELEANCY_SORT=sort=relevancy"` | -| PAGE=url-snippet | This identifies the string to insert into the URL for this SearchProvider for paging support. The specification should include either Swirl variable `RESULT_INDEX` or `RESULT_PAGE` which will be the result number (e.g. 11) or page number (e.g. 2) | `"query_mappings": "PAGE=start=RESULT_INDEX"` | +| DATE_SORT=url-snippet | This identifies the string to insert into the URL for this SearchProvider if date sorting is specified in the search object. 
| `"query_mappings": "DATE_SORT=sort=date"` | +| RELEVANCY_SORT=url-snippet | This identifies the string to insert into the URL for this SearchProvider if relevancy sorting is specified in the search object. | `"query_mappings": "RELEANCY_SORT=sort=relevancy"` | +| PAGE=url-snippet | This identifies the string to insert into the URL for this SearchProvider for paging support. The specification should include either Swirl variable `RESULT_INDEX` or `RESULT_PAGE` which will be the result number (e.g. 11) or page number (e.g. 2) | `"query_mappings": "PAGE=start=RESULT_INDEX"` | | NOT=True | If present, this SearchProvider supports simple, single NOT operators | elon musk NOT twitter | | NOT_CHAR=- | If present, this SearchProvider supports `-term` NOT operators | elon musk -twitter | @@ -400,7 +399,7 @@ For `query_mappings`, keys that appear in the `query_template` wrapped in braces "query_mappings": "cx=0c38029ddd002c006,DATE_SORT=sort=date,PAGE=start=RESULT_INDEX", ``` -At federation time, this becomes the following URL: +At federation time, this becomes the following URL: ``` shell https://www.googleapis.com/customsearch/v1?cx=0c38029ddd002c006&q=some_query_string @@ -463,7 +462,7 @@ The `credentials` property stores any required authentication information for th ### key=value format -This credential is bound to the URL that is used to execute searches. +This credential is bound to the URL that is used to execute searches. For example, from a Google PSE: @@ -492,7 +491,7 @@ X-Api-Keys are supported by the `RequestsGet` and `RequestsPost` connectors. The ### HTTPBasicAuth, HTTPDigestAuth, HTTPProxyAuth -These methods are supported by the `RequestsGet`, `ElasticSearch` and `OpenSearch` connectors. +These methods are supported by the `RequestsGet`, `ElasticSearch` and `OpenSearch` connectors. 
For example, from the [Solr with Auth SearchProvider](https://github.com/swirlai/swirl-search/blob/main/SearchProviders/solr_with_auth.json): @@ -516,7 +515,7 @@ Here is the `response_mappings` from a Google PSE: The following table summarizes the `response_mappings` options: -| Mapping | Source_JSONPath | Required? | Example | +| Mapping | Source_JSONPath | Required? | Example | | ---------- | ---------- | ---------- | ---------- | | FOUND | Number of results for a given query, for this SearchProvider, e.g. 1,413
Same as `RETRIEVED` if not specified | No | `searchInformation.totalResults=FOUND` | | RETRIEVED | Number of results returned for a given query, for this SearchProvider, e.g. 10
Length of the `RESULTS` list (see below) if not specified | No | `queries.request[0].count=RETRIEVED` | @@ -568,7 +567,7 @@ Swirl will automatically convert this format to a JSON array of dicts, with the ### Multiple Mappings -As of version 1.6, Swirl can map multiple SearchProvider fields to a single Swirl field, aggregating multiple responses in the PAYLOAD field as necessary. +As of version 1.6, Swirl can map multiple SearchProvider fields to a single Swirl field, aggregating multiple responses in the PAYLOAD field as necessary. For example: @@ -602,8 +601,8 @@ If only one field, `content` or `description`, are populated for a response, the The following table explains the `result_mappings` options: -| Mapping Format | Meaning | Example | -| ---------- | ---------- | ---------- | +| Mapping Format | Meaning | Example | +| ---------- | ---------- | ---------- | | swirl_key = source_key | This maps a key from the source provider's result list to Swirl's result list. The `source_key` may be a JSON path. | `body=_source.email` | | swirl_key = source_key1\|source_key2\|source_keyN | This maps multiple keys from the source provider's result list to Swirl's result list; as [noted above](#multiple-mappings) the first populated field is mapped and the rest are copied to the PAYLOAD | `body=content\|description,...` | | swirl_key='template {variable} etc' | This allows any number of source provider result fields to be turned into a string that is then copied to a Swirl field (like `body`) or the PAYLOAD. Commas (,) are not supported in the string at this time. | `'{x}: {y}'=title` | @@ -612,7 +611,7 @@ The following table explains the `result_mappings` options: | sw_btcconvert | An optional directive which will convert the provided Satoshi value to Bitcoin; it can be used anyplace in the template such as `result_mappings` | `sw_btcconvert()` | | NO_PAYLOAD | By default, Swirl copies all result keys from the SearchProvider to the PAYLOAD. 
If `NO_PAYLOAD` is specified, Swirl copies only the explicitly mapped fields.| `NO_PAYLOAD` | | FILE_SYSTEM | If specified, Swirl will assume that this SearchProvider is a file system and weight matches against the `body` higher. | `FILE_SYSTEM` | -| LC_URL | If specified, Swirl will convert the `url` field to lower case. | `LC_URL` | +| LC_URL | If specified, Swirl will convert the `url` field to lower case. | `LC_URL` | | BLOCK | As of Release 3.1.0, this feature is used exclusively by Swirl's RAG processing; that output appears in this `info` block of the Result object. | `BLOCK=ai_summary` | #### Date Published Display @@ -636,7 +635,7 @@ The `json_result` schema for each result in the Result list is defined by the `c [Result mixers](Developer-Reference.md#mixers-1) further manipulate and re-organize the data from multiple results. -The Result schema can be seen in [`swirl/models.py`](https://github.com/swirlai/swirl-search/tree/main/swirl/models.py) +The Result schema can be seen in [`swirl/models.py`](https://github.com/swirlai/swirl-search/tree/main/swirl/models.py) ## PAYLOAD Field diff --git a/requirements.txt b/requirements.txt index fbfee0411..ef0020813 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,7 @@ amqp==5.2.0 annotated-types==0.6.0 -anyio==4.2.0 +anyio==4.3.0 asgiref==3.7.2 -asn1crypto==1.5.1 attrs==23.2.0 autobahn==23.6.2 Automat==22.10.0 @@ -13,7 +12,7 @@ bs4==0.0.2 cachetools==5.3.2 catalogue==2.0.10 celery==5.3.6 -certifi==2023.11.17 +certifi==2024.2.2 cffi==1.16.0 channels==4.0.0 channels-redis==4.2.0 @@ -28,35 +27,33 @@ confection==0.1.4 constantly==23.10.4 coreapi==2.3.3 coreschema==0.0.4 -cryptography==41.0.7 +cryptography==42.0.4 cssselect==1.2.0 cymem==2.0.8 -daphne==4.0.0 +daphne==4.1.0 distro==1.9.0 -Django==5.0.1 +Django==5.0.2 django-celery-beat==2.1.0 django-environ==0.11.2 django-rest-swagger==2.2.0 django-restframework==0.0.1 django-timezone-field==4.2.3 djangorestframework==3.14.0 -dnspython==2.5.0 
+dnspython==2.6.1 docutils==0.20.1 drf-yasg==1.21.7 elastic-transport==8.12.0 elasticsearch==8.12.0 -en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl#sha256=ab70aeb6172cde82508f7739f35ebc9918a3d07debeed637403c8f794ba3d3dc -filelock==3.13.1 -google-api-core==2.16.1 -google-auth==2.27.0 -google-cloud-bigquery==3.17.1 +google-api-core==2.17.1 +google-auth==2.28.1 +google-cloud-bigquery==3.17.2 google-cloud-core==2.4.1 google-crc32c==1.5.0 google-resumable-media==2.7.0 googleapis-common-protos==1.62.0 h11==0.14.0 -httpcore==1.0.2 -httpx==0.26.0 +httpcore==1.0.4 +httpx==0.27.0 hyperlink==21.0.0 idna==3.6 incremental==22.10.0 @@ -68,37 +65,36 @@ jsonpath-ng==1.6.1 kombu==5.3.5 langcodes==3.3.0 lxml==5.1.0 -MarkupSafe==2.1.4 +MarkupSafe==2.1.5 msal==1.26.0 msgpack==1.0.7 murmurhash==1.0.10 natsort==8.4.0 nltk==3.8.1 -numpy==1.26.3 -openai==1.6.1 +numpy==1.26.4 +openai==1.12.0 openapi-codec==1.3.2 opensearch-py==2.4.2 oracledb==2.0.1 packaging==23.2 pika==1.3.2 -platformdirs==3.11.0 ply==3.11 preshed==3.0.9 prompt-toolkit==3.0.43 -protobuf==4.25.2 +protobuf==4.25.3 psycopg2==2.9.9 pyahocorasick==2.0.0 pyasn1==0.5.1 pyasn1-modules==0.3.0 pycparser==2.21 -pydantic==2.6.0 -pydantic_core==2.16.1 +pydantic==2.6.1 +pydantic_core==2.16.2 PyJWT==2.8.0 -pymongo==4.6.1 -pyOpenSSL==23.3.0 +pymongo==4.6.2 +pyOpenSSL==24.0.0 python-crontab==3.0.0 python-dateutil==2.8.2 -pytz==2023.4 +pytz==2024.1 PyYAML==6.0.1 readability-lxml==0.8.1 redis==5.0.1 @@ -106,37 +102,34 @@ regex==2023.12.25 requests==2.31.0 rsa==4.9 service-identity==24.1.0 -setuptools==69.0.3 +setuptools==69.1.0 simplejson==3.19.2 six==1.16.0 smart-open==6.4.0 sniffio==1.3.0 -snowflake-connector-python==3.7.0 -sortedcontainers==2.4.0 soupsieve==2.5 -spacy==3.7.2 +spacy==3.7.4 spacy-legacy==3.0.12 spacy-loggers==1.0.5 sqlparse==0.4.4 srsly==2.4.8 statistics==1.0.3.5 -textblob==0.17.1 -thinc==8.2.2 +textblob==0.18.0.post0 
+thinc==8.2.3 tika==2.6.0 -tiktoken==0.5.2 -tomlkit==0.12.3 -tqdm==4.66.1 +tiktoken==0.6.0 +tqdm==4.66.2 Twisted==23.10.0 txaio==23.1.1 typer==0.9.0 typing_extensions==4.9.0 -tzdata==2023.4 +tzdata==2024.1 uritemplate==4.1.1 -urllib3==2.2.0 +urllib3==2.2.1 vine==5.1.0 wasabi==1.1.2 wcwidth==0.2.13 weasel==0.3.4 whitenoise==6.6.0 xmltodict==0.13.0 -zope.interface==6.1 +zope.interface==6.2 diff --git a/swirl/connectors/__init__.py b/swirl/connectors/__init__.py index 5d26c5bd9..51c367d9e 100644 --- a/swirl/connectors/__init__.py +++ b/swirl/connectors/__init__.py @@ -16,7 +16,6 @@ from swirl.connectors.microsoft_graph import M365SharePointSites from swirl.connectors.microsoft_graph import MicrosoftTeams from swirl.connectors.mongodb import MongoDB -from swirl.connectors.snowflake import Snowflake from swirl.connectors.oracle import Oracle # uncomment the line below to enable PostgreSQL diff --git a/swirl/connectors/db_connector.py b/swirl/connectors/db_connector.py index 990ebaeba..98625139b 100644 --- a/swirl/connectors/db_connector.py +++ b/swirl/connectors/db_connector.py @@ -120,7 +120,7 @@ def validate_query(self, session=None): return False return True - + ######################################## def normalize_response(self): @@ -136,7 +136,7 @@ def normalize_response(self): if not self.response: # assume the connector took care of it return - + rows = self.response trimmed_rows = [] @@ -146,7 +146,7 @@ def normalize_response(self): n_field = 0 if self.column_names: for field in column_names: - # to handle None columns e.g. 
Snowflake + # to handle None columns if row[n_field]: dict_row[field] = row[n_field] else: @@ -168,4 +168,3 @@ def normalize_response(self): self.retrieved = retrieved self.results = trimmed_rows return - diff --git a/swirl/connectors/snowflake.py b/swirl/connectors/snowflake.py deleted file mode 100644 index 9515eec55..000000000 --- a/swirl/connectors/snowflake.py +++ /dev/null @@ -1,92 +0,0 @@ -''' -@author: Sid Probstein -@contact: sid@swirl.today -''' - -from sys import path -from os import environ - -import snowflake.connector -from snowflake.connector import ProgrammingError - -import json - -import django - -from swirl.utils import swirl_setdir -path.append(swirl_setdir()) # path to settings.py file -environ.setdefault('DJANGO_SETTINGS_MODULE', 'swirl_server.settings') -django.setup() - -from celery.utils.log import get_task_logger -from logging import DEBUG -logger = get_task_logger(__name__) -# logger.setLevel(DEBUG) - -from swirl.connectors.db_connector import DBConnector -from swirl.connectors.utils import bind_query_mappings - -######################################## -######################################## - -class Snowflake(DBConnector): - - type = "Snowflake" - - ######################################## - - def execute_search(self, session=None): - - logger.debug(f"{self}: execute_search()") - - if self.provider.credentials: - if ':' in self.provider.credentials: - credlist = self.provider.credentials.split(':') - if len(credlist) == 4: - username = credlist[0] - password = credlist[1] - database = credlist[2] - warehouse = credlist[3] - else: - self.warning("Invalid credentials, should be: username:password:database:warehouse") - else: - self.warning("No credentials!") - account = self.provider.url - - try: - # Create a new connection - conn = snowflake.connector.connect(user=username, password=password, account=account) - cursor = conn.cursor() - cursor.execute(f"USE WAREHOUSE {warehouse}") - cursor.execute(f"USE DATABASE {database}") - - 
cursor.execute(self.count_query) - count_result = cursor.fetchone() - found = count_result[0] if count_result else 0 - if found == 0: - self.message(f"Retrieved 0 of 0 results from: {self.provider.name}") - self.status = 'READY' - self.found = 0 - self.retrieved = 0 - return - - cursor.execute(self.query_to_provider) - self.column_names = [col[0].lower() for col in cursor.description] - results = cursor.fetchall() - - except ProgrammingError as err: - self.error(f"{err} querying {self.type}") - self.status = 'ERR' - cursor.close() - conn.close() - return - - self.response = list(results) - - cursor.close() - conn.close() - - self.found = found - self.retrieved = self.provider.results_per_query - return - diff --git a/swirl/models.py b/swirl/models.py index a5576d9b0..94d756d85 100644 --- a/swirl/models.py +++ b/swirl/models.py @@ -82,8 +82,7 @@ class SearchProvider(models.Model): ('M365SharePointSites', 'M365 SharePoint Sites'), ('MicrosoftTeams', 'Microsoft Teams'), ('MongoDB', 'MongoDB'), - ('Oracle','Oracle'), - ('Snowflake','Snowflake') + ('Oracle','Oracle') ] connector = models.CharField(max_length=200, default='RequestsGet', choices=CONNECTOR_CHOICES) url = models.CharField(max_length=2048, default=str, blank=True) From a45d87808380ea043bcbe02265b9b6098bfd71bf Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Sun, 25 Feb 2024 13:17:23 -0500 Subject: [PATCH 08/44] updates to main branch automated testing workflows to use new docker image, etc. 
--- ...api-tests.yml => integrated-api-tests.yml} | 26 +++++----- .github/workflows/regression-tests.yml | 52 ------------------- .github/workflows/smoke-tests.yml | 26 +++++----- .github/workflows/unit-tests.yml | 20 +++---- 4 files changed, 36 insertions(+), 88 deletions(-) rename .github/workflows/{integration-api-tests.yml => integrated-api-tests.yml} (78%) delete mode 100644 .github/workflows/regression-tests.yml diff --git a/.github/workflows/integration-api-tests.yml b/.github/workflows/integrated-api-tests.yml similarity index 78% rename from .github/workflows/integration-api-tests.yml rename to .github/workflows/integrated-api-tests.yml index 25188ceb4..358c4e9f3 100644 --- a/.github/workflows/integration-api-tests.yml +++ b/.github/workflows/integrated-api-tests.yml @@ -1,8 +1,8 @@ -name: IntegrationAPITests +name: Integrated API Tests on: workflow_run: - workflows: [SmokeTests] + workflows: [Smoke Tests] types: - completed # Allows you to run this workflow manually from the Actions tab @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest steps: - - name: Download branch and run_id artifacts + - name: Download Branch and run_id Artifacts uses: dawidd6/action-download-artifact@v3 with: github_token: ${{ secrets.GITHUB_TOKEN }} @@ -22,7 +22,7 @@ jobs: name: branch-info-${{ github.event_name == 'workflow_run' && github.event.workflow_run.id || github.run_id }} path: ./artifacts continue-on-error: true # Allow the step to fail without stopping the workflow - - name: Determine branch for checkout + - name: Determine Branch for Checkout id: determine_branch run: | if [[ -f ./artifacts/branch.txt && -f ./artifacts/run_id.txt ]]; then @@ -32,23 +32,23 @@ jobs: BRANCH_NAME=$(echo $GITHUB_REF | cut -d "/" -f 3) echo "branch=$BRANCH_NAME" >> $GITHUB_ENV fi - - name: Print branch to be checked out + - name: Print Branch to be Checked Out run: | echo "Branch to checkout: ${{ env.branch }}" - - name: Checkout the code + - name: Checkout the Code uses: actions/checkout@v4 
with: ref: ${{ env.branch }} - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.12' + python-version: '3.12.2' cache: 'pip' # caching pip stuff - name: Install Swirl run: ./install.sh - name: Update apt run: sudo apt -o Acquire::Retries=3 update - - name: Upgrade Ubuntu to latest patches + - name: Upgrade Ubuntu to Latest Patches run: sudo apt upgrade -y - name: Install redis-server run: sudo apt install -y redis-server @@ -56,21 +56,21 @@ jobs: run: python swirl.py setup - name: Start Swirl run: python swirl.py start - - name: Run integrated API tests - run: docker run --net=host -t swirlai/swirl-testing:latest-integrated-api sh -c "behave --tags=integrated_api" - - name: Ensure artifacts directory exists and write branch and run_id again + - name: Run Integrated API Tests + run: docker run --net=host -t swirlai/swirl-search-qa:automated-tests sh -c "behave --tags=integrated_api" + - name: Ensure Artifacts Directory Exists and WriteBbranch and run_id Again run: | mkdir -p ./artifacts echo "${{ env.branch }}" > ./artifacts/branch.txt echo "${{ env.original_run_id }}" > ./artifacts/run_id.txt - - name: Re-upload branch and run_id for subsequent workflows + - name: Re-upload Branch and run_id for Subsequent Workflows uses: actions/upload-artifact@v4 with: name: branch-info-${{ github.run_id }} path: | ./artifacts/branch.txt ./artifacts/run_id.txt - - name: Upload log files + - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/regression-tests.yml b/.github/workflows/regression-tests.yml deleted file mode 100644 index c6ee87b43..000000000 --- a/.github/workflows/regression-tests.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: RegressionTests - -on: - # Allows you to run this workflow manually from the Actions tab - workflow_dispatch: - -jobs: - - build: - runs-on: ubuntu-latest - - steps: - - name: Checkout the code - uses: actions/checkout@v4 - - name: Set up Python - uses: 
actions/setup-python@v5 - with: - python-version: '3.12' - cache: 'pip' # caching pip stuff - - name: Install Swirl - run: ./install.sh - - name: Update apt - run: sudo apt -o Acquire::Retries=3 update - - name: Upgrade Ubuntu to latest patches - run: sudo apt upgrade -y - - name: Install redis-server - run: sudo apt install -y redis-server - - name: Set up Swirl - run: python swirl.py setup - - name: Start Swirl - run: python swirl.py start - - - name: Checkout the QA Repo - uses: actions/checkout@v4 - with: - repository: 'swirlai/swirl-search-qa' - token: ${{ secrets.QA_TEST_REPO_TOKEN }} # Make sure this token has the correct permissions - path: swirl-search-qa - - name: Run Regression Tests - working-directory: ./path/to/directory - run: | - pip install -r --no-cache-dir requirements.txt - behave --no-capture --verbose - - - name: Upload log files - if: always() - uses: actions/upload-artifact@v4 - with: - name: log-files - path: | - logs/ - /var/log/syslog* diff --git a/.github/workflows/smoke-tests.yml b/.github/workflows/smoke-tests.yml index bc5411dc6..efaa66260 100644 --- a/.github/workflows/smoke-tests.yml +++ b/.github/workflows/smoke-tests.yml @@ -1,8 +1,8 @@ -name: SmokeTests +name: Smoke Tests on: workflow_run: - workflows: [UnitTests] + workflows: [Unit Tests] types: - completed # Allows you to run this workflow manually from the Actions tab @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest steps: - - name: Download branch and run_id artifacts + - name: Download Branch and run_id Artifacts uses: dawidd6/action-download-artifact@v3 with: github_token: ${{ secrets.GITHUB_TOKEN }} @@ -23,7 +23,7 @@ jobs: name: branch-info-${{ github.event_name == 'workflow_run' && github.event.workflow_run.id || github.run_id }} path: ./artifacts continue-on-error: true # Allow the step to fail without stopping the workflow - - name: Determine branch for checkout + - name: Determine Branch for Checkout id: determine_branch run: | if [[ -f ./artifacts/branch.txt && -f 
./artifacts/run_id.txt ]]; then @@ -33,23 +33,23 @@ jobs: BRANCH_NAME=$(echo $GITHUB_REF | cut -d "/" -f 3) echo "branch=$BRANCH_NAME" >> $GITHUB_ENV fi - - name: Print branch to be checked out + - name: Print Branch to be Checked Out run: | echo "Branch to checkout: ${{ env.branch }}" - - name: Checkout the code + - name: Checkout the Code uses: actions/checkout@v4 with: ref: ${{ env.branch }} - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.12' + python-version: '3.12.2' cache: 'pip' # caching pip stuff - name: Install Swirl run: ./install.sh - name: Update apt run: sudo apt -o Acquire::Retries=3 update - - name: Upgrade Ubuntu to latest patches + - name: Upgrade Ubuntu to Latest Patches run: sudo apt upgrade -y - name: Install redis-server run: sudo apt install -y redis-server @@ -57,21 +57,21 @@ jobs: run: python swirl.py setup - name: Start Swirl run: python swirl.py start - - name: Run smoke tests - run: docker run --net=host -t swirlai/swirl-testing:latest-smoke-test sh -c "behave **/docker_container/*.feature --tags=docker_api_smoke" - - name: Ensure artifacts directory exists and write branch and run_id again + - name: Run Smoke Tests + run: docker run --net=host -t swirlai/swirl-search-qa:automated-tests sh -c "behave --tags=docker_api_smoke" + - name: Ensure Artifacts Directory Exists and Write Branch and run_id Again run: | mkdir -p ./artifacts echo "${{ env.branch }}" > ./artifacts/branch.txt echo "${{ env.original_run_id }}" > ./artifacts/run_id.txt - - name: Re-upload branch and run_id for subsequent workflows + - name: Re-upload Branch and run_id for Subsequent Workflows uses: actions/upload-artifact@v4 with: name: branch-info-${{ github.run_id }} path: | ./artifacts/branch.txt ./artifacts/run_id.txt - - name: Upload log files + - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 856e63524..124292cd3 100644 
--- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -1,4 +1,4 @@ -name: UnitTests +name: Unit Tests on: push: @@ -19,38 +19,38 @@ jobs: runs-on: ubuntu-latest steps: - - name: Checkout the code + - name: Checkout the Code uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.12' + python-version: '3.12.2' cache: 'pip' # caching pip stuff - name: Install Swirl run: ./install.sh - - name: Install pytest unit tests + - name: Install pytest Unit Tests run: ./install-test.sh - - name: Run pytest unit tests + - name: Run pytest Unit Tests run: pytest - - name: Create artifacts directory + - name: Create Artifacts Directory run: mkdir -p artifacts - - name: Set branch name + - name: Set Branch Name id: extract_branch run: | BRANCH_NAME=$(echo $GITHUB_REF | cut -d "/" -f 3) echo "branch=$BRANCH_NAME" >> $GITHUB_ENV - - name: Write branch and run_id to file + - name: Write Branch and run_id to File run: | echo "${{ env.branch }}" > ./artifacts/branch.txt echo "${{ github.run_id }}" > ./artifacts/run_id.txt - - name: Upload branch and run_id files as artifact + - name: Upload Branch and run_id Files as Artifacts uses: actions/upload-artifact@v4 with: name: branch-info-${{ github.run_id }} path: | ./artifacts/branch.txt ./artifacts/run_id.txt - - name: Upload log files + - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 with: From 88b0139ad6f8dcc1960fa9e8a7c619c583ed0446 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Sun, 25 Feb 2024 14:35:35 -0500 Subject: [PATCH 09/44] add docker login step to update smoke and integrated test workflows on main --- .github/workflows/integrated-api-tests.yml | 5 +++++ .github/workflows/smoke-tests.yml | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/integrated-api-tests.yml b/.github/workflows/integrated-api-tests.yml index 358c4e9f3..494b119e8 100644 --- a/.github/workflows/integrated-api-tests.yml +++ 
b/.github/workflows/integrated-api-tests.yml @@ -56,6 +56,11 @@ jobs: run: python swirl.py setup - name: Start Swirl run: python swirl.py start + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} - name: Run Integrated API Tests run: docker run --net=host -t swirlai/swirl-search-qa:automated-tests sh -c "behave --tags=integrated_api" - name: Ensure Artifacts Directory Exists and WriteBbranch and run_id Again diff --git a/.github/workflows/smoke-tests.yml b/.github/workflows/smoke-tests.yml index efaa66260..e9b9cc9dd 100644 --- a/.github/workflows/smoke-tests.yml +++ b/.github/workflows/smoke-tests.yml @@ -57,6 +57,11 @@ jobs: run: python swirl.py setup - name: Start Swirl run: python swirl.py start + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} - name: Run Smoke Tests run: docker run --net=host -t swirlai/swirl-search-qa:automated-tests sh -c "behave --tags=docker_api_smoke" - name: Ensure Artifacts Directory Exists and Write Branch and run_id Again From 65b0fef8cd5212b4b2074a0733c7314d5e29655e Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Mon, 26 Feb 2024 20:00:21 -0500 Subject: [PATCH 10/44] switch main to new QA Suite testing workflow; update and clean up other workflow files --- .../docker-image-spg-experimental.yml | 4 +- .../workflows/docker-image-spg-preview.yml | 4 +- .github/workflows/docker-image-spg.yml | 6 +- .github/workflows/docker-image.yml | 10 +-- .github/workflows/integrated-api-tests.yml | 85 ------------------- .github/workflows/pages.yml | 10 +-- .../{smoke-tests.yml => qa-suite.yml} | 10 +-- .github/workflows/sectest-docker-image.yml | 6 +- .github/workflows/spell-checker.yml | 2 +- .github/workflows/unit-tests.yml | 2 +- .github/workflows/urls-checker.yml | 4 +- 11 files changed, 28 insertions(+), 115 deletions(-) delete mode 100644 
.github/workflows/integrated-api-tests.yml rename .github/workflows/{smoke-tests.yml => qa-suite.yml} (95%) diff --git a/.github/workflows/docker-image-spg-experimental.yml b/.github/workflows/docker-image-spg-experimental.yml index 8b7abd157..718507199 100644 --- a/.github/workflows/docker-image-spg-experimental.yml +++ b/.github/workflows/docker-image-spg-experimental.yml @@ -1,4 +1,4 @@ -name: EXPERIMENT SpyglassDockerBuild +name: Experimental Spyglass Docker Build on: # Allows you to run this workflow manually from the Actions tab @@ -38,7 +38,7 @@ jobs: run: docker build --no-cache -t swirlai/spyglass:fork-x -f Dockerfile.fork.spg . - name: Push the Docker Image run: docker push swirlai/spyglass:fork-x - - name: Upload log files + - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/docker-image-spg-preview.yml b/.github/workflows/docker-image-spg-preview.yml index 90d727a55..0dda2030f 100644 --- a/.github/workflows/docker-image-spg-preview.yml +++ b/.github/workflows/docker-image-spg-preview.yml @@ -1,4 +1,4 @@ -name: PREVIEW Spyglass Docker Build +name: Preview Spyglass Docker Build on: # Allows you to run this workflow manually from the Actions tab @@ -37,7 +37,7 @@ jobs: run: docker build --no-cache -t swirlai/spyglass:preview -f Dockerfile.develop.spg . 
- name: Push the Docker Image run: docker push swirlai/spyglass:preview - - name: Upload log files + - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/docker-image-spg.yml b/.github/workflows/docker-image-spg.yml index 10d23852b..6c6dd71e3 100644 --- a/.github/workflows/docker-image-spg.yml +++ b/.github/workflows/docker-image-spg.yml @@ -1,4 +1,4 @@ -name: LatestSpyglassDockerBuild +name: Latest Spyglass Docker Build on: # Allow manual run of this workflow from the Actions tab @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest steps: - - name: Checkout the code + - name: Checkout the Code uses: actions/checkout@v4 - name: Login to Docker Hub uses: docker/login-action@v3 @@ -37,7 +37,7 @@ jobs: run: docker build --no-cache -t swirlai/spyglass:latest -f Dockerfile.spg . - name: Push the Docker Image run: docker push swirlai/spyglass - - name: Upload log files + - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 2e1c289a1..82cf6e6dc 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -1,4 +1,4 @@ -name: DockerBuild +name: Docker Build on: # Allows manual run of this workflow from the Actions tab (on any branch) @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest steps: - - name: Checkout the code + - name: Checkout the Code uses: actions/checkout@v4 - name: Login to Docker Hub uses: docker/login-action@v3 @@ -34,19 +34,17 @@ jobs: docker system prune -af docker volume prune -f docker builder prune -f - - name: Pull Latest UI Image - run: docker pull swirlai/spyglass:latest - name: Builder Bootstrap run: docker buildx create --name devBuilder --use --bootstrap - name: Build the Docker Image run: docker buildx build -t swirlai/swirl-search:latest --platform linux/amd64,linux/arm64 --push . 
- - name: Update repo description + - name: Update the Docker Repo Description uses: peter-evans/dockerhub-description@v4 with: username: ${{ secrets.DOCKER_USERNAME_X }} password: ${{ secrets.DOCKER_PASSWORD_X }} repository: swirlai/swirl-search - - name: Upload log files + - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/integrated-api-tests.yml b/.github/workflows/integrated-api-tests.yml deleted file mode 100644 index 494b119e8..000000000 --- a/.github/workflows/integrated-api-tests.yml +++ /dev/null @@ -1,85 +0,0 @@ -name: Integrated API Tests - -on: - workflow_run: - workflows: [Smoke Tests] - types: - - completed - # Allows you to run this workflow manually from the Actions tab - workflow_dispatch: - -jobs: - build: - if: (github.event_name == 'workflow_dispatch') || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') - runs-on: ubuntu-latest - - steps: - - name: Download Branch and run_id Artifacts - uses: dawidd6/action-download-artifact@v3 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - workflow: smoke-tests.yml - name: branch-info-${{ github.event_name == 'workflow_run' && github.event.workflow_run.id || github.run_id }} - path: ./artifacts - continue-on-error: true # Allow the step to fail without stopping the workflow - - name: Determine Branch for Checkout - id: determine_branch - run: | - if [[ -f ./artifacts/branch.txt && -f ./artifacts/run_id.txt ]]; then - echo "branch=$(cat ./artifacts/branch.txt)" >> $GITHUB_ENV - echo "original_run_id=$(cat ./artifacts/run_id.txt)" >> $GITHUB_ENV - else - BRANCH_NAME=$(echo $GITHUB_REF | cut -d "/" -f 3) - echo "branch=$BRANCH_NAME" >> $GITHUB_ENV - fi - - name: Print Branch to be Checked Out - run: | - echo "Branch to checkout: ${{ env.branch }}" - - name: Checkout the Code - uses: actions/checkout@v4 - with: - ref: ${{ env.branch }} - - name: Set up Python - uses: actions/setup-python@v5 - with: - 
python-version: '3.12.2' - cache: 'pip' # caching pip stuff - - name: Install Swirl - run: ./install.sh - - name: Update apt - run: sudo apt -o Acquire::Retries=3 update - - name: Upgrade Ubuntu to Latest Patches - run: sudo apt upgrade -y - - name: Install redis-server - run: sudo apt install -y redis-server - - name: Set up Swirl - run: python swirl.py setup - - name: Start Swirl - run: python swirl.py start - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - - name: Run Integrated API Tests - run: docker run --net=host -t swirlai/swirl-search-qa:automated-tests sh -c "behave --tags=integrated_api" - - name: Ensure Artifacts Directory Exists and WriteBbranch and run_id Again - run: | - mkdir -p ./artifacts - echo "${{ env.branch }}" > ./artifacts/branch.txt - echo "${{ env.original_run_id }}" > ./artifacts/run_id.txt - - name: Re-upload Branch and run_id for Subsequent Workflows - uses: actions/upload-artifact@v4 - with: - name: branch-info-${{ github.run_id }} - path: | - ./artifacts/branch.txt - ./artifacts/run_id.txt - - name: Upload Log Files - if: always() - uses: actions/upload-artifact@v4 - with: - name: log-files - path: | - logs/ - /var/log/syslog* diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 0a4983451..defa53e0b 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -4,7 +4,7 @@ # documentation. 
# Sample workflow for building and deploying a Jekyll site to GitHub Pages -name: DeployDocsSite +name: Deploy Documentation # Modified trigger to only start this workflow on docs dir changes on: @@ -36,9 +36,9 @@ jobs: run: working-directory: docs # Added working-dir spec for docs dir steps: - - name: Checkout + - name: Checkout the Code uses: actions/checkout@v4 - - name: Setup Ruby + - name: Set Up Ruby uses: ruby/setup-ruby@v1 with: ruby-version: '3.3' # Not needed with a .ruby-version file @@ -46,7 +46,7 @@ jobs: cache-version: 0 # Increment this number if you need to re-download cached gems working-directory: 'docs' # Added working-dir param here after moving Ruby files to docs/ dir - - name: Setup Pages + - name: Set Up Pages id: pages uses: actions/configure-pages@v4 - name: Build with Jekyll @@ -54,7 +54,7 @@ jobs: run: bundle exec jekyll build --baseurl "${{ steps.pages.outputs.base_path }}" env: JEKYLL_ENV: production - - name: Upload artifact + - name: Upload Artifacts # Automatically uploads an artifact from the './_site' directory by default uses: actions/upload-pages-artifact@v3 with: diff --git a/.github/workflows/smoke-tests.yml b/.github/workflows/qa-suite.yml similarity index 95% rename from .github/workflows/smoke-tests.yml rename to .github/workflows/qa-suite.yml index e9b9cc9dd..23acc9cda 100644 --- a/.github/workflows/smoke-tests.yml +++ b/.github/workflows/qa-suite.yml @@ -1,4 +1,4 @@ -name: Smoke Tests +name: QA Suite on: workflow_run: @@ -40,7 +40,7 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ env.branch }} - - name: Set up Python + - name: Set Up Python uses: actions/setup-python@v5 with: python-version: '3.12.2' @@ -53,7 +53,7 @@ jobs: run: sudo apt upgrade -y - name: Install redis-server run: sudo apt install -y redis-server - - name: Set up Swirl + - name: Set Up Swirl run: python swirl.py setup - name: Start Swirl run: python swirl.py start @@ -62,8 +62,8 @@ jobs: with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ 
secrets.DOCKER_PASSWORD }} - - name: Run Smoke Tests - run: docker run --net=host -t swirlai/swirl-search-qa:automated-tests sh -c "behave --tags=docker_api_smoke" + - name: Run the QA Suite + run: docker run --net=host -t swirlai/swirl-search-qa:automated-tests sh -c "behave --tags=qa_suite" - name: Ensure Artifacts Directory Exists and Write Branch and run_id Again run: | mkdir -p ./artifacts diff --git a/.github/workflows/sectest-docker-image.yml b/.github/workflows/sectest-docker-image.yml index 4d2ff8536..b141902b3 100644 --- a/.github/workflows/sectest-docker-image.yml +++ b/.github/workflows/sectest-docker-image.yml @@ -1,4 +1,4 @@ -name: SecurityTestingBuild +name: Security Testing Build # Build a multi-arch docker image for testing security updates to Swirl @@ -16,7 +16,7 @@ jobs: runs-on: ubuntu-latest steps: - - name: Checkout the code + - name: Checkout the Code uses: actions/checkout@v4 - name: Login to Docker Hub uses: docker/login-action@v3 @@ -39,7 +39,7 @@ jobs: run: docker buildx create --name devBuilder --use --bootstrap - name: Build the Docker Image run: docker buildx build -t swirlai/swirl-search:security-testing --platform linux/amd64,linux/arm64 --push . 
- - name: Upload log files + - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/spell-checker.yml b/.github/workflows/spell-checker.yml index 803703024..59aca2409 100644 --- a/.github/workflows/spell-checker.yml +++ b/.github/workflows/spell-checker.yml @@ -1,4 +1,4 @@ -name: CheckSpelling +name: Check Spelling # Trigger to only run this workflow automatically on docs/ directory changes on: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 124292cd3..7a1d95e05 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -21,7 +21,7 @@ jobs: steps: - name: Checkout the Code uses: actions/checkout@v4 - - name: Set up Python + - name: Set Up Python uses: actions/setup-python@v5 with: python-version: '3.12.2' diff --git a/.github/workflows/urls-checker.yml b/.github/workflows/urls-checker.yml index 8d5ba78ab..2ae94edcc 100644 --- a/.github/workflows/urls-checker.yml +++ b/.github/workflows/urls-checker.yml @@ -1,4 +1,4 @@ -name: CheckURLs +name: Check URLs # Trigger to only run this workflow automatically on docs/ directory changes on: @@ -18,7 +18,7 @@ jobs: steps: - uses: actions/checkout@v4 - - name: URLs-Checker + - name: URLs Checker uses: urlstechie/urlchecker-action@0.0.34 # From here: https://github.com/urlstechie/urlchecker-action with: # A comma-separated list of file types to cover in the URL checks From 276504c6313e69a4d04872bad9e3f321e39c2bf4 Mon Sep 17 00:00:00 2001 From: dnicodemus-la Date: Tue, 27 Feb 2024 17:16:13 -0500 Subject: [PATCH 11/44] a new x branch --- Dockerfile.fork.spg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.fork.spg b/Dockerfile.fork.spg index 116b7c197..a16044541 100644 --- a/Dockerfile.fork.spg +++ b/Dockerfile.fork.spg @@ -3,7 +3,7 @@ FROM node:14.17.5 AS builder RUN git clone https://bitbucket.org/swirl-spyglass/spyglass.git /usr/src/spyglass WORKDIR /usr/src/spyglass/ui -RUN 
git checkout test-sockets +RUN git checkout chart-feature RUN git pull RUN npm install -g npm@7.21.1 RUN npm install From acadacfa3e092133ce8d16f7b477bb0ef57c8f01 Mon Sep 17 00:00:00 2001 From: Erik Spears <98238295+erikspears@users.noreply.github.com> Date: Tue, 27 Feb 2024 17:27:51 -0500 Subject: [PATCH 12/44] Revert "Bump cryptography and Django, remove snowflake." --- .github/workflows/urls-checker.yml | 2 +- SearchProviders/company_snowflake.json | 30 +++++ SearchProviders/preloaded.json | 30 +++++ docs/Developer-Reference.md | 165 +++++++++++++++---------- docs/User-Guide.md | 65 +++++----- requirements.txt | 65 +++++----- swirl/connectors/__init__.py | 1 + swirl/connectors/db_connector.py | 7 +- swirl/connectors/snowflake.py | 92 ++++++++++++++ swirl/models.py | 3 +- 10 files changed, 332 insertions(+), 128 deletions(-) create mode 100644 SearchProviders/company_snowflake.json create mode 100644 swirl/connectors/snowflake.py diff --git a/.github/workflows/urls-checker.yml b/.github/workflows/urls-checker.yml index 2ae94edcc..850335787 100644 --- a/.github/workflows/urls-checker.yml +++ b/.github/workflows/urls-checker.yml @@ -37,7 +37,7 @@ jobs: retry_count: 3 # A comma separated patterns to exclude during URL checks - exclude_patterns: localhost,api,apis,rss,etc,xx,googleapis,hostname,graph.microsoft.com,login.microsoftonline.com,my-host.com + exclude_patterns: localhost,api,apis,rss,etc,xx,googleapis,hostname,snowflake,graph.microsoft.com,login.microsoftonline.com,my-host.com # Exclude these files from the checker exclude_files: Swirl.postman_collection.json,docs/googlec95caf0bd4a8c5df.html,docs/Gemfile,docs/Gemfile.lock,docs/_config.yml,tests/,SearchProviders/ diff --git a/SearchProviders/company_snowflake.json b/SearchProviders/company_snowflake.json new file mode 100644 index 000000000..804ce3016 --- /dev/null +++ b/SearchProviders/company_snowflake.json @@ -0,0 +1,30 @@ +{ + "name": "Free Company Records - Snowflake", + "active": false, + "default": 
false, + "authenticator": "", + "connector": "Snowflake", + "url": "", + "query_template": "SELECT {fields} FROM {table} WHERE {field1} ILIKE '%{query_string}%' AND NULLIF(TRIM(founded), '') IS NOT NULL ORDER BY TRY_TO_NUMBER(REGEXP_REPLACE(SPLIT_PART(size, '-', 1), '[^0-9]', '')) DESC;", + "post_query_template": {}, + "http_request_headers": {}, + "page_fetch_config_json": {}, + "query_processors": [ + "AdaptiveQueryProcessor" + ], + "query_mappings": "fields=*,sort_by_date=founded,table=FREECOMPANYDATASET,field1=name", + "result_grouping_field": "", + "result_processors": [ + "MappingResultProcessor", + "CosineRelevancyResultProcessor" + ], + "response_mappings": "", + "result_mappings": "title='{name} ({founded})',body='{name} was founded in {founded} in {country}. It has {size} employees and operates in the {industry} industry.',url='https://{linkedin_url}',date_published=founded,NO_PAYLOAD", + "results_per_query": 10, + "credentials": "::FREE_COMPANY_DATASET:COMPUTE_WH", + "eval_credentials": "", + "tags": [ + "Company", + "Snowflake" + ] +} \ No newline at end of file diff --git a/SearchProviders/preloaded.json b/SearchProviders/preloaded.json index ac8df5317..4392a1b1f 100644 --- a/SearchProviders/preloaded.json +++ b/SearchProviders/preloaded.json @@ -1390,6 +1390,36 @@ "MongoDB" ] }, + { + "name": "Free Company Records - Snowflake", + "active": false, + "default": false, + "authenticator": "", + "connector": "Snowflake", + "url": "", + "query_template": "SELECT {fields} FROM {table} WHERE {field1} ILIKE '%{query_string}%' AND NULLIF(TRIM(founded), '') IS NOT NULL ORDER BY TRY_TO_NUMBER(REGEXP_REPLACE(SPLIT_PART(size, '-', 1), '[^0-9]', '')) DESC;", + "post_query_template": {}, + "http_request_headers": {}, + "page_fetch_config_json": {}, + "query_processors": [ + "AdaptiveQueryProcessor" + ], + "query_mappings": "fields=*,sort_by_date=founded,table=FREECOMPANYDATASET,field1=name", + "result_grouping_field": "", + "result_processors": [ + 
"MappingResultProcessor", + "CosineRelevancyResultProcessor" + ], + "response_mappings": "", + "result_mappings": "title='{name} ({founded})',body='{name} was founded in {founded} in {country}. It has {size} employees and operates in the {industry} industry.',url='https://{linkedin_url}',date_published=founded,NO_PAYLOAD", + "results_per_query": 10, + "credentials": "::FREE_COMPANY_DATASET:COMPUTE_WH", + "eval_credentials": "", + "tags": [ + "Company", + "Snowflake" + ] + }, { "name": "Entities - LittleSis.org", "active": false, diff --git a/docs/Developer-Reference.md b/docs/Developer-Reference.md index c4c4785d4..b2dda6461 100644 --- a/docs/Developer-Reference.md +++ b/docs/Developer-Reference.md @@ -22,28 +22,28 @@ This guide is intended to provide developers with detailed reference information The following table describes in more detail all the steps in the federation process, with the associated `status` and other important state information. -| Action | Module | Status | Notes | -| ---------- | ---------- | ---------- | ---------- | +| Action | Module | Status | Notes | +| ---------- | ---------- | ---------- | ---------- | | Search object created | views.py SearchViewSet.list() | Search.status:
NEW_SEARCH
UPDATE_SEARCH | Required:
`Search.query_string` | | Pre-processing | search.py search() | Search.status:
PRE_PROCESSING | Checks permissions
Loads the Search object | | Pre-query processing | search.py search() | Search.status:
PRE_QUERY_PROCESSING | Processes `Search.query_string` and updates `Search.query_string_processed` | | Federation | search.py search() | Search.status:
FEDERATING
FEDERATING_WAIT_*
FULL_RESULTS | Creates one Connector for each SearchProvider in the Search | -| Connector Init | connectors/connector.py
connectors/db_connector.py | Connector.status:
INIT
READY | Loads the Search and SearchProvider | +| Connector Init | connectors/connector.py
connectors/db_connector.py | Connector.status:
INIT
READY | Loads the Search and SearchProvider | | Connector Federate | federate() | Connector.status:
FEDERATING | | | Connector Query Processing| process_query() | FEDERATING | Process `Search.query_string_processed` and store in `Connector.query_string_to_provider` | | Connector Construct Query | construct_query() | FEDERATING | Take `Connector.query_string_to_provider` and create `Connector.query_to_provider` | | Connector Validate Query | validate_query() | FEDERATING | Returns "False" if `Connector.query_to_provider` is empty | -| Connector Execute Search | execute_search () | FEDERATING | Connect to the SearchProvider
Execute the search using `Search.query_to_provider`
Store the response in `Connector.response` | +| Connector Execute Search | execute_search () | FEDERATING | Connect to the SearchProvider
Execute the search using `Search.query_to_provider`
Store the response in `Connector.response` | | Connector Normalize Response | normalize_response() | FEDERATING | Transform `Connector.response` into JSON list of dicts
Store it in `Connector.results` | | Connector Process Results | process_results() | Connector.status:
FEDERATING
READY | Process `Connector.results` | | Connector Save Results | save_results() | Connector.status:
READY | Returns "True" | -| Post-result processing | search.py search() | Search.status:
POST_RESULT_PROCESSING
FULL_RESULTS_READY
FULL_UPDATE_READY | Runs the `post_result_processors`
Updates Result objects | +| Post-result processing | search.py search() | Search.status:
POST_RESULT_PROCESSING
FULL_RESULTS_READY
FULL_UPDATE_READY | Runs the `post_result_processors`
Updates Result objects | # `Search.Status` ## Normal States -| Status | Meaning | +| Status | Meaning | | ---------- | ---------- | | NEW_SEARCH | The search object is to be executed immediately | | UPDATE_SEARCH | The search object is to be updated immediately | @@ -56,21 +56,21 @@ The following table describes in more detail all the steps in the federation pro | PARTIAL_RESULTS | Swirl has received results from some providers, but not all | | POST_RESULT_PROCESSING | Swirl is performing post-result processing | | PARTIAL_RESULTS_READY | Swirl has processed results from responding providers | -| PARTIAL_UPDATE_READY | Swirl has processed updated results from responding providers | -| FULL_RESULTS_READY | Swirl has processed results for all specified providers | -| FULL_UPDATE_READY | Swirl has processed updated results for all specified providers | +| PARTIAL_UPDATE_READY | Swirl has processed updated results from responding providers | +| FULL_RESULTS_READY | Swirl has processed results for all specified providers | +| FULL_UPDATE_READY | Swirl has processed updated results for all specified providers | ## Error States -| Status | Meaning | +| Status | Meaning | | ---------- | ---------- | | ERR_DUPLICATE_RESULT_OBJECTS | More than one Result object was found; [contact support](#support) for assistance. | -| ERR_NEED_PERMISSION | The Django User did not have sufficient permissions to perform the requested operation. More: [Permissioning Normal Users](Admin-Guide.md#permissioning-normal-users) | +| ERR_NEED_PERMISSION | The Django User did not have sufficient permissions to perform the requested operation. 
More: [Permissioning Normal Users](Admin-Guide.md#permissioning-normal-users) | | ERR_NO_ACTIVE_SEARCHPROVIDERS | Search failed because no specified SearchProviders were active | | ERR_NO_RESULTS | Swirl has not received results from any source | | ERR_NO_SEARCHPROVIDERS | Search failed because no SearchProviders were specified | | ERR_RESULT_NOT_FOUND | A Result object that was expected to be found, was not; [contact support](#support) for assistance. | -| ERR_RESULT_PROCESSING | An error occurred during Result processing - check the `logs/celery-worker.log` for details | +| ERR_RESULT_PROCESSING | An error occurred during Result processing - check the `logs/celery-worker.log` for details | | ERR_SUBSCRIBE_PERMISSIONS | The user who created the Search object lacks permission to enable `subscribe` mode | # Key Module List @@ -88,7 +88,7 @@ A SearchProvider defines some searchable source. It includes metadata identifyin ## Properties -| Property | Description | Default Value (`Example Value`) | +| Property | Description | Default Value (`Example Value`) | | ---------- | ---------- | ---------- | | id | Unique identifier for the SearchProvider | Automatic (`1`) | | name | Human-readable name for the source | "" (`"Enterprise Search PSE"`) | @@ -98,7 +98,7 @@ A SearchProvider defines some searchable source. 
It includes metadata identifyin | date_updated | The time and date at which the SearchProvider was updated | Automatic (`2022-02-29T18:03:02.716456Z`) |
| active | Boolean setting: if `true` the SearchProvider is used, if `false` it is ignored when federating | false (`true`) |
| default | Boolean setting: if `true` the SearchProvider will be queried for searches that don't specify a `searchprovider_list`; if `false`, the SearchProvider must be specified in the `searchprovider_list` | false (`true`) |
-| connector | Name of the Connector to use for this source | "" (`"RequestsGet"`) | 
+| connector | Name of the Connector to use for this source | "" (`"RequestsGet"`) |
| url | The URL or other string including file path needed by the Connector for this source; not validated | "" (`"https://www.googleapis.com/customsearch/v1"`) |
| query_template | A string with optional variables in form `{variable}`; the Connector will bind the `query_template` with required data including the `url` and `query_string`, as well as any `query_mappings` or `credentials`, at runtime. Note this format is not yet used by the [Sqlite3 Connector](#sqlite3). 
| "" (`"{url}?q={query_string}"`) | | post_query_template | For the RequestsPost Connector: valid JSON and a marker for the query text which then sent as the POST body | "" (`"query": "{query_string}","limit": "100"`) | @@ -132,27 +132,27 @@ The only required property is a `query_string` with the actual text to be search ## Properties -| Property | Description | Default Value (`Example Value`) | +| Property | Description | Default Value (`Example Value`) | | ---------- | ---------- | ---------- | | id | Unique identifier for the Search | Automatic (`search_id=1`) | | owner | The username of the Django user that owns the object | logged-in user (`admin`) | | date_created | The time and date at which the Search was created | Automatic (`2022-02-28T17:55:04.811262Z`) | | date_updated | The time and date at which the Search was updated | Automatic (`2022-02-28T17:55:07.811262Z`) | -| query_string | The query to be federated - the only required field! | "" (`knowledge management`) | +| query_string | The query to be federated - the only required field! 
| "" (`knowledge management`) | | query_string_processed | The Search query, modified by any pre-query processing | "" ("") | -| sort | The type of search to be run | relevancy (`date`) | -| results_requested | The number of results, overall, the user has requested | 10 (`25`) | +| sort | The type of search to be run | relevancy (`date`) | +| results_requested | The number of results, overall, the user has requested | 10 (`25`) | | searchprovider_list | A list of the SearchProviders to search for this query; an empty list, the default, searches all sources | [] (`[ "Enterprise Search Engines - Google PSE" ]`) | | subscribe | If `True`, Swirl will update this Search as per the Celery-Beats schedule | False (`True`) | | status | The execution status of this search (see below) | NEW_SEARCH (`FULL_RESULTS_READY`) | | pre_query_processors | A list of processors to apply to the query before federation starts | "" (`[ "SpellcheckQueryProcessor" ]`) | | post_result_processors | A list of result processors to apply to the results after federation is complete | "" (`[ "DedupeByFieldPostResultProcessor", "CosineRelevancyPostResultProcessor" ]`) | | result_url | Link to the initial Result object for the Search which uses the `RelevancyMixer` | Automatic (`"http://localhost:8000/swirl/results?search_id=17&result_mixer=RelevancyMixer"`) | -| new_result_url | Link to the updated Result object for the search which uses the `RelevancyNewItemsMixer` | Automatic (`"http://localhost:8000/swirl/results?search_id=17&result_mixer=RelevancyNewItemsMixer"`) | -| messages | Messages from SearchProviders | "" (`Retrieved 1 of 1 results from: Document DB Search`) | -| result_mixer | The name of the Mixer object (see below) to use for ordering results | RoundRobinMixer (`Stack2Mixer`) | -| retention | The retention setting for this object; `0` = retain indefinitely; see [Search Expiration Service](Admin-Guide.md#search-expiration-service) for details | 0 (`2` for daily deletion) | -| tags | 
Parameter (string) that can be passed into a search and will be attached to the Search object that is stored in Swirl | "" (`{ "query_string": "knowledge management", "tags": ["max_length:50"] }`) | +| new_result_url | Link to the updated Result object for the search which uses the `RelevancyNewItemsMixer` | Automatic (`"http://localhost:8000/swirl/results?search_id=17&result_mixer=RelevancyNewItemsMixer"`) | +| messages | Messages from SearchProviders | "" (`Retrieved 1 of 1 results from: Document DB Search`) | +| result_mixer | The name of the Mixer object (see below) to use for ordering results | RoundRobinMixer (`Stack2Mixer`) | +| retention | The retention setting for this object; `0` = retain indefinitely; see [Search Expiration Service](Admin-Guide.md#search-expiration-service) for details | 0 (`2` for daily deletion) | +| tags | Parameter (string) that can be passed into a search and will be attached to the Search object that is stored in Swirl | "" (`{ "query_string": "knowledge management", "tags": ["max_length:50"] }`) | {: .highlight } There are some special Search tags that control query processing. For example, the `SW_RESULT_PROCESSOR_SKIP` Search tag can be used to skip a processor for the Search it is specified for: `SW_RESULT_PROCESSOR_SKIP:DedupeByFieldResultProcessor` @@ -172,21 +172,21 @@ There are some special Search tags that control query processing. For example, A Result object is the normalized, re-ranked result for a single Search, from a single SearchProvider. They are created at the end of the federated search process in response to the creation of a Search object. They are the only Swirl object that has a foreign key (`search.id`). -Only Connectors should create Result objects. +Only Connectors should create Result objects. -Developers are free to operate on individual Results as needed for their application. +Developers are free to operate on individual Results as needed for their application. 
However, the [goal of Swirl](index.md) (and federated search in general) is to provide unified results from all sources. Swirl uses Mixers to make this quick and easy. ## Properties -| Property | Description | `Example Value` | -| ---------- | ---------- | ---------- | +| Property | Description | `Example Value` | +| ---------- | ---------- | ---------- | | id | Unique identifier for the Result | `1` | | owner | The username of the Django user that owns the object | `admin` | | date_created | The time and date at which the Result was created. | `2022-02-28T17:55:04.811262Z` | | date_updated | The time and date at which the Result was updated | `2022-02-28T19:55:02.752962Z` | -| search_id | The `id` of the associated Search; there may be many Result objects with this `id` | `18` | +| search_id | The `id` of the associated Search; there may be many Result objects with this `id` | `18` | | searchprovider | The name value of the SearchProvider that provided this result list | `"OneDrive Files - Microsoft 365"` | | query_to_provider | The exact query sent to the SearchProvider | `https://www.googleapis.com/customsearch/v1?cx=google-search-engine-id&key=google-json-api-key&q=strategy` | | query_processors | The names of the Processors, specified in the SearchProvider, that processed the query | `"AdaptiveQueryProcessor"` | @@ -197,7 +197,7 @@ However, the [goal of Swirl](index.md) (and federated search in general) is to p | retrieved | The number of results Swirl retrieved from this SearchProvider for this query | `10` | | found | The total number of results reported by the SearchProvider for this query | `2309` | | time | The time it took for the SearchProvider to create this result set, in seconds | `1.9` | -| json_results | The normalized JSON results from this SearchProvider | (*See below*) | +| json_results | The normalized JSON results from this SearchProvider | (*See below*) | ## `json_results` @@ -253,13 +253,14 @@ The following table describes the included 
source Connectors: | PostgreSQL | Searches PostgreSQL database | `url` (connection parameters), `query_template`, `credentials` | | RequestsGet | Searches any web endpoint using HTTP/GET with JSON response, including Google PSE, SOLR, Northern Light and more (see below) | `url`, `credentials` | | RequestsPost | Searches any web endpoint using HTTP/POST with JSON response, including M365 | `url`, `credentials` | +| Snowflake | Searches Snowflake datasets | `credentials`, `database`, `warehouse` | | Sqlite3 | Searches SQLite3 databases | `url` (database file path), `query_template` | -Connectors are specified in, and configured by, SearchProvider objects. +Connectors are specified in, and configured by, SearchProvider objects. ## BigQuery -The [BigQuery connector](https://github.com/swirlai/swirl-search/blob/main/swirl/connectors/bigquery.py) uses the Google Cloud Python package. +The [BigQuery connector](https://github.com/swirlai/swirl-search/blob/main/swirl/connectors/bigquery.py) uses the Google Cloud Python package. The included [BigQuery SearchProvider](https://github.com/swirlai/swirl-search/blob/main/SearchProviders/funding_db_bigquery.json) is intended for use with the [Funding Data Set](#funding-data-set) but can be adapted to most any configuration. @@ -275,7 +276,7 @@ The included [BigQuery SearchProvider](https://github.com/swirlai/swirl-search/b ], "query_mappings": "fields=*,sort_by_date=fundedDate,table=funding.funding,field1=company,field2=city", "result_processors": [ - "MappingResultProcessor", + "MappingResultProcessor", "CosineRelevancyResultProcessor" ], "result_mappings": "title='{company}',body='{company} raised ${raisedamt} series {round} on {fundeddate}. 
The company is located in {city} {state} and has {numemps} employees.',url=id,date_published=fundeddate,NO_PAYLOAD", @@ -358,7 +359,7 @@ The following three Tags are available for use in the ChatGPT SearchProvider to "CHAT_QUERY_REWRITE_GUIDE:You are a helpful assistant that responds like a pirate captain" ``` -* `CHAT_QUERY_DO_FILTER`: Turn on or off the default internal filter of ChatGPT responses. +* `CHAT_QUERY_DO_FILTER`: Turn on or off the default internal filter of ChatGPT responses. ``` json "CHAT_QUERY_DO_FILTER:false" ``` @@ -531,13 +532,13 @@ As of Swirl 3.1.0, the included [Free Public DB](https://github.com/swirlai/swir ## PostgreSQL -The [PostgreSQL connector](https://github.com/swirlai/swirl-search/blob/main/swirl/connectors/postgresql.py) uses the [psycopg2](https://pypi.org/project/psycopg2/) driver. +The [PostgreSQL connector](https://github.com/swirlai/swirl-search/blob/main/swirl/connectors/postgresql.py) uses the [psycopg2](https://pypi.org/project/psycopg2/) driver. ### Installing the PostgreSQL Driver To use PostgreSQL with Swirl: -* Install [PostgreSQL](https://www.postgresql.org/) +* Install [PostgreSQL](https://www.postgresql.org/) * Modify the system PATH so that `pg_config` from the PostgreSQL distribution runs from the command line * Install `psycopg2` using `pip`: @@ -866,6 +867,46 @@ The [RequestsPost connector](https://github.com/swirlai/swirl-search/blob/main/s [Contact Support](#support) for help getting started. +## Snowflake + +The [Snowflake connector](https://github.com/swirlai/swirl-search/blob/main/swirl/connectors/snowflake.py) uses the `snowflake-connector-python` package to connect to a Snowflake instance. + +The included [Free Company Records](https://github.com/swirlai/swirl-search/blob/main/SearchProviders/company_snowflake.json) SearchProvider is configured to search the `FreeCompanyResearch` dataset available in the Snowflake Marketplace. 
+ +``` json +{ + "name": "Free Company Records - Snowflake", + "active": false, + "default": false, + "authenticator": "", + "connector": "Snowflake", + "url": "", + "query_template": "SELECT {fields} FROM {table} WHERE {field1} ILIKE '%{query_string}%' AND NULLIF(TRIM(founded), '') IS NOT NULL ORDER BY TRY_TO_NUMBER(REGEXP_REPLACE(SPLIT_PART(size, '-', 1), '[^0-9]', '')) DESC;", + "post_query_template": {}, + "http_request_headers": {}, + "page_fetch_config_json": {}, + "query_processors": [ + "AdaptiveQueryProcessor" + ], + "query_mappings": "fields=*,sort_by_date=founded,table=FREECOMPANYDATASET,field1=name", + "result_grouping_field": "", + "result_processors": [ + "MappingResultProcessor", + "CosineRelevancyResultProcessor" + ], + "response_mappings": "", + "result_mappings": "title='{name} ({founded})',body='{name} was founded in {founded} in {country}. It has {size} employees and operates in the {industry} industry.',url='https://{linkedin_url}',date_published=founded,NO_PAYLOAD", + "results_per_query": 10, + "credentials": "::FREE_COMPANY_DATASET:COMPUTE_WH", + "eval_credentials": "", + "tags": [ + "Company", + "Snowflake" + ] +} +``` + +Note: Putting a fixed SQL query in the `query_template` is perfectly acceptable. Anything that doesn't change in the URL can be stored here. ## SQLite3 @@ -926,18 +967,18 @@ This pipeline removes duplicates from the result set prior to relevancy ranking. Query Processors operate queries. The exact field they operate on depends on how they are deployed. 
| Pipeline | Reads | Updates | -| ---------- | ---------- | ---------- | -| Search.pre_query_processors | `Search.query_string` | `Search.query_string_processed` | -| SearchProvider.query_processors | `Search.query_string_processed` | `.query_string_to_provider` | +| ---------- | ---------- | ---------- | +| Search.pre_query_processors | `Search.query_string` | `Search.query_string_processed` | +| SearchProvider.query_processors | `Search.query_string_processed` | `.query_string_to_provider` | This table describes the query processors included in Swirl: -| Processor | Description | Notes | -| ---------- | ---------- | ---------- | +| Processor | Description | Notes | +| ---------- | ---------- | ---------- | | AdaptiveQueryProcessor | Rewrites queries based on the `query_mappings` for a given SearchProvider | Should not be used as `pre_query_processor` | | ChatGPTQueryProcessor | This query processor asks ChatGPT to rewrite queries based on a configurable prompt. For example it can rewrite queries to be fuzzier, broader, more specific, boolean, or in another language. | Experimental | | GenericQueryProcessor | Removes special characters from the query | | -| SpellcheckQueryProcessor | Uses [TextBlob](https://textblob.readthedocs.io/en/dev/quickstart.html#spelling-correction) to predict and fix spelling errors in `query_string` | Best deployed in a `SearchProvider.query_processor` for sources that need it; not recommended with Google PSEs | +| SpellcheckQueryProcessor | Uses [TextBlob](https://textblob.readthedocs.io/en/dev/quickstart.html#spelling-correction) to predict and fix spelling errors in `query_string` | Best deployed in a `SearchProvider.query_processor` for sources that need it; not recommended with Google PSEs | | NoModQueryProcessor | Only removes leading SearchProvider Tags and does not modify the query terms in any way. | It is intended for repositories that allow non-search characters (such as brackets). 
| ## Result Processors @@ -946,15 +987,15 @@ Result Processors transform source results into the Swirl format defined in [swi The following table lists the Result Processors included with Swirl: -| Processor | Description | Notes | -| ---------- | ---------- | ---------- | +| Processor | Description | Notes | +| ---------- | ---------- | ---------- | | GenericResultProcessor | Copies results from source format to Swirl format by exact match on name | Recommended for sources that don't need mapping | | MappingResultProcessor | Transforms results from source format to Swirl format, using `SearchProvider.result_mappings` | Default | | LenLimitingResultProcessor | Checks if the `title` and `body` responses from a source exceed a configurable length (set in `swirl_server/settings.py`: `SWIRL_MAX_FIELD_LEN = 512`), truncates anything after that value, and adds an ellipsis ("..."). If the `body` field has been truncated, the processor reports the entire response in a new `body_full` field in the Payload. The default truncation length for can be overridden for a specific SearchProvider using a new Tag value (e.g. `max_length:256`). | Recommended for sources that consistently return lengthy title or body fields; should follow the `MappingResultProcessor`. | | CleanTextResultProcessor | Removes non-alphanumeric characters from the source response. It should be considered for lengthy responses where URLs or other HTML or Markdown syntax appear in results. | Should be installed before the `LenLimitingResultProcessor` when both are used. | | DateFinderResultProcessor | Looks for a date in any a number of formats in the body field of each result item. Should it find one, and the `date_published` for that item is `'unknown'`, it replaces `date_published` with the date extracted from the body, and notes this in the `result.messages`. | This processor can detect the following date formats:
`06/01/23`
`06/01/2023`
`06-01-23`
`06-01-2023`
`jun 1, 2023`
`june 1, 2023` |
| AutomaticPayloadMapperResultProcessor | Profiles response data to find good strings for Swirl's `title`, `body`, and `date_published` fields. It is intended for SearchProviders that would otherwise have few (or no) good `result_mappings` options. | It should be placed after the `MappingResultProcessor`. The `result_mappings` field should be blank, except for the optional DATASET directive, which will return only a single Swirl response for each provider response, with the original response in the `payload` field under the `dataset` key. |
| RequireQueryStringInTitleResultProcessor | Drops results that do not contain the `query_string_to_provider` in the result `title` field. | It should be added after the `MappingResultProcessor` and is now included by default in the "LinkedIn - Google PSE" SearchProvider. 
| ## Post Result Processors @@ -972,8 +1013,8 @@ The relevancy model is as follows: * Aggregates the similarity of: - * the entire query and the field, with the score being the highest in any single sentence (if any), - + * the entire query and the field, with the score being the highest in any single sentence (if any), + * the entire query and a window of text around the field match * the 1 and 2 word combinations in the query (if long enough), and a window of text around the field match @@ -988,7 +1029,7 @@ The relevancy model is as follows: * Normalizes the query executed by this SearchProvider vs. all the other queries in the set - this is reflected in the `query_length_adjust` in the `explain` structure. -The Swirl score is just that: a score. The higher a score is, the more contextually relevant the result is. Scores aren't comparable between queries or results. +The Swirl score is just that: a score. The higher a score is, the more contextually relevant the result is. Scores aren't comparable between queries or results. 
*Tip: to translate a result score to a confidence score, take the #1 result as 1.0, and then divide subsequent results by the score for that result to calculate the confidence.* @@ -1019,16 +1060,16 @@ The Galaxy UI will not display the correct number of results if this ResultProce The following table details the Result Mixers included with Swirl: | Mixer | Description | Notes | -| ---------- | ---------- | ---------- | +| ---------- | ---------- | ---------- | | RelevancyMixer | Organizes results by [relevancy](User-Guide.md#relevancy-ranking) score (descending), then source rank (ascending) | The default; depends on `relevancy_processor` being installed as the `search.post_result_processors` (also the default) | | RelevancyNewItemsMixer | Organizes results as above, but hiding results that don't have the `new` field as [created during Search updates](Developer-Guide.md#update-a-search) | This is the default for `search.new_result_url`| -| DateMixer | Organizes results by `date_published`. Results with "unknown" for `date_published` are omitted | Use when you want date sorted results | -| DateNewItemsMixer | Organizes results as above, but hiding results that don't have the `new` field as [created during Search updates](Developer-Guide.md#update-a-search) | This is the default for `search.new_result_url` when `search.date` is set to `sort` | -| RoundRobinMixer | Organizes results by taking 1 result from each responding SearchProvider, alternating; actually calls `Stack1Mixer` (see below) | Good for searches with `search.sort` set to "date" or anytime you want a cross-section of results instead of just the ones with the most evidence | +| DateMixer | Organizes results by `date_published`. 
Results with "unknown" for `date_published` are omitted | Use when you want date sorted results | +| DateNewItemsMixer | Organizes results as above, but hiding results that don't have the `new` field as [created during Search updates](Developer-Guide.md#update-a-search) | This is the default for `search.new_result_url` when `search.date` is set to `sort` | +| RoundRobinMixer | Organizes results by taking 1 result from each responding SearchProvider, alternating; actually calls `Stack1Mixer` (see below) | Good for searches with `search.sort` set to "date" or anytime you want a cross-section of results instead of just the ones with the most evidence | | Stack1Mixer | Organizes results by taking 1 result from each responding SearchProvider, alternating | Good for cross-sections of data | | Stack2Mixer | Organizes results by taking 2 from each responding SearchProvider, alternating | Good for cross-sections of data with 4-6 sources | -| Stack3Mixer | Organizes results by taking 3 from each responding SearchProvider, alternating | Good for cross-sections of data with few sources | -| StackNMixer | Organizes results by taking `N` from each responding source, where `N` if not specified is the number of results requested divided by the number of SearchProviders reporting at least 1 result | Good for cross-sections of data with few providers | +| Stack3Mixer | Organizes results by taking 3 from each responding SearchProvider, alternating | Good for cross-sections of data with few sources | +| StackNMixer | Organizes results by taking `N` from each responding source, where `N` if not specified is the number of results requested divided by the number of SearchProviders reporting at least 1 result | Good for cross-sections of data with few providers | ## Date Mixer @@ -1042,7 +1083,7 @@ For example: [http://localhost:8000/swirl/results?search_id=1&result_mixer=DateM ## NewItems Mixers -The two NewItems mixers automatically filter results to items with the `new` field present. 
Both will report the number of results hidden because they do not have this field. +The two NewItems mixers automatically filter results to items with the `new` field present. Both will report the number of results hidden because they do not have this field. To remove the `new` field from all results in a search, add `&mark_all_as_read=1` to the `result_mixer` URL property. For example: @@ -1060,7 +1101,7 @@ To invoke the mixer specified using the `result_mixer` property of the Search ob http://localhost:8000/swirl/results/?search_id=1 ``` -If you use the Swirl defaults, a search will produce a JSON result that is relevancy ranked. +If you use the Swirl defaults, a search will produce a JSON result that is relevancy ranked. To specify a different Mixer, add `&result_mixer=mixer-name` to the URL. @@ -1073,7 +1114,7 @@ The following table describes the Mixer wrapper in more detail: | Field | Description | | ---------- | ---------- | | messages | All messages from the Search and all SearchProviders | -| info | A dictionary of Found and Retrieved counts from each SearchProvider | +| info | A dictionary of Found and Retrieved counts from each SearchProvider | | info - search | Information about the Search, including the processed query, and links to re-run and re-score Searches | | info - results | Information about the Results, including the number retrieved and the URL of the next (and previous) pages of results | | results | Mixed Results from the specified Search | @@ -1082,10 +1123,10 @@ The following table describes the Mixer wrapper in more detail: ## Funding Data Set -The TechCrunch Continental USA funding data set was taken from [Insurity SpatialKey](https://support.spatialkey.com/spatialkey-sample-csv-data/). 
It is included with Swirl in [Data/funding_db.csv](https://github.com/swirlai/swirl-search/blob/main/Data/funding_db.csv) -This file was processed with [scripts/fix_csv.py](https://github.com/swirlai/swirl-search/blob/main/scripts/fix_csv.py) prior to loading into SQLite3. +The TechCrunch Continental USA funding data set was taken from [Insurity SpatialKey](https://support.spatialkey.com/spatialkey-sample-csv-data/). It is included with Swirl in [Data/funding_db.csv](https://github.com/swirlai/swirl-search/blob/main/Data/funding_db.csv) +This file was processed with [scripts/fix_csv.py](https://github.com/swirlai/swirl-search/blob/main/scripts/fix_csv.py) prior to loading into SQLite3. -### Loading into SQLite3 +### Loading into SQLite3 1. Activate [sqlite_web](Admin-Guide.md#sqlite-web) Then, from the swirl-home directory: @@ -1232,7 +1273,7 @@ Results should appear in the right-hand pane: "author" : "Phillip K Allen", "to" : "pallen70@hotmail.com", "subject" : "Investment Structure", - "content" : """---------------------- Forwarded by - + "content" : """---------------------- Forwarded by + ... ``` diff --git a/docs/User-Guide.md b/docs/User-Guide.md index 3748432f6..5f5fa4510 100644 --- a/docs/User-Guide.md +++ b/docs/User-Guide.md @@ -16,20 +16,20 @@ nav_order: 3 ## Intended Audience -This guide is intended for developers, data scientists, program managers, or anyone who wants to use Swirl, including searching and customizing SearchProviders. +This guide is intended for developers, data scientists, program managers, or anyone who wants to use Swirl, including searching and customizing SearchProviders. For background information on Swirl, please review the [Swirl Overview](index.md). # Terminology -| Word | Explanation | +| Word | Explanation | | ---------- | ---------- | | SearchProvider | An object defining a searchable source. It includes metadata identifying the type of connector used to search the source and more. 
| | Search | An object defining a query that a user or system desires to run. It includes the `query_string` with the actual text and metadata. Most of the metadata is optional.| | Query | Search engines distinguish between the act of searching and the terms used for searching, which are usually referred to as a query. Swirl follows this convention whenever possible but may refer to a search as a query at times. | | Subscribe | An important property of Search objects. When set to `true`, Swirl periodically reruns the search, specifying a date sort to get newer data, and removing duplicates from results.| | Connector | A Swirl module that can connect to, and query, a particular type of data source. Connectors are a wrapper around some existing Python package such as `request.get` or `elasticsearch`.| -| Relevancy Ranking | An estimation of the relative value of a given search engine result to the user's query, as compared to all others - to put it simply. For more information: [https://en.wikipedia.org/wiki/Relevance_(information_retrieval)](https://en.wikipedia.org/wiki/Relevance_(information_retrieval)) | +| Relevancy Ranking | An estimation of the relative value of a given search engine result to the user's query, as compared to all others - to put it simply. For more information: [https://en.wikipedia.org/wiki/Relevance_(information_retrieval)](https://en.wikipedia.org/wiki/Relevance_(information_retrieval)) | # Running a Search @@ -49,7 +49,7 @@ If the search page appears, click `Log Out` at the top, right. The Swirl login p ![Swirl Results Source Facet](images/swirl_results_source-galaxy_dark.png) -Swirl returns the best results from all available sources by default. To filter results by one or more sources, check one or more of the `Source` boxes as shown above. Results are instantly filtered to just those sources. +Swirl returns the best results from all available sources by default. 
To filter results by one or more sources, check one or more of the `Source` boxes as shown above. Results are instantly filtered to just those sources. Click `Clear All` to return to viewing all results. @@ -79,9 +79,9 @@ Click the Swirl logo (top left of the page) at any time to reset the Galaxy sear The following table summarizes the current Swirl search syntax options: -| Syntax | Handling | Notes | +| Syntax | Handling | Notes | | ---------- | ---------- | ---------- | -| AND, OR | Passed down to all SearchProviders | Swirl does not verify compliance | +| AND, OR | Passed down to all SearchProviders | Swirl does not verify compliance | | NOT, -term | Passed down to configured SearchProviders and rewritten if necessary; removed from the query for providers that don't support `NOT` or `-term` | Swirl verifies compliance; and also down-weights and flags responses that included NOT-ed terms | | tag:term | Passes `term` to the SearchProviders configured with it in their `tags` field. The untagged portion of the query is discarded. If `tag:` begins the query, then only providers with that Tag are searched. | Example: `electric vehicle company:tesla`
Only the term `tesla` will go to SearchProviders with the `company` Tag, so long as they are active.
Example: `company:facebook`
The query `facebook` will only go to SearchProviders with the `company` Tag. | @@ -212,16 +212,17 @@ Swirl includes five (5) Google Programmable Search Engines (PSEs) to get you up [SearchProvider Example JSON](https://github.com/swirlai/swirl-search/tree/main/SearchProviders) | SearchProvider | Description | Notes | -| ---------- | ---------- | ---------- | +| ---------- | ---------- | ---------- | | arxiv.json | Searches the [arXiv.org](https://arxiv.org/) repository of scientific papers | No authorization required | | asana.json | Searches Tasks in [Asana](https://asana.com/) | Requires an Asana personal access token | | atlassian.json | Searches Atlassian [Confluence Cloud](https://www.atlassian.com/software/confluence), [Jira Cloud](https://www.atlassian.com/software/jira), and [Trello](https://trello.com/) Cards. | Requires a bearer token and/or Trello API key; Confluence searches the [CQL `text~` content](https://developer.atlassian.com/server/confluence/performing-text-searches-using-cql/) and Jira searches the [JQL `text~` content](https://support.atlassian.com/jira-software-cloud/docs/what-is-advanced-searching-in-jira-cloud/#Advancedsearching-textPerformingtextsearches) | | blockchain-bitcoin.json | Searches [Blockchain.com](https://www.blockchain.com/) for specific Bitcoin Addresses (wallets) and Transactions IDs (hashes) | Requires a Blockchain.com API key | | chatgpt.json | ChatGPT AI chatbot | Requires an OpenAI API key | +| company_snowflake.json | Searches the [Snowflake](https://www.snowflake.com/en/) `FreeCompanyResearch` dataset | Requires a Snowflake username and password | | crunchbase.json | Searches organizations via the [Crunchbase](https://www.crunchbase.com/) basic API | Requires a Crunchbase.com API key | | document_db.json | SQLite3 document database | [documents_db.csv](https://github.com/swirlai/swirl-search/tree/main/Data/documents_db.csv) | | elastic_cloud.json | elasticsearch, cloud version | [Enron Email 
Dataset](Developer-Reference.md#enron-email-data-set) Requires cloud_id, credentials | -| elasticsearch.json | elasticsearch, local install | [Enron Email Dataset](Developer-Reference.md#enron-email-data-set) Requires host, port, credentials | +| elasticsearch.json | elasticsearch, local install | [Enron Email Dataset](Developer-Reference.md#enron-email-data-set) Requires host, port, credentials | | europe_pmc.json | Searches the [EuropePMC.org](https://europepmc.org/) repository of life-sciences literature | No authorization required | | funding_db_bigquery.json | BigQuery funding database | [Funding Dataset](Developer-Reference.md#funding-data-set) | | funding_db_postgres.json | PostgreSQL funding database | [Funding Dataset](Developer-Reference.md#funding-data-set) | @@ -230,17 +231,17 @@ Swirl includes five (5) Google Programmable Search Engines (PSEs) to get you up | google_news.json | Searches the [Google News](https://news.google.com/) feed | No authorization required | | google_pse.json | Five Google Programmable Search Engines (PSE) | Includes shared Swirl credentials; may return a 429 error if overused | | hacker_news.json | Queries a [searchable version](https://hn.algolia.com/) of the Hacker News feeds | No authorization required | -| http_get_with_auth.json | Generic HTTP GET query with basic authentication | Requires url, credentials | +| http_get_with_auth.json | Generic HTTP GET query with basic authentication | Requires url, credentials | | http_post_with_auth.json | Generic HTTP POST query with basic authentication | Requires url, credentials | -| hubspot.json | Searches the HubSpot CRM for Companies, Contacts, and Deals | Requires a bearer token | +| hubspot.json | Searches the HubSpot CRM for Companies, Contacts, and Deals | Requires a bearer token | | internet_archive.json | Searches the [Internet Archive Library](https://archive.org/) of items | No authorization required | -| littlesis.json | Searches the free 
[LittleSis.org](https://littlesis.org/) database of "who-knows-who at the heights of business and government" | No authorization required | +| littlesis.json | Searches the free [LittleSis.org](https://littlesis.org/) database of "who-knows-who at the heights of business and government" | No authorization required | | microsoft.json | Searches M365 Outlook Messages, Calendar Events, OneDrive Files, SharePoint Sites, and Teams Chat | See the [M365 Guide](M365-Guide.md) for details | | miro.json | [Miro.com](https://miro.com) drawing service | Requires a bearer token | | movies_mongodb.json | Searches the [Mongodb Atlas](https://www.mongodb.com/) `sample_mflix` collection, `movies` sample table | Requires database username and password, plus Atlas cluster URL | -| newsdata_io.json | Newsdata.io internet news source | Requires username and password
archive provider also included | +| newsdata_io.json | Newsdata.io internet news source | Requires username and password
archive provider also included | | nlresearch.json | NLResearch.com is a premium and internet content search engine from [Northern Light](https://northernlight.com/) | Requires username and password | -| open_sanctions.json | Searches the [OpenSanctions.org](https://www.opensanctions.org/) database of sanctions targets and persons of interest | Requires and OpenSanctions API key | +| open_sanctions.json | Searches the [OpenSanctions.org](https://www.opensanctions.org/) database of sanctions targets and persons of interest | Requires and OpenSanctions API key | | opensearch.json | OpenSearch 2.x | [Developer Guide](Developer-Reference.md#elastic--opensearch) | | oracle.json | Tested against [Oracle](https://www.oracle.com/) 23c Free (and presumably supporting earlier versions) | Requires Oracle username and password | | preloaded.json | All preloaded SearchProviders | Defaults in the Swirl distribution | @@ -276,7 +277,7 @@ Swirl includes five (5) Google Programmable Search Engines (PSEs) to get you up * A new Google PSE SearchProvider that targets the [new Swirl documentation website](https://docs.swirl.today/) is included and enabled by default. * The EuropePMC SearchProvider is preloaded, set to active status, and configured to participate in Retrieval Augmented Generation (RAG) by default. -* As of Release 3.1.0, Swirl includes SearchProviders for [Asana](https://asana.com/) Tasks, [Atlassian Trello](https://trello.com/) Cards, [Internet Archive Library](https://archive.org/) items, [Mongodb Atlas](https://www.mongodb.com/), [Oracle](https://www.oracle.com/) (WIP). +* As of Release 3.1.0, Swirl includes SearchProviders for [Asana](https://asana.com/) Tasks, [Atlassian Trello](https://trello.com/) Cards, [Internet Archive Library](https://archive.org/) items, [Mongodb Atlas](https://www.mongodb.com/), [Oracle](https://www.oracle.com/) (WIP), and [Snowflake](https://www.snowflake.com/en/). 
* As of Release 3.2.0, Swirl includes SearchProviders for [LittleSis.org](https://littlesis.org/) and [OpenSanctions.org](https://www.opensanctions.org/) entity searching. @@ -307,7 +308,7 @@ If you have the raw JSON of SearchProvider, install it by copying/pasting into t 3. Paste one SearchProvider's JSON at a time into the form and press the `POST` button 4. Swirl will respond with the finished SearchProvider -As of Swirl 3.2.0, you can copy/paste lists of SearchProviders into the endpoint, and Swirl will load them all. +As of Swirl 3.2.0, you can copy/paste lists of SearchProviders into the endpoint, and Swirl will load them all. ## Bulk Loading @@ -326,7 +327,7 @@ python swirl_load.py SearchProviders/provider-name.json -u admin -p your-admin-p ## Editing -Edit any SearchProvider by adding the `id` to the end of the `/swirl/searchproviders` URL. +Edit any SearchProvider by adding the `id` to the end of the `/swirl/searchproviders` URL. For example: `http://localhost:8000/swirl/searchproviders/1/` @@ -335,7 +336,7 @@ For example: `http://localhost:8000/swirl/searchproviders/1/` From here, you can use the form at the bottom of the page to: * DELETE this SearchProvider, forever -* Edit the configuration of the SearchProvider and `PUT` the changes +* Edit the configuration of the SearchProvider and `PUT` the changes ## Query Templating @@ -345,7 +346,7 @@ Most SearchProviders require a `query_template`. This is usually bound to `query "query_template": "{'$text': {'$search': '{query_string}'}}", ``` -This format is not actually JSON, but rather a string. The single quotes are required, so that the JSON can use double quotes. +This format is not actually JSON, but rather a string. The single quotes are required, so that the JSON can use double quotes. As of Swirl 3.2.0, MongoDB all use the new `query_template_json` field, which stores the template as JSON. 
For example, here is the new MongoDB `query_template_json`: @@ -371,18 +372,18 @@ The suggestion is that SearchProviders who are good for most any search be left ## Query Mappings -SearchProvider `query_mappings` are key/value pairs that define how to query a given SearchProvider. +SearchProvider `query_mappings` are key/value pairs that define how to query a given SearchProvider. They include field mappings and configurations that Swirl's processors (like the `AdaptiveQueryProcessor`) use to align the query with each SearchProvider's capabilities. The following table summarizes the current `query_mappings` options: -| Mapping Format | Meaning | Example | +| Mapping Format | Meaning | Example | | ---------- | ---------- | ---------- | | key = value | Replace `key` with `value` if the `key` is enclosed in braces in the `provider.query_template`. | ```"query_template": "{url}?cx={cx}&key={key}&q={query_string}","query_mappings": "cx=google-pse-key"``` | -| DATE_SORT=url-snippet | This identifies the string to insert into the URL for this SearchProvider if date sorting is specified in the search object. | `"query_mappings": "DATE_SORT=sort=date"` | -| RELEVANCY_SORT=url-snippet | This identifies the string to insert into the URL for this SearchProvider if relevancy sorting is specified in the search object. | `"query_mappings": "RELEANCY_SORT=sort=relevancy"` | -| PAGE=url-snippet | This identifies the string to insert into the URL for this SearchProvider for paging support. The specification should include either Swirl variable `RESULT_INDEX` or `RESULT_PAGE` which will be the result number (e.g. 11) or page number (e.g. 2) | `"query_mappings": "PAGE=start=RESULT_INDEX"` | +| DATE_SORT=url-snippet | This identifies the string to insert into the URL for this SearchProvider if date sorting is specified in the search object. 
| `"query_mappings": "DATE_SORT=sort=date"` | +| RELEVANCY_SORT=url-snippet | This identifies the string to insert into the URL for this SearchProvider if relevancy sorting is specified in the search object. | `"query_mappings": "RELEANCY_SORT=sort=relevancy"` | +| PAGE=url-snippet | This identifies the string to insert into the URL for this SearchProvider for paging support. The specification should include either Swirl variable `RESULT_INDEX` or `RESULT_PAGE` which will be the result number (e.g. 11) or page number (e.g. 2) | `"query_mappings": "PAGE=start=RESULT_INDEX"` | | NOT=True | If present, this SearchProvider supports simple, single NOT operators | elon musk NOT twitter | | NOT_CHAR=- | If present, this SearchProvider supports `-term` NOT operators | elon musk -twitter | @@ -399,7 +400,7 @@ For `query_mappings`, keys that appear in the `query_template` wrapped in braces "query_mappings": "cx=0c38029ddd002c006,DATE_SORT=sort=date,PAGE=start=RESULT_INDEX", ``` -At federation time, this becomes the following URL: +At federation time, this becomes the following URL: ``` shell https://www.googleapis.com/customsearch/v1?cx=0c38029ddd002c006&q=some_query_string @@ -462,7 +463,7 @@ The `credentials` property stores any required authentication information for th ### key=value format -This credential is bound to the URL that is used to execute searches. +This credential is bound to the URL that is used to execute searches. For example, from a Google PSE: @@ -491,7 +492,7 @@ X-Api-Keys are supported by the `RequestsGet` and `RequestsPost` connectors. The ### HTTPBasicAuth, HTTPDigestAuth, HTTPProxyAuth -These methods are supported by the `RequestsGet`, `ElasticSearch` and `OpenSearch` connectors. +These methods are supported by the `RequestsGet`, `ElasticSearch` and `OpenSearch` connectors. 
For example, from the [Solr with Auth SearchProvider](https://github.com/swirlai/swirl-search/blob/main/SearchProviders/solr_with_auth.json): @@ -515,7 +516,7 @@ Here is the `response_mappings` from a Google PSE: The following table summarizes the `response_mappings` options: -| Mapping | Source_JSONPath | Required? | Example | +| Mapping | Source_JSONPath | Required? | Example | | ---------- | ---------- | ---------- | ---------- | | FOUND | Number of results for a given query, for this SearchProvider, e.g. 1,413
Same as `RETRIEVED` if not specified | No | `searchInformation.totalResults=FOUND` | | RETRIEVED | Number of results returned for a given query, for this SearchProvider, e.g. 10
Length of the `RESULTS` list (see below) if not specified | No | `queries.request[0].count=RETRIEVED` | @@ -567,7 +568,7 @@ Swirl will automatically convert this format to a JSON array of dicts, with the ### Multiple Mappings -As of version 1.6, Swirl can map multiple SearchProvider fields to a single Swirl field, aggregating multiple responses in the PAYLOAD field as necessary. +As of version 1.6, Swirl can map multiple SearchProvider fields to a single Swirl field, aggregating multiple responses in the PAYLOAD field as necessary. For example: @@ -601,8 +602,8 @@ If only one field, `content` or `description`, are populated for a response, the The following table explains the `result_mappings` options: -| Mapping Format | Meaning | Example | -| ---------- | ---------- | ---------- | +| Mapping Format | Meaning | Example | +| ---------- | ---------- | ---------- | | swirl_key = source_key | This maps a key from the source provider's result list to Swirl's result list. The `source_key` may be a JSON path. | `body=_source.email` | | swirl_key = source_key1\|source_key2\|source_keyN | This maps multiple keys from the source provider's result list to Swirl's result list; as [noted above](#multiple-mappings) the first populated field is mapped and the rest are copied to the PAYLOAD | `body=content\|description,...` | | swirl_key='template {variable} etc' | This allows any number of source provider result fields to be turned into a string that is then copied to a Swirl field (like `body`) or the PAYLOAD. Commas (,) are not supported in the string at this time. | `'{x}: {y}'=title` | @@ -611,7 +612,7 @@ The following table explains the `result_mappings` options: | sw_btcconvert | An optional directive which will convert the provided Satoshi value to Bitcoin; it can be used anyplace in the template such as `result_mappings` | `sw_btcconvert()` | | NO_PAYLOAD | By default, Swirl copies all result keys from the SearchProvider to the PAYLOAD. 
If `NO_PAYLOAD` is specified, Swirl copies only the explicitly mapped fields.| `NO_PAYLOAD` | | FILE_SYSTEM | If specified, Swirl will assume that this SearchProvider is a file system and weight matches against the `body` higher. | `FILE_SYSTEM` | -| LC_URL | If specified, Swirl will convert the `url` field to lower case. | `LC_URL` | +| LC_URL | If specified, Swirl will convert the `url` field to lower case. | `LC_URL` | | BLOCK | As of Release 3.1.0, this feature is used exclusively by Swirl's RAG processing; that output appears in this `info` block of the Result object. | `BLOCK=ai_summary` | #### Date Published Display @@ -635,7 +636,7 @@ The `json_result` schema for each result in the Result list is defined by the `c [Result mixers](Developer-Reference.md#mixers-1) further manipulate and re-organize the data from multiple results. -The Result schema can be seen in [`swirl/models.py`](https://github.com/swirlai/swirl-search/tree/main/swirl/models.py) +The Result schema can be seen in [`swirl/models.py`](https://github.com/swirlai/swirl-search/tree/main/swirl/models.py) ## PAYLOAD Field diff --git a/requirements.txt b/requirements.txt index ef0020813..fbfee0411 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ amqp==5.2.0 annotated-types==0.6.0 -anyio==4.3.0 +anyio==4.2.0 asgiref==3.7.2 +asn1crypto==1.5.1 attrs==23.2.0 autobahn==23.6.2 Automat==22.10.0 @@ -12,7 +13,7 @@ bs4==0.0.2 cachetools==5.3.2 catalogue==2.0.10 celery==5.3.6 -certifi==2024.2.2 +certifi==2023.11.17 cffi==1.16.0 channels==4.0.0 channels-redis==4.2.0 @@ -27,33 +28,35 @@ confection==0.1.4 constantly==23.10.4 coreapi==2.3.3 coreschema==0.0.4 -cryptography==42.0.4 +cryptography==41.0.7 cssselect==1.2.0 cymem==2.0.8 -daphne==4.1.0 +daphne==4.0.0 distro==1.9.0 -Django==5.0.2 +Django==5.0.1 django-celery-beat==2.1.0 django-environ==0.11.2 django-rest-swagger==2.2.0 django-restframework==0.0.1 django-timezone-field==4.2.3 djangorestframework==3.14.0 -dnspython==2.6.1 
+dnspython==2.5.0 docutils==0.20.1 drf-yasg==1.21.7 elastic-transport==8.12.0 elasticsearch==8.12.0 -google-api-core==2.17.1 -google-auth==2.28.1 -google-cloud-bigquery==3.17.2 +en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl#sha256=ab70aeb6172cde82508f7739f35ebc9918a3d07debeed637403c8f794ba3d3dc +filelock==3.13.1 +google-api-core==2.16.1 +google-auth==2.27.0 +google-cloud-bigquery==3.17.1 google-cloud-core==2.4.1 google-crc32c==1.5.0 google-resumable-media==2.7.0 googleapis-common-protos==1.62.0 h11==0.14.0 -httpcore==1.0.4 -httpx==0.27.0 +httpcore==1.0.2 +httpx==0.26.0 hyperlink==21.0.0 idna==3.6 incremental==22.10.0 @@ -65,36 +68,37 @@ jsonpath-ng==1.6.1 kombu==5.3.5 langcodes==3.3.0 lxml==5.1.0 -MarkupSafe==2.1.5 +MarkupSafe==2.1.4 msal==1.26.0 msgpack==1.0.7 murmurhash==1.0.10 natsort==8.4.0 nltk==3.8.1 -numpy==1.26.4 -openai==1.12.0 +numpy==1.26.3 +openai==1.6.1 openapi-codec==1.3.2 opensearch-py==2.4.2 oracledb==2.0.1 packaging==23.2 pika==1.3.2 +platformdirs==3.11.0 ply==3.11 preshed==3.0.9 prompt-toolkit==3.0.43 -protobuf==4.25.3 +protobuf==4.25.2 psycopg2==2.9.9 pyahocorasick==2.0.0 pyasn1==0.5.1 pyasn1-modules==0.3.0 pycparser==2.21 -pydantic==2.6.1 -pydantic_core==2.16.2 +pydantic==2.6.0 +pydantic_core==2.16.1 PyJWT==2.8.0 -pymongo==4.6.2 -pyOpenSSL==24.0.0 +pymongo==4.6.1 +pyOpenSSL==23.3.0 python-crontab==3.0.0 python-dateutil==2.8.2 -pytz==2024.1 +pytz==2023.4 PyYAML==6.0.1 readability-lxml==0.8.1 redis==5.0.1 @@ -102,34 +106,37 @@ regex==2023.12.25 requests==2.31.0 rsa==4.9 service-identity==24.1.0 -setuptools==69.1.0 +setuptools==69.0.3 simplejson==3.19.2 six==1.16.0 smart-open==6.4.0 sniffio==1.3.0 +snowflake-connector-python==3.7.0 +sortedcontainers==2.4.0 soupsieve==2.5 -spacy==3.7.4 +spacy==3.7.2 spacy-legacy==3.0.12 spacy-loggers==1.0.5 sqlparse==0.4.4 srsly==2.4.8 statistics==1.0.3.5 -textblob==0.18.0.post0 -thinc==8.2.3 +textblob==0.17.1 
+thinc==8.2.2 tika==2.6.0 -tiktoken==0.6.0 -tqdm==4.66.2 +tiktoken==0.5.2 +tomlkit==0.12.3 +tqdm==4.66.1 Twisted==23.10.0 txaio==23.1.1 typer==0.9.0 typing_extensions==4.9.0 -tzdata==2024.1 +tzdata==2023.4 uritemplate==4.1.1 -urllib3==2.2.1 +urllib3==2.2.0 vine==5.1.0 wasabi==1.1.2 wcwidth==0.2.13 weasel==0.3.4 whitenoise==6.6.0 xmltodict==0.13.0 -zope.interface==6.2 +zope.interface==6.1 diff --git a/swirl/connectors/__init__.py b/swirl/connectors/__init__.py index 51c367d9e..5d26c5bd9 100644 --- a/swirl/connectors/__init__.py +++ b/swirl/connectors/__init__.py @@ -16,6 +16,7 @@ from swirl.connectors.microsoft_graph import M365SharePointSites from swirl.connectors.microsoft_graph import MicrosoftTeams from swirl.connectors.mongodb import MongoDB +from swirl.connectors.snowflake import Snowflake from swirl.connectors.oracle import Oracle # uncomment the line below to enable PostgreSQL diff --git a/swirl/connectors/db_connector.py b/swirl/connectors/db_connector.py index 98625139b..990ebaeba 100644 --- a/swirl/connectors/db_connector.py +++ b/swirl/connectors/db_connector.py @@ -120,7 +120,7 @@ def validate_query(self, session=None): return False return True - + ######################################## def normalize_response(self): @@ -136,7 +136,7 @@ def normalize_response(self): if not self.response: # assume the connector took care of it return - + rows = self.response trimmed_rows = [] @@ -146,7 +146,7 @@ def normalize_response(self): n_field = 0 if self.column_names: for field in column_names: - # to handle None columns + # to handle None columns e.g. 
Snowflake if row[n_field]: dict_row[field] = row[n_field] else: @@ -168,3 +168,4 @@ def normalize_response(self): self.retrieved = retrieved self.results = trimmed_rows return + diff --git a/swirl/connectors/snowflake.py b/swirl/connectors/snowflake.py new file mode 100644 index 000000000..9515eec55 --- /dev/null +++ b/swirl/connectors/snowflake.py @@ -0,0 +1,92 @@ +''' +@author: Sid Probstein +@contact: sid@swirl.today +''' + +from sys import path +from os import environ + +import snowflake.connector +from snowflake.connector import ProgrammingError + +import json + +import django + +from swirl.utils import swirl_setdir +path.append(swirl_setdir()) # path to settings.py file +environ.setdefault('DJANGO_SETTINGS_MODULE', 'swirl_server.settings') +django.setup() + +from celery.utils.log import get_task_logger +from logging import DEBUG +logger = get_task_logger(__name__) +# logger.setLevel(DEBUG) + +from swirl.connectors.db_connector import DBConnector +from swirl.connectors.utils import bind_query_mappings + +######################################## +######################################## + +class Snowflake(DBConnector): + + type = "Snowflake" + + ######################################## + + def execute_search(self, session=None): + + logger.debug(f"{self}: execute_search()") + + if self.provider.credentials: + if ':' in self.provider.credentials: + credlist = self.provider.credentials.split(':') + if len(credlist) == 4: + username = credlist[0] + password = credlist[1] + database = credlist[2] + warehouse = credlist[3] + else: + self.warning("Invalid credentials, should be: username:password:database:warehouse") + else: + self.warning("No credentials!") + account = self.provider.url + + try: + # Create a new connection + conn = snowflake.connector.connect(user=username, password=password, account=account) + cursor = conn.cursor() + cursor.execute(f"USE WAREHOUSE {warehouse}") + cursor.execute(f"USE DATABASE {database}") + + cursor.execute(self.count_query) + 
count_result = cursor.fetchone() + found = count_result[0] if count_result else 0 + if found == 0: + self.message(f"Retrieved 0 of 0 results from: {self.provider.name}") + self.status = 'READY' + self.found = 0 + self.retrieved = 0 + return + + cursor.execute(self.query_to_provider) + self.column_names = [col[0].lower() for col in cursor.description] + results = cursor.fetchall() + + except ProgrammingError as err: + self.error(f"{err} querying {self.type}") + self.status = 'ERR' + cursor.close() + conn.close() + return + + self.response = list(results) + + cursor.close() + conn.close() + + self.found = found + self.retrieved = self.provider.results_per_query + return + diff --git a/swirl/models.py b/swirl/models.py index 94d756d85..a5576d9b0 100644 --- a/swirl/models.py +++ b/swirl/models.py @@ -82,7 +82,8 @@ class SearchProvider(models.Model): ('M365SharePointSites', 'M365 SharePoint Sites'), ('MicrosoftTeams', 'Microsoft Teams'), ('MongoDB', 'MongoDB'), - ('Oracle','Oracle') + ('Oracle','Oracle'), + ('Snowflake','Snowflake') ] connector = models.CharField(max_length=200, default='RequestsGet', choices=CONNECTOR_CHOICES) url = models.CharField(max_length=2048, default=str, blank=True) From 87abd2ac221a154b5a67dbd5b52d4b233a8c0814 Mon Sep 17 00:00:00 2001 From: dnicodemus-la Date: Tue, 27 Feb 2024 18:28:00 -0500 Subject: [PATCH 13/44] bring it back in line w/ main --- requirements.txt | 65 +++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/requirements.txt b/requirements.txt index fbfee0411..ef0020813 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,7 @@ amqp==5.2.0 annotated-types==0.6.0 -anyio==4.2.0 +anyio==4.3.0 asgiref==3.7.2 -asn1crypto==1.5.1 attrs==23.2.0 autobahn==23.6.2 Automat==22.10.0 @@ -13,7 +12,7 @@ bs4==0.0.2 cachetools==5.3.2 catalogue==2.0.10 celery==5.3.6 -certifi==2023.11.17 +certifi==2024.2.2 cffi==1.16.0 channels==4.0.0 channels-redis==4.2.0 @@ -28,35 +27,33 @@ 
confection==0.1.4 constantly==23.10.4 coreapi==2.3.3 coreschema==0.0.4 -cryptography==41.0.7 +cryptography==42.0.4 cssselect==1.2.0 cymem==2.0.8 -daphne==4.0.0 +daphne==4.1.0 distro==1.9.0 -Django==5.0.1 +Django==5.0.2 django-celery-beat==2.1.0 django-environ==0.11.2 django-rest-swagger==2.2.0 django-restframework==0.0.1 django-timezone-field==4.2.3 djangorestframework==3.14.0 -dnspython==2.5.0 +dnspython==2.6.1 docutils==0.20.1 drf-yasg==1.21.7 elastic-transport==8.12.0 elasticsearch==8.12.0 -en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl#sha256=ab70aeb6172cde82508f7739f35ebc9918a3d07debeed637403c8f794ba3d3dc -filelock==3.13.1 -google-api-core==2.16.1 -google-auth==2.27.0 -google-cloud-bigquery==3.17.1 +google-api-core==2.17.1 +google-auth==2.28.1 +google-cloud-bigquery==3.17.2 google-cloud-core==2.4.1 google-crc32c==1.5.0 google-resumable-media==2.7.0 googleapis-common-protos==1.62.0 h11==0.14.0 -httpcore==1.0.2 -httpx==0.26.0 +httpcore==1.0.4 +httpx==0.27.0 hyperlink==21.0.0 idna==3.6 incremental==22.10.0 @@ -68,37 +65,36 @@ jsonpath-ng==1.6.1 kombu==5.3.5 langcodes==3.3.0 lxml==5.1.0 -MarkupSafe==2.1.4 +MarkupSafe==2.1.5 msal==1.26.0 msgpack==1.0.7 murmurhash==1.0.10 natsort==8.4.0 nltk==3.8.1 -numpy==1.26.3 -openai==1.6.1 +numpy==1.26.4 +openai==1.12.0 openapi-codec==1.3.2 opensearch-py==2.4.2 oracledb==2.0.1 packaging==23.2 pika==1.3.2 -platformdirs==3.11.0 ply==3.11 preshed==3.0.9 prompt-toolkit==3.0.43 -protobuf==4.25.2 +protobuf==4.25.3 psycopg2==2.9.9 pyahocorasick==2.0.0 pyasn1==0.5.1 pyasn1-modules==0.3.0 pycparser==2.21 -pydantic==2.6.0 -pydantic_core==2.16.1 +pydantic==2.6.1 +pydantic_core==2.16.2 PyJWT==2.8.0 -pymongo==4.6.1 -pyOpenSSL==23.3.0 +pymongo==4.6.2 +pyOpenSSL==24.0.0 python-crontab==3.0.0 python-dateutil==2.8.2 -pytz==2023.4 +pytz==2024.1 PyYAML==6.0.1 readability-lxml==0.8.1 redis==5.0.1 @@ -106,37 +102,34 @@ regex==2023.12.25 requests==2.31.0 
rsa==4.9 service-identity==24.1.0 -setuptools==69.0.3 +setuptools==69.1.0 simplejson==3.19.2 six==1.16.0 smart-open==6.4.0 sniffio==1.3.0 -snowflake-connector-python==3.7.0 -sortedcontainers==2.4.0 soupsieve==2.5 -spacy==3.7.2 +spacy==3.7.4 spacy-legacy==3.0.12 spacy-loggers==1.0.5 sqlparse==0.4.4 srsly==2.4.8 statistics==1.0.3.5 -textblob==0.17.1 -thinc==8.2.2 +textblob==0.18.0.post0 +thinc==8.2.3 tika==2.6.0 -tiktoken==0.5.2 -tomlkit==0.12.3 -tqdm==4.66.1 +tiktoken==0.6.0 +tqdm==4.66.2 Twisted==23.10.0 txaio==23.1.1 typer==0.9.0 typing_extensions==4.9.0 -tzdata==2023.4 +tzdata==2024.1 uritemplate==4.1.1 -urllib3==2.2.0 +urllib3==2.2.1 vine==5.1.0 wasabi==1.1.2 wcwidth==0.2.13 weasel==0.3.4 whitenoise==6.6.0 xmltodict==0.13.0 -zope.interface==6.1 +zope.interface==6.2 From e0b45eb7f907be073c717549cfcf93bf736b3208 Mon Sep 17 00:00:00 2001 From: dnicodemus-la Date: Tue, 27 Feb 2024 18:33:47 -0500 Subject: [PATCH 14/44] put back snowflak and requirements --- requirements.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/requirements.txt b/requirements.txt index ef0020813..ac957a379 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ amqp==5.2.0 annotated-types==0.6.0 anyio==4.3.0 asgiref==3.7.2 +asn1crypto==1.5.1 attrs==23.2.0 autobahn==23.6.2 Automat==22.10.0 @@ -44,6 +45,7 @@ docutils==0.20.1 drf-yasg==1.21.7 elastic-transport==8.12.0 elasticsearch==8.12.0 +filelock==3.13.1 google-api-core==2.17.1 google-auth==2.28.1 google-cloud-bigquery==3.17.2 @@ -78,6 +80,7 @@ opensearch-py==2.4.2 oracledb==2.0.1 packaging==23.2 pika==1.3.2 +platformdirs==3.11.0 ply==3.11 preshed==3.0.9 prompt-toolkit==3.0.43 @@ -107,6 +110,8 @@ simplejson==3.19.2 six==1.16.0 smart-open==6.4.0 sniffio==1.3.0 +snowflake-connector-python==3.7.1 +sortedcontainers==2.4.0 soupsieve==2.5 spacy==3.7.4 spacy-legacy==3.0.12 @@ -118,6 +123,7 @@ textblob==0.18.0.post0 thinc==8.2.3 tika==2.6.0 tiktoken==0.6.0 +tomlkit==0.12.4 tqdm==4.66.2 Twisted==23.10.0 txaio==23.1.1 From 
9951a2a83e6e88a3bb01476438d1ccb670f87250 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Tue, 27 Feb 2024 18:42:49 -0500 Subject: [PATCH 15/44] remove extra spaces --- .github/workflows/qa-suite.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/qa-suite.yml b/.github/workflows/qa-suite.yml index 23acc9cda..c9373cac4 100644 --- a/.github/workflows/qa-suite.yml +++ b/.github/workflows/qa-suite.yml @@ -45,6 +45,10 @@ jobs: with: python-version: '3.12.2' cache: 'pip' # caching pip stuff + - name: Install Chromium + run: | + sudo apt-get update + sudo apt-get install -y chromium-browser - name: Install Swirl run: ./install.sh - name: Update apt From 03e1dd463f76146eadbaef1b7ec379218b0d2309 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Wed, 28 Feb 2024 09:03:15 -0500 Subject: [PATCH 16/44] new workflow to target the wip-tests docker image for troubleshooting test scenarios --- .github/workflows/qa-wip.yml | 49 ++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 .github/workflows/qa-wip.yml diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml new file mode 100644 index 000000000..5c39a7dde --- /dev/null +++ b/.github/workflows/qa-wip.yml @@ -0,0 +1,49 @@ +# Workflow for running WIP tests (the "wip-tests" Docker image) in the GitHub runners +name: QA Suite + +on: + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +jobs: + + build: + runs-on: ubuntu-latest + + steps: + - name: Set Up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12.2' + cache: 'pip' # caching pip stuff + - name: Install Chromium + run: | + sudo apt-get update + sudo apt-get install -y chromium-browser + - name: Install Swirl + run: ./install.sh + - name: Update apt + run: sudo apt -o Acquire::Retries=3 update + - name: Upgrade Ubuntu to Latest Patches + run: sudo apt upgrade -y + - name: Install redis-server + run: sudo apt install -y redis-server + - name: Set 
Up Swirl + run: python swirl.py setup + - name: Start Swirl + run: python swirl.py start + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + - name: Run the WIP Tests Suite # Update the "behave --tags=" below to target a set of tagged WIP Scenarios + run: docker run --net=host -t swirlai/swirl-search-qa:wip-tests sh -c "behave --tags=integrated_ui" + - name: Upload Log Files + if: always() + uses: actions/upload-artifact@v4 + with: + name: log-files + path: | + logs/ + /var/log/syslog* From 838c1b552d687a3a37de5fe01d7aad512b5fbfcb Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Wed, 28 Feb 2024 09:07:37 -0500 Subject: [PATCH 17/44] update name value for new workflow...d'oh --- .github/workflows/qa-wip.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index 5c39a7dde..7463705cb 100644 --- a/.github/workflows/qa-wip.yml +++ b/.github/workflows/qa-wip.yml @@ -1,5 +1,5 @@ # Workflow for running WIP tests (the "wip-tests" Docker image) in the GitHub runners -name: QA Suite +name: QA WIP Tests on: # Allows you to run this workflow manually from the Actions tab From fec0dda7d13c86e5a21108aa261d2876861eb77c Mon Sep 17 00:00:00 2001 From: dnicodemus-la Date: Wed, 28 Feb 2024 10:38:50 -0500 Subject: [PATCH 18/44] version of this for main --- .github/workflows/docker-image-sa.yml | 55 +++++++++++++++++++++++++++ DevUtils/docker/Dockerfile.sa | 25 ++++++++++++ DevUtils/docker/sw-start-sa.sh | 18 +++++++++ 3 files changed, 98 insertions(+) create mode 100644 .github/workflows/docker-image-sa.yml create mode 100644 DevUtils/docker/Dockerfile.sa create mode 100644 DevUtils/docker/sw-start-sa.sh diff --git a/.github/workflows/docker-image-sa.yml b/.github/workflows/docker-image-sa.yml new file mode 100644 index 000000000..5f18382ed --- /dev/null +++ b/.github/workflows/docker-image-sa.yml @@ -0,0 
+1,55 @@ +name: SA DockerBuild + +on: + # Allows manual run of this workflow from the Actions tab (on any branch) + workflow_dispatch: + +# For debugging +# on: +# push: +# branches: '' + +jobs: + + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout the code + uses: actions/checkout@v4 + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + - name: Docker Diagnostics + run: | + docker system df + - name: Host System Diagnostics + run: | + df -h + du -sh * + - name: Docker Cleanup + run: | + docker system prune -af + docker volume prune -f + docker builder prune -f + - name: Builder Bootstrap + run: docker buildx create --name devBuilder --use --bootstrap + - name: Build the Docker Image + run: docker buildx build -f DevUtils/docker/Dockerfile.sa --platform linux/amd64,linux/arm64 --tag swirlai/swirl-search:latest-sa --push . + + - name: Update repo description + uses: peter-evans/dockerhub-description@v4 + with: + username: ${{ secrets.DOCKER_USERNAME_X }} + password: ${{ secrets.DOCKER_PASSWORD_X }} + repository: swirlai/swirl-search + - name: Upload log files + if: always() + uses: actions/upload-artifact@v4 + with: + name: log-files + path: | + logs/ + /var/log/syslog* diff --git a/DevUtils/docker/Dockerfile.sa b/DevUtils/docker/Dockerfile.sa new file mode 100644 index 000000000..0a972759f --- /dev/null +++ b/DevUtils/docker/Dockerfile.sa @@ -0,0 +1,25 @@ +# Start from the base image that can run your app +FROM swirlai/swirl-search:latest + +# Install Redis +RUN apt-get update && apt-get install -y redis-server + + +# Copy a custom script that will start both Redis and your app +COPY DevUtils/docker/sw-start-sa.sh /sw-start-sa.sh +RUN chmod +x /sw-start-sa.sh + +# Set environment variables (if they are static) +# If they need to be dynamic, you can pass them at runtime + +ENV CELERY_BROKER_URL redis://localhost:6379/0 +ENV CELERY_RESULT_BACKEND 
redis://localhost:6379/0 +ENV OPENAI_API_KEY your_openai_api_key +ENV MSAL_CB_PORT 8000 +ENV MSAL_HOST localhost + +# Expose the port your app runs on +EXPOSE 8000 + +# Start script.di +CMD ["/sw-start-sa.sh"] diff --git a/DevUtils/docker/sw-start-sa.sh b/DevUtils/docker/sw-start-sa.sh new file mode 100644 index 000000000..2cd160e9d --- /dev/null +++ b/DevUtils/docker/sw-start-sa.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Start Redis in the background +redis-server & + +./install.sh + +# Your original command to setup and start the application +rm -fr ./.swirl && python swirl.py setup && mkdir -p static/api/config && +/usr/bin/jq ".default" ./config-swirl-demo.db.json | sed -e "s//$MSAL_APP_ID/" \ + -e "s//$MSAL_TENANT_ID/" \ + -e "s//$MSAL_CB_PORT/" \ + -e "s//$MSAL_HOST/" > static/api/config/default && +python swirl.py start celery-worker celery-beats && +daphne -b 0.0.0.0 -p 8000 swirl_server.asgi:application + +# Keep the container running (if needed) +wait From cfeeb163d11f1c8c98d7b476797ca0d0362d73d9 Mon Sep 17 00:00:00 2001 From: dnicodemus-la Date: Wed, 28 Feb 2024 11:31:40 -0500 Subject: [PATCH 19/44] remove site doc update --- .github/workflows/docker-image-sa.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/docker-image-sa.yml b/.github/workflows/docker-image-sa.yml index 5f18382ed..5629fb02a 100644 --- a/.github/workflows/docker-image-sa.yml +++ b/.github/workflows/docker-image-sa.yml @@ -1,4 +1,4 @@ -name: SA DockerBuild +name: SA DockerBuild on: # Allows manual run of this workflow from the Actions tab (on any branch) @@ -39,12 +39,6 @@ jobs: - name: Build the Docker Image run: docker buildx build -f DevUtils/docker/Dockerfile.sa --platform linux/amd64,linux/arm64 --tag swirlai/swirl-search:latest-sa --push . 
- - name: Update repo description - uses: peter-evans/dockerhub-description@v4 - with: - username: ${{ secrets.DOCKER_USERNAME_X }} - password: ${{ secrets.DOCKER_PASSWORD_X }} - repository: swirlai/swirl-search - name: Upload log files if: always() uses: actions/upload-artifact@v4 From ba77949bb1932f9be8c254f51c5f49d37d3a3a8f Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Wed, 28 Feb 2024 13:12:20 -0500 Subject: [PATCH 20/44] fix testing badge on main branch Readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 47522fef8..27e8f19a4 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg?color=088395&logoColor=blue&style=flat-square)](https://opensource.org/license/apache-2-0/) [![GitHub Release](https://img.shields.io/github/v/release/swirlai/swirl-search?style=flat-square&color=8DDFCB&label=Release)](https://github.com/swirlai/swirl-search/releases) [![Docker Build](https://github.com/swirlai/swirl-search/actions/workflows/docker-image.yml/badge.svg?style=flat-square&branch=main)](https://github.com/swirlai/swirl-search/actions/workflows/docker-image.yml) -[![Tests](https://github.com/swirlai/swirl-search/actions/workflows/smoke-tests.yml/badge.svg?branch=main)](https://github.com/swirlai/swirl-search/actions/workflows/smoke-tests.yml) +[![Tests](https://github.com/swirlai/swirl-search/actions/workflows/qa-suite.yml/badge.svg?branch=main)](https://github.com/swirlai/swirl-search/actions/workflows/qa-suite.yml) [![Static Badge](https://img.shields.io/badge/Join%20Our%20Slack-0E21A0?logo=slack)](https://join.slack.com/t/swirlmetasearch/shared_invite/zt-1qk7q02eo-kpqFAbiZJGOdqgYVvR1sfw) [![Website](https://img.shields.io/badge/Swirl.Today-241468)](https://www.swirl.today) From 1c5d0785fcb9f03c3c8c935755c2a80f4c67ff14 Mon Sep 17 00:00:00 2001 From: dnicodemus-la Date: Wed, 28 Feb 2024 
14:03:45 -0500 Subject: [PATCH 21/44] shell scrip to generate command and add client id to docker --- DevUtils/azure/gen_sw_aci.sh | 40 +++++++++++++++++++++++++++++++++++ DevUtils/docker/Dockerfile.sa | 1 + 2 files changed, 41 insertions(+) create mode 100755 DevUtils/azure/gen_sw_aci.sh diff --git a/DevUtils/azure/gen_sw_aci.sh b/DevUtils/azure/gen_sw_aci.sh new file mode 100755 index 000000000..65e5d8d1f --- /dev/null +++ b/DevUtils/azure/gen_sw_aci.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# +# Usage: +# gen_sw_az_deploy.sh +# +# Generate command to install swirl in azure containery +# + +PROG=`basename $0` + +port=8000 +sizemg=8 + +read -p "$PROG Enter location (e.g. eastus): " location +read -p "$PROG Enter 'o' for open souce or 'e' for enterprise: " dtype +read -p "$PROG Enter application name: " dname +read -p "$PROG Enter your OpenAI Key ID: " openai_key +read -p "Enter your Azure Registered App ID: " app_id + +if [ "$dtype" = "e" ]; then + image="swirlai/swirl-search-internal:develop-sa" +elif [ "$dtype" = "o" ]; then + image="swirlai/swirl-search:develop-sa" +else + echo $PROG "err unknown deployment type $dtype" + exit 1 +fi + +location=${location:-eastus} + +echo az login +echo az group create --name sw$dtype-$dname-rg --location $location +echo az container create \ + --resource-group sw$dtype-$dname-rg \ + --name sw$dtype-$dname \ + --dns-name-label sw$dtype-$dname \ + --ports $port \ + --image $image \ + --environment-variables MSAL_APP_ID=$app_id MSAL_CB_PORT=$port MSAL_HOST=sw$dtype-$dname.$location.azurecontainer.io ALLOWED_HOSTS=sw$dtype-$dname.$location.azurecontainer.io OPENAI_API_KEY=$openai_key \ + --memory $sizemg diff --git a/DevUtils/docker/Dockerfile.sa b/DevUtils/docker/Dockerfile.sa index 0a972759f..c1460a66e 100644 --- a/DevUtils/docker/Dockerfile.sa +++ b/DevUtils/docker/Dockerfile.sa @@ -17,6 +17,7 @@ ENV CELERY_RESULT_BACKEND redis://localhost:6379/0 ENV OPENAI_API_KEY your_openai_api_key ENV MSAL_CB_PORT 8000 ENV MSAL_HOST localhost +ENV 
MSAL_APP_ID '' # Expose the port your app runs on EXPOSE 8000 From 90500e7f9f481da6f1df281459d963071ea08248 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Wed, 28 Feb 2024 15:28:35 -0500 Subject: [PATCH 22/44] make the behave tags a user input in wip-tests.yml --- .github/workflows/qa-wip.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index 7463705cb..6850c6f14 100644 --- a/.github/workflows/qa-wip.yml +++ b/.github/workflows/qa-wip.yml @@ -2,8 +2,13 @@ name: QA WIP Tests on: - # Allows you to run this workflow manually from the Actions tab + # Allows you to run this workflow manually from the Actions tab and input the behave tag(s) to run workflow_dispatch: + inputs: + behave_tags: + description: 'Behave tags to run' + required: true + default: 'estest' # Default tag if none specified jobs: @@ -37,8 +42,8 @@ jobs: with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - name: Run the WIP Tests Suite # Update the "behave --tags=" below to target a set of tagged WIP Scenarios - run: docker run --net=host -t swirlai/swirl-search-qa:wip-tests sh -c "behave --tags=integrated_ui" + - name: Run the WIP Tests Suite + run: docker run --net=host -t swirlai/swirl-search-qa:wip-tests sh -c "behave --tags=${{ github.event.inputs.behave_tags }}" - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 From 5bca309d6e20ec4cfe7d4b4938c7bbf68a194c43 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Wed, 28 Feb 2024 15:33:50 -0500 Subject: [PATCH 23/44] restore missing step to qa-wip.yml file --- .github/workflows/qa-wip.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index 6850c6f14..94f514e2c 100644 --- a/.github/workflows/qa-wip.yml +++ b/.github/workflows/qa-wip.yml @@ -16,6 +16,8 @@ jobs: runs-on: ubuntu-latest steps: + - name: Checkout the Code + uses: 
actions/checkout@v4 - name: Set Up Python uses: actions/setup-python@v5 with: @@ -47,6 +49,7 @@ jobs: - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 + with: name: log-files path: | From d7b705404b8d054b5d467d396dcfccb9236c593c Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Wed, 28 Feb 2024 15:34:33 -0500 Subject: [PATCH 24/44] remove extra newline --- .github/workflows/qa-wip.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index 94f514e2c..d00eeb280 100644 --- a/.github/workflows/qa-wip.yml +++ b/.github/workflows/qa-wip.yml @@ -49,7 +49,6 @@ jobs: - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 - with: name: log-files path: | From 8746494ae1b2baa3446f9ce8d79e3accbb980ed2 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Wed, 28 Feb 2024 16:10:25 -0500 Subject: [PATCH 25/44] update log file output of qa-wip.yml to include screen grabs from test runs --- .github/workflows/qa-wip.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index d00eeb280..d722247eb 100644 --- a/.github/workflows/qa-wip.yml +++ b/.github/workflows/qa-wip.yml @@ -46,11 +46,12 @@ jobs: password: ${{ secrets.DOCKER_PASSWORD }} - name: Run the WIP Tests Suite run: docker run --net=host -t swirlai/swirl-search-qa:wip-tests sh -c "behave --tags=${{ github.event.inputs.behave_tags }}" - - name: Upload Log Files + - name: Upload Log Files and Screenshots if: always() uses: actions/upload-artifact@v4 with: - name: log-files + name: test-artifacts path: | logs/ /var/log/syslog* + *.png From 72681a387d5be8627fe2c5d94929fd62499be9a2 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Thu, 29 Feb 2024 09:26:28 -0500 Subject: [PATCH 26/44] update artifact upload syntax to support new screenshots/ dir --- .github/workflows/qa-wip.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index d722247eb..bd21bfffa 100644 --- a/.github/workflows/qa-wip.yml +++ b/.github/workflows/qa-wip.yml @@ -54,4 +54,5 @@ jobs: path: | logs/ /var/log/syslog* - *.png + screenshots/*.png + \ No newline at end of file From 00884f1e4f17597960197a06f35dfc8a25f5c4ba Mon Sep 17 00:00:00 2001 From: dnicodemus-la Date: Thu, 29 Feb 2024 10:33:14 -0500 Subject: [PATCH 27/44] update to new x branch for Dmitry's version --- Dockerfile.fork.spg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.fork.spg b/Dockerfile.fork.spg index a16044541..89c151f4a 100644 --- a/Dockerfile.fork.spg +++ b/Dockerfile.fork.spg @@ -3,7 +3,7 @@ FROM node:14.17.5 AS builder RUN git clone https://bitbucket.org/swirl-spyglass/spyglass.git /usr/src/spyglass WORKDIR /usr/src/spyglass/ui -RUN git checkout chart-feature +RUN git checkout chart-feature-edited RUN git pull RUN npm install -g npm@7.21.1 RUN npm install From 7df77555aa58e7e0c44a6670030d11339fe4fb81 Mon Sep 17 00:00:00 2001 From: dnicodemus-la Date: Thu, 29 Feb 2024 14:39:28 -0500 Subject: [PATCH 28/44] Support for EZ AZ for main branch --- DevUtils/azure/gen_sw_aci.sh | 40 ---------------------------------- DevUtils/docker/Dockerfile.sa | 25 ++++++++++++++------- DevUtils/docker/sw-start-sa.sh | 12 ++++++++-- DevUtils/nginx/nginx.conf | 28 ++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 50 deletions(-) delete mode 100755 DevUtils/azure/gen_sw_aci.sh create mode 100644 DevUtils/nginx/nginx.conf diff --git a/DevUtils/azure/gen_sw_aci.sh b/DevUtils/azure/gen_sw_aci.sh deleted file mode 100755 index 65e5d8d1f..000000000 --- a/DevUtils/azure/gen_sw_aci.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# -# Usage: -# gen_sw_az_deploy.sh -# -# Generate command to install swirl in azure containery -# - -PROG=`basename $0` - -port=8000 -sizemg=8 - -read -p "$PROG Enter location (e.g. 
eastus): " location -read -p "$PROG Enter 'o' for open souce or 'e' for enterprise: " dtype -read -p "$PROG Enter application name: " dname -read -p "$PROG Enter your OpenAI Key ID: " openai_key -read -p "Enter your Azure Registered App ID: " app_id - -if [ "$dtype" = "e" ]; then - image="swirlai/swirl-search-internal:develop-sa" -elif [ "$dtype" = "o" ]; then - image="swirlai/swirl-search:develop-sa" -else - echo $PROG "err unknown deployment type $dtype" - exit 1 -fi - -location=${location:-eastus} - -echo az login -echo az group create --name sw$dtype-$dname-rg --location $location -echo az container create \ - --resource-group sw$dtype-$dname-rg \ - --name sw$dtype-$dname \ - --dns-name-label sw$dtype-$dname \ - --ports $port \ - --image $image \ - --environment-variables MSAL_APP_ID=$app_id MSAL_CB_PORT=$port MSAL_HOST=sw$dtype-$dname.$location.azurecontainer.io ALLOWED_HOSTS=sw$dtype-$dname.$location.azurecontainer.io OPENAI_API_KEY=$openai_key \ - --memory $sizemg diff --git a/DevUtils/docker/Dockerfile.sa b/DevUtils/docker/Dockerfile.sa index c1460a66e..6159c950a 100644 --- a/DevUtils/docker/Dockerfile.sa +++ b/DevUtils/docker/Dockerfile.sa @@ -1,17 +1,26 @@ # Start from the base image that can run your app FROM swirlai/swirl-search:latest -# Install Redis -RUN apt-get update && apt-get install -y redis-server +# Install Nginx and Redis +RUN apt-get update && \ + apt-get install -y nginx redis-server openssl && \ + rm -rf /var/lib/apt/lists/* +# Generate a self-signed SSL certificate (For production, use a certificate from a trusted CA) +RUN mkdir -p /etc/nginx/ssl && \ + openssl req -x509 -nodes -days 365 -newkey rsa:2048 \ + -keyout /etc/nginx/ssl/nginx.key -out /etc/nginx/ssl/nginx.crt \ + -subj "/C=US/ST=Denial/L=Springfield/O=Dis/CN=localhost" -# Copy a custom script that will start both Redis and your app +# Copy the Nginx configuration +COPY DevUtils/nginx/nginx.conf /etc/nginx/sites-available/default + +# Copy a custom script that will start Nginx, 
Redis, and your app COPY DevUtils/docker/sw-start-sa.sh /sw-start-sa.sh RUN chmod +x /sw-start-sa.sh +COPY install.sh /install.sh # Set environment variables (if they are static) -# If they need to be dynamic, you can pass them at runtime - ENV CELERY_BROKER_URL redis://localhost:6379/0 ENV CELERY_RESULT_BACKEND redis://localhost:6379/0 ENV OPENAI_API_KEY your_openai_api_key @@ -19,8 +28,8 @@ ENV MSAL_CB_PORT 8000 ENV MSAL_HOST localhost ENV MSAL_APP_ID '' -# Expose the port your app runs on -EXPOSE 8000 +# Expose ports for HTTP and HTTPS +EXPOSE 80 443 -# Start script.di +# Start script CMD ["/sw-start-sa.sh"] diff --git a/DevUtils/docker/sw-start-sa.sh b/DevUtils/docker/sw-start-sa.sh index 2cd160e9d..9df1558dd 100644 --- a/DevUtils/docker/sw-start-sa.sh +++ b/DevUtils/docker/sw-start-sa.sh @@ -3,14 +3,22 @@ # Start Redis in the background redis-server & -./install.sh +service nginx start + +echo "$PROG Copying: .env.dist -> .env" +cp .env.dist .env + +echo "$PROG Copying: db.sqlite3.dist -> db.sqlite3" +cp db.sqlite3.dist db.sqlite3 # Your original command to setup and start the application rm -fr ./.swirl && python swirl.py setup && mkdir -p static/api/config && /usr/bin/jq ".default" ./config-swirl-demo.db.json | sed -e "s//$MSAL_APP_ID/" \ -e "s//$MSAL_TENANT_ID/" \ + -e "s/http:\/\//https:\/\//" \ -e "s//$MSAL_CB_PORT/" \ - -e "s//$MSAL_HOST/" > static/api/config/default && + -e "s//$MSAL_HOST/" \ + -e "s/ws:/wss:/" > static/api/config/default && python swirl.py start celery-worker celery-beats && daphne -b 0.0.0.0 -p 8000 swirl_server.asgi:application diff --git a/DevUtils/nginx/nginx.conf b/DevUtils/nginx/nginx.conf new file mode 100644 index 000000000..ebd5dc9c4 --- /dev/null +++ b/DevUtils/nginx/nginx.conf @@ -0,0 +1,28 @@ +server { + listen 80; + server_name localhost; + return 301 https://$host$request_uri; +} + +server { + listen 443 ssl; + server_name localhost; + + ssl_certificate /etc/nginx/ssl/nginx.crt; + ssl_certificate_key 
/etc/nginx/ssl/nginx.key; + + location / { + proxy_pass http://localhost:8000; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # WebSocket specific + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + } +} + + From 7e42879643881e3ed7cd98e990970d9fdc7a0cdf Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Thu, 29 Feb 2024 15:48:39 -0500 Subject: [PATCH 29/44] sequence the 2 docker-related workflow files --- .github/workflows/docker-image-sa.yml | 32 ++++++++++++++++++++++++++- .github/workflows/docker-image.yml | 18 +++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-image-sa.yml b/.github/workflows/docker-image-sa.yml index 5629fb02a..da0b55e11 100644 --- a/.github/workflows/docker-image-sa.yml +++ b/.github/workflows/docker-image-sa.yml @@ -1,7 +1,11 @@ name: SA DockerBuild on: - # Allows manual run of this workflow from the Actions tab (on any branch) + workflow_run: + workflows: [Docker Build] + types: + - completed + # Allows you to run this workflow manually from the Actions tab workflow_dispatch: # For debugging @@ -12,9 +16,35 @@ on: jobs: build: + if: (github.event_name == 'workflow_dispatch') || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') runs-on: ubuntu-latest steps: + - name: Download Branch and run_id Artifacts + uses: dawidd6/action-download-artifact@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + workflow: docker-image.yml + name: branch-info-${{ github.event_name == 'workflow_run' && github.event.workflow_run.id || github.run_id }} + path: ./artifacts + continue-on-error: true # Allow the step to fail without stopping the workflow + - name: Determine Branch for Checkout + id: determine_branch + run: | + if [[ -f ./artifacts/branch.txt && -f 
./artifacts/run_id.txt ]]; then + echo "branch=$(cat ./artifacts/branch.txt)" >> $GITHUB_ENV + echo "original_run_id=$(cat ./artifacts/run_id.txt)" >> $GITHUB_ENV + else + BRANCH_NAME=$(echo $GITHUB_REF | cut -d "/" -f 3) + echo "branch=$BRANCH_NAME" >> $GITHUB_ENV + fi + - name: Print Branch to be Checked Out + run: | + echo "Branch to checkout: ${{ env.branch }}" + - name: Checkout the Code + uses: actions/checkout@v4 + with: + ref: ${{ env.branch }} - name: Checkout the code uses: actions/checkout@v4 - name: Login to Docker Hub diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 82cf6e6dc..b037ef1a1 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -44,6 +44,24 @@ jobs: username: ${{ secrets.DOCKER_USERNAME_X }} password: ${{ secrets.DOCKER_PASSWORD_X }} repository: swirlai/swirl-search + - name: Create Artifacts Directory + run: mkdir -p artifacts + - name: Set Branch Name + id: extract_branch + run: | + BRANCH_NAME=$(echo $GITHUB_REF | cut -d "/" -f 3) + echo "branch=$BRANCH_NAME" >> $GITHUB_ENV + - name: Write Branch and run_id to File + run: | + echo "${{ env.branch }}" > ./artifacts/branch.txt + echo "${{ github.run_id }}" > ./artifacts/run_id.txt + - name: Upload Branch and run_id Files as Artifacts + uses: actions/upload-artifact@v4 + with: + name: branch-info-${{ github.run_id }} + path: | + ./artifacts/branch.txt + ./artifacts/run_id.txt - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 From 4cb1604b65d163145ed1e8d383c03f420554b096 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Thu, 29 Feb 2024 17:26:09 -0500 Subject: [PATCH 30/44] update qa workflows to include ChromeDriver install --- .github/workflows/qa-suite.yml | 8 ++++++++ .github/workflows/qa-wip.yml | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/.github/workflows/qa-suite.yml b/.github/workflows/qa-suite.yml index c9373cac4..af663878e 100644 --- a/.github/workflows/qa-suite.yml 
+++ b/.github/workflows/qa-suite.yml @@ -49,6 +49,14 @@ jobs: run: | sudo apt-get update sudo apt-get install -y chromium-browser + - name: Install ChromeDriver + run: | + CHROMIUM_VERSION=$(chromium-browser --version | grep -oE "[0-9\.]{2,}" | cut -d'.' -f1-3) + CHROMEDRIVER_VERSION=$(curl -s "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_$CHROMIUM_VERSION") + curl -L -O "https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip" + unzip chromedriver_linux64.zip + sudo mv chromedriver /usr/bin/chromedriver + sudo chmod +x /usr/bin/chromedriver - name: Install Swirl run: ./install.sh - name: Update apt diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index bd21bfffa..f8ae3d407 100644 --- a/.github/workflows/qa-wip.yml +++ b/.github/workflows/qa-wip.yml @@ -27,6 +27,14 @@ jobs: run: | sudo apt-get update sudo apt-get install -y chromium-browser + - name: Install ChromeDriver + run: | + CHROMIUM_VERSION=$(chromium-browser --version | grep -oE "[0-9\.]{2,}" | cut -d'.' 
-f1-3) + CHROMEDRIVER_VERSION=$(curl -s "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_$CHROMIUM_VERSION") + curl -L -O "https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip" + unzip chromedriver_linux64.zip + sudo mv chromedriver /usr/bin/chromedriver + sudo chmod +x /usr/bin/chromedriver - name: Install Swirl run: ./install.sh - name: Update apt From e97a19218344784de85594e04832c18c85221bc4 Mon Sep 17 00:00:00 2001 From: Erik Spears <98238295+erikspears@users.noreply.github.com> Date: Thu, 29 Feb 2024 17:46:12 -0500 Subject: [PATCH 31/44] Revert "update qa workflows to include ChromeDriver install" --- .github/workflows/qa-suite.yml | 8 -------- .github/workflows/qa-wip.yml | 8 -------- 2 files changed, 16 deletions(-) diff --git a/.github/workflows/qa-suite.yml b/.github/workflows/qa-suite.yml index af663878e..c9373cac4 100644 --- a/.github/workflows/qa-suite.yml +++ b/.github/workflows/qa-suite.yml @@ -49,14 +49,6 @@ jobs: run: | sudo apt-get update sudo apt-get install -y chromium-browser - - name: Install ChromeDriver - run: | - CHROMIUM_VERSION=$(chromium-browser --version | grep -oE "[0-9\.]{2,}" | cut -d'.' -f1-3) - CHROMEDRIVER_VERSION=$(curl -s "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_$CHROMIUM_VERSION") - curl -L -O "https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip" - unzip chromedriver_linux64.zip - sudo mv chromedriver /usr/bin/chromedriver - sudo chmod +x /usr/bin/chromedriver - name: Install Swirl run: ./install.sh - name: Update apt diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index f8ae3d407..bd21bfffa 100644 --- a/.github/workflows/qa-wip.yml +++ b/.github/workflows/qa-wip.yml @@ -27,14 +27,6 @@ jobs: run: | sudo apt-get update sudo apt-get install -y chromium-browser - - name: Install ChromeDriver - run: | - CHROMIUM_VERSION=$(chromium-browser --version | grep -oE "[0-9\.]{2,}" | cut -d'.' 
-f1-3) - CHROMEDRIVER_VERSION=$(curl -s "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_$CHROMIUM_VERSION") - curl -L -O "https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip" - unzip chromedriver_linux64.zip - sudo mv chromedriver /usr/bin/chromedriver - sudo chmod +x /usr/bin/chromedriver - name: Install Swirl run: ./install.sh - name: Update apt From 7bcc481d8af161785006e5c1cdf4a67852a9a4ed Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Thu, 29 Feb 2024 19:46:09 -0500 Subject: [PATCH 32/44] trying new action for installing chrome --- .github/workflows/qa-wip.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index bd21bfffa..1b3235fe3 100644 --- a/.github/workflows/qa-wip.yml +++ b/.github/workflows/qa-wip.yml @@ -23,10 +23,10 @@ jobs: with: python-version: '3.12.2' cache: 'pip' # caching pip stuff - - name: Install Chromium - run: | - sudo apt-get update - sudo apt-get install -y chromium-browser + - name: Install Chrome + uses: browser-actions/setup-chrome@v1 + - name: Print the Chrome Version + run: chrome --version - name: Install Swirl run: ./install.sh - name: Update apt From 62c042d27ca25798952862e1526193630a0b50a5 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Thu, 29 Feb 2024 20:09:34 -0500 Subject: [PATCH 33/44] more chrom(e)ium setup stuff --- .github/workflows/qa-wip.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index 1b3235fe3..de0cc9110 100644 --- a/.github/workflows/qa-wip.yml +++ b/.github/workflows/qa-wip.yml @@ -25,8 +25,11 @@ jobs: cache: 'pip' # caching pip stuff - name: Install Chrome uses: browser-actions/setup-chrome@v1 - - name: Print the Chrome Version - run: chrome --version + - name: Print the Installed Version and Path + run: | + chrome --version + chromium --version + ${{ 
steps.setup-chrome.outputs.chrome-path }} --version - name: Install Swirl run: ./install.sh - name: Update apt From bbc89df5d33d1b46ef7dac2e8844b30fbbe5e067 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Thu, 29 Feb 2024 20:11:34 -0500 Subject: [PATCH 34/44] fix syntax error --- .github/workflows/qa-wip.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index de0cc9110..7d562f925 100644 --- a/.github/workflows/qa-wip.yml +++ b/.github/workflows/qa-wip.yml @@ -29,7 +29,7 @@ jobs: run: | chrome --version chromium --version - ${{ steps.setup-chrome.outputs.chrome-path }} --version + ${{ setup-chrome.outputs.chrome-path }} --version - name: Install Swirl run: ./install.sh - name: Update apt From eb7cd2fec8396719591cb7905e0e713e48395e86 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Thu, 29 Feb 2024 20:13:04 -0500 Subject: [PATCH 35/44] remove syntax error --- .github/workflows/qa-wip.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index 7d562f925..5394cdec7 100644 --- a/.github/workflows/qa-wip.yml +++ b/.github/workflows/qa-wip.yml @@ -25,11 +25,10 @@ jobs: cache: 'pip' # caching pip stuff - name: Install Chrome uses: browser-actions/setup-chrome@v1 - - name: Print the Installed Version and Path + - name: Print the Installed Version run: | chrome --version chromium --version - ${{ setup-chrome.outputs.chrome-path }} --version - name: Install Swirl run: ./install.sh - name: Update apt From 7a9ecb32ace34b2f9105ce11fb61820413d73545 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Fri, 1 Mar 2024 09:07:15 -0500 Subject: [PATCH 36/44] update new chromium install action step --- .github/workflows/qa-wip.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index 5394cdec7..4d3253ad9 100644 --- a/.github/workflows/qa-wip.yml +++ 
b/.github/workflows/qa-wip.yml @@ -23,12 +23,8 @@ jobs: with: python-version: '3.12.2' cache: 'pip' # caching pip stuff - - name: Install Chrome + - name: Install Chromium uses: browser-actions/setup-chrome@v1 - - name: Print the Installed Version - run: | - chrome --version - chromium --version - name: Install Swirl run: ./install.sh - name: Update apt From b52cfcea44ee14fccb40786af0d0caa3b440cf61 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Fri, 1 Mar 2024 12:10:38 -0500 Subject: [PATCH 37/44] clean-up chromium install in qa workflows on main --- .github/workflows/qa-suite.yml | 6 ++++-- .github/workflows/qa-wip.yml | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/qa-suite.yml b/.github/workflows/qa-suite.yml index c9373cac4..6d60bf6f6 100644 --- a/.github/workflows/qa-suite.yml +++ b/.github/workflows/qa-suite.yml @@ -46,9 +46,11 @@ jobs: python-version: '3.12.2' cache: 'pip' # caching pip stuff - name: Install Chromium + uses: browser-actions/setup-chrome@v1 + - name: Chromium Install Details run: | - sudo apt-get update - sudo apt-get install -y chromium-browser + chromium --version + which chromium - name: Install Swirl run: ./install.sh - name: Update apt diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index 4d3253ad9..caa964bc9 100644 --- a/.github/workflows/qa-wip.yml +++ b/.github/workflows/qa-wip.yml @@ -25,6 +25,10 @@ jobs: cache: 'pip' # caching pip stuff - name: Install Chromium uses: browser-actions/setup-chrome@v1 + - name: Chromium Install Details + run: | + chromium --version + which chromium - name: Install Swirl run: ./install.sh - name: Update apt From e2af2f4274ff86be1626c7f3b225969525507646 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Sat, 2 Mar 2024 17:13:32 -0500 Subject: [PATCH 38/44] revert dockfile changes; remove scipts/ dir copy --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 91b667443..ab55c651f 100755 --- 
a/Dockerfile +++ b/Dockerfile @@ -46,7 +46,6 @@ COPY --from=swirlai/spyglass:latest /usr/src/spyglass/ui/config-swirl-demo.db.js ADD ./swirl_server /app/swirl_server ADD ./SearchProviders /app/SearchProviders -ADD ./scripts /app/scripts ADD ./Data /app/Data ADD ./swirl.py /app/swirl.py ADD ./swirl_load.py /app/swirl_load.py From 68f2b026d09506832f9d350e369ffbed3fcca153 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Sat, 2 Mar 2024 17:47:25 -0500 Subject: [PATCH 39/44] restoring Dockerfile to its orginal state; adding csrf to .env.docker --- .env.docker | 1 + Dockerfile | 1 + 2 files changed, 2 insertions(+) diff --git a/.env.docker b/.env.docker index 341154cd0..d943d34c2 100644 --- a/.env.docker +++ b/.env.docker @@ -13,3 +13,4 @@ MICROSOFT_CLIENT_SECRET='' MICROSOFT_REDIRECT_URI='http://localhost:8000/swirl/microsoft-callback' CELERY_BROKER_URL='redis://redis:6379/0' CELERY_RESULT_BACKEND='redis://redis:6379/0' +CSRF_TRUSTED_ORIGINS='http://localhost:8000' diff --git a/Dockerfile b/Dockerfile index ab55c651f..91b667443 100755 --- a/Dockerfile +++ b/Dockerfile @@ -46,6 +46,7 @@ COPY --from=swirlai/spyglass:latest /usr/src/spyglass/ui/config-swirl-demo.db.js ADD ./swirl_server /app/swirl_server ADD ./SearchProviders /app/SearchProviders +ADD ./scripts /app/scripts ADD ./Data /app/Data ADD ./swirl.py /app/swirl.py ADD ./swirl_load.py /app/swirl_load.py From 5e72354edef22136d0336612c8868e984b3bff70 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Sun, 3 Mar 2024 13:02:21 -0500 Subject: [PATCH 40/44] remove duplicated code checkout from main version --- .github/workflows/docker-image-sa.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/docker-image-sa.yml b/.github/workflows/docker-image-sa.yml index da0b55e11..14c3e28c1 100644 --- a/.github/workflows/docker-image-sa.yml +++ b/.github/workflows/docker-image-sa.yml @@ -45,8 +45,6 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ env.branch }} - - name: Checkout the code - uses: 
actions/checkout@v4 - name: Login to Docker Hub uses: docker/login-action@v3 with: From 3a954fc68c64f30de92c07a795d3ed425d2b2031 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Sun, 3 Mar 2024 13:41:19 -0500 Subject: [PATCH 41/44] remove old troubleshooting bits from main version --- .github/workflows/qa-wip.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/qa-wip.yml b/.github/workflows/qa-wip.yml index caa964bc9..27211970b 100644 --- a/.github/workflows/qa-wip.yml +++ b/.github/workflows/qa-wip.yml @@ -48,7 +48,7 @@ jobs: password: ${{ secrets.DOCKER_PASSWORD }} - name: Run the WIP Tests Suite run: docker run --net=host -t swirlai/swirl-search-qa:wip-tests sh -c "behave --tags=${{ github.event.inputs.behave_tags }}" - - name: Upload Log Files and Screenshots + - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 with: @@ -56,5 +56,4 @@ jobs: path: | logs/ /var/log/syslog* - screenshots/*.png \ No newline at end of file From a050750d08443e68866656cc461b5fdd32cf6a90 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Sun, 3 Mar 2024 13:47:24 -0500 Subject: [PATCH 42/44] remove extra line --- .github/workflows/docker-image-sa.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/docker-image-sa.yml b/.github/workflows/docker-image-sa.yml index 14c3e28c1..aad5648eb 100644 --- a/.github/workflows/docker-image-sa.yml +++ b/.github/workflows/docker-image-sa.yml @@ -66,7 +66,6 @@ jobs: run: docker buildx create --name devBuilder --use --bootstrap - name: Build the Docker Image run: docker buildx build -f DevUtils/docker/Dockerfile.sa --platform linux/amd64,linux/arm64 --tag swirlai/swirl-search:latest-sa --push . 
- - name: Upload log files if: always() uses: actions/upload-artifact@v4 From eafbce3db5a51eeb7a77a3d8c9565a6404958135 Mon Sep 17 00:00:00 2001 From: Erik Spears Date: Mon, 4 Mar 2024 11:10:07 -0500 Subject: [PATCH 43/44] update how main SA docker build handles the branch env var --- .github/workflows/docker-image-sa.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-image-sa.yml b/.github/workflows/docker-image-sa.yml index aad5648eb..65ca28272 100644 --- a/.github/workflows/docker-image-sa.yml +++ b/.github/workflows/docker-image-sa.yml @@ -65,7 +65,7 @@ jobs: - name: Builder Bootstrap run: docker buildx create --name devBuilder --use --bootstrap - name: Build the Docker Image - run: docker buildx build -f DevUtils/docker/Dockerfile.sa --platform linux/amd64,linux/arm64 --tag swirlai/swirl-search:latest-sa --push . + run: docker buildx build -f DevUtils/docker/Dockerfile.sa --platform linux/amd64,linux/arm64 --tag swirlai/swirl-search:${{ env.branch }}-sa --push . - name: Upload log files if: always() uses: actions/upload-artifact@v4 From 3d83948da7f740f8e3c5285ab425ada3f266d9f1 Mon Sep 17 00:00:00 2001 From: Erik Spears <98238295+erikspears@users.noreply.github.com> Date: Mon, 4 Mar 2024 13:38:22 -0500 Subject: [PATCH 44/44] Revert "update how main SA docker build handles the branch env var" --- .github/workflows/docker-image-sa.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-image-sa.yml b/.github/workflows/docker-image-sa.yml index 65ca28272..aad5648eb 100644 --- a/.github/workflows/docker-image-sa.yml +++ b/.github/workflows/docker-image-sa.yml @@ -65,7 +65,7 @@ jobs: - name: Builder Bootstrap run: docker buildx create --name devBuilder --use --bootstrap - name: Build the Docker Image - run: docker buildx build -f DevUtils/docker/Dockerfile.sa --platform linux/amd64,linux/arm64 --tag swirlai/swirl-search:${{ env.branch }}-sa --push . 
+ run: docker buildx build -f DevUtils/docker/Dockerfile.sa --platform linux/amd64,linux/arm64 --tag swirlai/swirl-search:latest-sa --push . - name: Upload log files if: always() uses: actions/upload-artifact@v4