diff --git a/.circleci/base_config.yml b/.circleci/base_config.yml
index c6b002c3a1..8ed62fa185 100644
--- a/.circleci/base_config.yml
+++ b/.circleci/base_config.yml
@@ -28,7 +28,7 @@ commands:
       git clone --depth 1 --branch $branch_name --recurse-submodules --shallow-submodules --jobs 8 https://github.com/arangodb/arangodb.git /root/project
-      if [ "<< parameters.version >>" = "3.10" ] || [ "<< parameters.version >>" = "3.11" ]; then
+      if [ "<< parameters.version >>" = "3.10" ] || [ "<< parameters.version >>" = "3.11" ] || [ "<< parameters.version >>" = "oem" ]; then
        ENTERPRISE_BRANCH="<< parameters.version >>"
       else
        ENTERPRISE_BRANCH="devel"
@@ -131,7 +131,7 @@ commands:
       set +e
       if [ "<< parameters.version >>" = "3.10" ]; then
         cmake --preset enterprise-pr -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=lld" -DCMAKE_LIBRARY_PATH=$OPENSSL_ROOT_DIR/lib -DUSE_MAINTAINER_MODE=Off -DUSE_GOOGLE_TESTS=Off -DUSE_FAILURE_TESTS=Off
-      elif [ "<< parameters.version >>" = "3.11" ]; then
+      elif [ "<< parameters.version >>" = "3.11" ] || [ "<< parameters.version >>" = "oem" ]; then
         # The OpenSSL dir that CMake discovers needs to be adjacent to where ldap.h is located, here: /opt
         cmake --preset enterprise-pr -DCMAKE_C_COMPILER=/tools/clang -DCMAKE_CXX_COMPILER=/tools/clang++ -DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=lld" -DCMAKE_LIBRARY_PATH=$OPENSSL_ROOT_DIR/lib -DOPENSSL_ROOT_DIR=/opt -DUSE_MAINTAINER_MODE=Off -DUSE_GOOGLE_TESTS=Off -DUSE_FAILURE_TESTS=Off
       else
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 23bd8011e4..977b7d07cf 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -31,6 +31,10 @@ parameters:
     type: string
     default: "undefined"
 
+  arangodb-oem:
+    type: string
+    default: "undefined"
+
   arangodb-3_10:
     type: string
     default: "undefined"
@@ -116,7 +120,7 @@ jobs:
             python3 generate_config.py \
               --workflow << pipeline.parameters.workflow >> \
-              --arangodb-branches << pipeline.parameters.arangodb-3_10 >> << pipeline.parameters.arangodb-3_11 >> << pipeline.parameters.arangodb-3_12 >> << pipeline.parameters.arangodb-3_13 >> \
+              --arangodb-branches << pipeline.parameters.arangodb-oem >> << pipeline.parameters.arangodb-3_10 >> << pipeline.parameters.arangodb-3_11 >> << pipeline.parameters.arangodb-3_12 >> << pipeline.parameters.arangodb-3_13 >> \
              --arangodb-branch << pipeline.parameters.arangodb-branch >> \
              --release-type << pipeline.parameters.release-type >> \
              --docs-version << pipeline.parameters.docs-version >> \
diff --git a/.circleci/generate_config.py b/.circleci/generate_config.py
index c687e1546e..6d0aa759a1 100644
--- a/.circleci/generate_config.py
+++ b/.circleci/generate_config.py
@@ -135,7 +135,7 @@ def workflow_generate(config):
             }
         })
 
-        if version in ["3.10", "3.11"]:
+        if version in ["3.10", "3.11", "oem"]:
             if openssl.startswith("3.0"):
                 compileJob["compile-linux"]["build-image"] = "arangodb/build-alpine-x86_64:3.16-gcc11.2-openssl3.0.10"
             elif openssl.startswith("3.1"):
@@ -191,7 +191,7 @@ def workflow_generate_scheduled(config):
         "compile-linux": {
             "context": ["sccache-aws-bucket"],
             "name": f"compile-{version}",
-            "arangodb-branch": f"arangodb/enterprise-preview:{version}-nightly" if version in ["3.10", "3.11"] else "arangodb/enterprise-preview:devel-nightly", # TODO: Any other 3.12.x image we could use?
+            "arangodb-branch": f"arangodb/enterprise-preview:{version}-nightly" if version in ["3.10", "3.11", "oem"] else "arangodb/enterprise-preview:devel-nightly", # TODO: Any other 3.12.x image we could use?
"version": version } } @@ -241,7 +241,7 @@ def workflow_release_arangodb(config): } } - if args.docs_version in ["3.10", "3.11"]: + if args.docs_version in ["3.10", "3.11", "oem"]: if openssl.startswith("3.0"): compileJob["compile-linux"]["build-image"] = "arangodb/build-alpine-x86_64:3.16-gcc11.2-openssl3.0.10" elif openssl.startswith("3.1"): @@ -249,6 +249,7 @@ def workflow_release_arangodb(config): elif openssl.startswith("1.1"): compileJob["compile-linux"]["build-image"] = "arangodb/build-alpine-x86_64:3.16-gcc11.2-openssl1.1.1s" else: + # TODO: OEM might need a separate image compileJob["compile-linux"]["build-image"] = "arangodb/ubuntubuildarangodb-devel:9" # 3.11.13 else: # build image for 3.12.4 and devel as of 2024-11-25 compileJob["compile-linux"]["build-image"] = "arangodb/ubuntubuildarangodb-devel:9" @@ -307,7 +308,7 @@ def workflow_generate_launch_command(config): branch = args.arangodb_branches[i] if args.workflow != "generate": #generate scheduled etc. - branch = f"arangodb/enterprise-preview:{version}-nightly" if version in ["3.10", "3.11"] else "arangodb/enterprise-preview:devel-nightly" # TODO: Any other 3.12.x image we could use? + branch = f"arangodb/enterprise-preview:{version}-nightly" if version in ["3.10", "3.11", "oem"] else "arangodb/enterprise-preview:devel-nightly" # TODO: Any other 3.12.x image we could use? if branch == "undefined": continue diff --git a/CIRCLECI.md b/CIRCLECI.md index fe30a414ff..7e4ba092d6 100644 --- a/CIRCLECI.md +++ b/CIRCLECI.md @@ -75,6 +75,7 @@ arguments are invoked: | Parameter type | Name | Value | |:---------------|:-----|:------| | string | `workflow` | `generate` | +| string | `arangodb-oem` | [Upstream reference](#upstream-references) for OEM LTS | | string | `arangodb-3_10` | [Upstream reference](#upstream-references) for 3.10 | | string | `arangodb-3_11` | [Upstream reference](#upstream-references) for 3.11 | | string | `arangodb-3_12` | [Upstream reference](#upstream-references) for 3.12 | @@ -93,6 +94,7 @@ arguments are invoked: | Parameter type | Name | Value | |:---------------|:-----|:------| | string | `workflow` | `generate` | +| string | `arangodb-oem` | [Upstream reference](#upstream-references) for OEM LTS | | string | `arangodb-3_10` | [Upstream reference](#upstream-references) for 3.10 | | string | `arangodb-3_11` | [Upstream reference](#upstream-references) for 3.11 | | string | `arangodb-3_12` | [Upstream reference](#upstream-references) for 3.12 | @@ -121,6 +123,7 @@ or for multiple versions. 
 | Parameter type | Name | Value |
 |:---------------|:-----|:------|
 | string | `workflow` | `generate` |
+| string | `arangodb-oem` | [Upstream reference](#upstream-references) for OEM LTS |
 | string | `arangodb-3_10` | [Upstream reference](#upstream-references) for 3.10 |
 | string | `arangodb-3_11` | [Upstream reference](#upstream-references) for 3.11 |
 | string | `arangodb-3_12` | [Upstream reference](#upstream-references) for 3.12 |
@@ -202,6 +205,7 @@ Invoke Args:
 | Parameter type | Name | Value |
 |:---------------|:-----|:------|
 | string | `workflow` | `generate-scheduled` |
+| string | `arangodb-oem` | `arangodb/enterprise-preview:oem-nightly` |
 | string | `arangodb-3_10` | `arangodb/enterprise-preview:3.10-nightly` |
 | string | `arangodb-3_11` | `arangodb/enterprise-preview:3.11-nightly` |
 | string | `arangodb-3_12` | `arangodb/enterprise-preview:devel-nightly` |
@@ -219,6 +223,7 @@ Invoke Args:
 | Parameter type | Name | Value |
 |:---------------|:-----|:------|
 | string | `workflow` | `generate-oasisctl` |
+| string | `arangodb-oem` | `arangodb/enterprise-preview:oem-nightly` |
 | string | `arangodb-3_10` | `arangodb/enterprise-preview:3.10-nightly` |
 | string | `arangodb-3_11` | `arangodb/enterprise-preview:3.11-nightly` |
 | string | `arangodb-3_12` | `arangodb/enterprise-preview:devel-nightly` |
diff --git a/site/content/arangodb/oem/_index.md b/site/content/arangodb/oem/_index.md
new file mode 100644
index 0000000000..a5b8d68f16
--- /dev/null
+++ b/site/content/arangodb/oem/_index.md
@@ -0,0 +1,42 @@
+---
+title: Recommended Resources
+menuTitle: '3.11'
+weight: 98
+layout: default
+---
+{{< cloudbanner >}}
+
+{{< cards >}}
+
+{{% card title="What is ArangoDB?" link="about/" %}}
+Get to know graphs, ArangoDB's use cases and features.
+{{% /card %}}
+
+{{% card title="Get started" link="get-started/" %}}
+Learn about ArangoDB's core concepts, how to interact with the database system,
+and get a server instance up and running.
+{{% /card %}}
+
+{{% card title="Arango Managed Platform (AMP)" link="amp/" %}}
+Try out Arango's fully-managed cloud offering for a faster time to value.
+{{% /card %}}
+
+{{% card title="AQL" link="aql/" %}}
+ArangoDB's Query Language AQL lets you use graphs, JSON documents, and search
+via a single, composable query language.
+{{% /card %}}
+
+{{% card title="Data Science" link="data-science/" %}}
+Discover the graph analytics and machine learning features of ArangoDB.
+{{% /card %}}
+
+{{% card title="Deploy" link="deploy/" %}}
+Find the right deployment mode and set up your ArangoDB instance.
+{{% /card %}}
+
+{{% card title="Develop" link="develop/" %}}
+See the in-depth feature and API documentation to start developing applications
+with ArangoDB as your backend.
+{{% /card %}}
+
+{{< /cards >}}
diff --git a/site/content/arangodb/oem/about/_index.md b/site/content/arangodb/oem/about/_index.md
new file mode 100644
index 0000000000..62ade93bbb
--- /dev/null
+++ b/site/content/arangodb/oem/about/_index.md
@@ -0,0 +1,75 @@
+---
+title: What is ArangoDB?
+menuTitle: About ArangoDB
+weight: 5
+description: >-
+  ArangoDB is a scalable graph database system to drive value from connected
+  data, faster
+aliases:
+  - introduction
+  - introduction/about-arangodb
+---
+![ArangoDB Overview Diagram](../../../images/arangodb-overview-diagram.png)
+
+ArangoDB combines the analytical power of native graphs with an integrated
+search engine, JSON support, and a variety of data access patterns via a single,
+composable query language.
+
+ArangoDB is available in an open-source and a commercial [edition](features/_index.md).
+You can use it for on-premises deployments, as well as a fully managed
+cloud service, the [Arango Managed Platform (AMP)](../../../amp/_index.md).
+
+## What are Graphs?
+
+Graphs are information networks composed of nodes and relations.
+
+![Node - Relation - Node](../../../images/data-model-graph-relation-abstract.png)
+
+A social network is a common example of a graph. People are represented by nodes
+and their friendships by relations.
+
+![Mary - is friend of - John](../../../images/data-model-graph-relation-concrete.png)
+
+Nodes are also called vertices (singular: vertex), and relations are edges that
+connect vertices.
+A vertex typically represents a specific entity (a person, a book, a sensor
+reading, etc.) and an edge defines how one entity relates to another.
+
+![Mary - bought - Book, is friend of - John](../../../images/data-model-graph-relations.png)
+
+This paradigm of storing data feels natural because it closely matches the
+cognitive model of humans. It is an expressive data model that allows you to
+represent many problem domains and solve them with semantic queries and graph
+analytics.
+
+## Beyond Graphs
+
+Not everything is a graph use case. ArangoDB lets you equally work with
+structured, semi-structured, and unstructured data in the form of schema-free
+JSON objects, without having to connect these objects to form a graph.
+
+![Person Mary, Book ArangoDB](../../../images/data-model-document.png)
+
+Depending on your needs, you may mix graphs and unconnected data.
+ArangoDB is designed from the ground up to support multiple data models with a
+single, composable query language.
+
+```aql
+FOR book IN Books
+  FILTER book.title == "ArangoDB"
+  FOR person IN 2..2 INBOUND book Sales, OUTBOUND People
+    RETURN person.name
+```
+
+ArangoDB also comes with an integrated search engine for information retrieval,
+such as full-text search with relevance ranking.
+
+ArangoDB is written in C++ for high performance and built to work at scale, in
+the cloud or on-premises.
diff --git a/site/content/arangodb/oem/about/features/_index.md b/site/content/arangodb/oem/about/features/_index.md
new file mode 100644
index 0000000000..4857b274b4
--- /dev/null
+++ b/site/content/arangodb/oem/about/features/_index.md
@@ -0,0 +1,126 @@
+---
+title: Features and Capabilities
+menuTitle: Features
+weight: 20
+description: >-
+  ArangoDB is a graph database with a powerful set of features for data management and analytics,
+  supported by a rich ecosystem of integrations and drivers
+aliases:
+  - ../introduction/features
+---
+## On-premises versus Cloud
+
+### Fully managed cloud service
+
+The fully managed multi-cloud
+[Arango Managed Platform (AMP)](https://dashboard.arangodb.cloud/home?utm_source=docs&utm_medium=cluster_pages&utm_campaign=docs_traffic)
+is the easiest and fastest way to get started. It runs the Enterprise Edition
+of ArangoDB, lets you deploy clusters with just a few clicks, and is operated
+by a dedicated team of ArangoDB engineers day and night. You can choose from a
+variety of support plans to meet your needs.
+
+- Supports many of the AWS and GCP cloud deployment regions
+- High availability featuring multi-region zone clusters, managed backups,
+  and zero-downtime upgrades
+- Integrated monitoring, alerting, and log management
+- Highly secure with encryption in transit and at rest
+- Includes elastic scalability for all deployment models (OneShard and Sharded clusters)
+
+To learn more, go to the [AMP documentation](../../../../amp/_index.md).
+
+### Self-managed in the cloud
+
+ArangoDB can be self-deployed on AWS or other cloud platforms, too. However, when
+using a self-managed deployment, you take full control of managing the resources
+needed to run it in the cloud. This involves tasks such as configuring,
+provisioning, and monitoring the system. For more details, see
+[self-deploying ArangoDB in the cloud](../../deploy/in-the-cloud.md).
+
+ArangoDB supports Kubernetes through its official
+[Kubernetes Operator](../../deploy/kubernetes.md) that allows you to easily
+deploy and manage clusters within a Kubernetes environment.
+
+### On-premises
+
+Running ArangoDB on-premises means that ArangoDB is installed locally, on your
+organization's computers and servers, and involves managing all the necessary
+resources within the organization's environment, rather than using external
+services.
+
+You can install ArangoDB locally by downloading and running the
+[official packages](https://arangodb.com/download/) or run it using
+[Docker images](../../operations/installation/docker.md).
+
+You can deploy it on-premises as a
+[single server](../../deploy/single-instance/_index.md)
+or as a [cluster](../../deploy/cluster/_index.md)
+composed of multiple nodes with synchronous replication and automatic failover
+for high availability and resilience. For the highest level of data safety,
+you can additionally set up off-site replication for your entire cluster
+([Datacenter-to-Datacenter Replication](../../deploy/arangosync/_index.md)).
+
+ArangoDB also integrates with Kubernetes, offering a
+[Kubernetes Operator](../../deploy/kubernetes.md) that lets you deploy in your
+Kubernetes cluster.
+
+## ArangoDB Editions
+
+### Community Edition
+
+ArangoDB is freely available in a **Community Edition** under the Apache 2.0
+open-source license. It is a fully-featured version without time or size
+restrictions and includes cluster support.
+
+- Open source under a permissive license
+- One database core for all graph, document, key-value, and search needs
+- A single composable query language for all data models
+- Extensible through microservices with custom REST APIs and user-definable
+  query functions
+- Cluster deployments for high availability and resilience
+
+See all [Community Edition Features](community-edition.md).
+
+### Enterprise Edition
+
+ArangoDB is also available in a commercial version, called the
+**Enterprise Edition**. It includes additional features for performance and
+security, such as for scaling graphs and managing your data safely.
+
+- Includes all Community Edition features
+- Performance options to smartly shard and replicate graphs and datasets for
+  optimal data locality
+- Multi-tenant deployment option for the transactional guarantees and
+  performance of a single server
+- Enhanced data security with on-disk and backup encryption, key rotation,
+  audit logging, and LDAP authentication
+- Incremental backups without downtime and off-site replication
+
+See all [Enterprise Edition Features](enterprise-edition.md).
+
+### Differences between the Editions
+
+| Community Edition | Enterprise Edition |
+|-------------------|--------------------|
+| Apache 2.0 License | Commercial License |
+| Sharding using consistent hashing on the default or custom shard keys | In addition, **smart sharding** for improved data locality |
+| Only hash-based graph sharding | **SmartGraphs** to intelligently shard large graph datasets and **EnterpriseGraphs** with an automatic sharding key selection |
+| Only regular collection replication without data locality optimizations | **SatelliteCollections** to replicate collections on all cluster nodes and data locality optimizations for queries |
+| No optimizations when querying sharded graphs and replicated collections together | **SmartGraphs using SatelliteCollections** to enable more local execution of graph queries |
+| Only regular graph replication without local execution optimizations | **SatelliteGraphs** to execute graph traversals locally on a cluster node |
+| Collections can be sharded alike but joins do not utilize co-location | **SmartJoins** for co-located joins in a cluster using identically sharded collections |
+| Graph traversals without parallel execution | **Parallel execution of traversal queries** with many start vertices |
+| Graph traversals always load full documents | **Traversal projections** optimize the data loading of AQL traversal queries if only a few document attributes are accessed |
+| Iterative graph processing (Pregel) for single servers | **Pregel graph processing for clusters** and single servers |
+| Inverted indexes and Views without support for search highlighting and nested search | **Search highlighting** for getting the substring positions of matches and **nested search** for matching arrays with all the conditions met by a single object |
+| Only standard Jaccard index calculation | **Jaccard similarity approximation** with MinHash for entity resolution, such as for finding duplicate records, based on how many common elements they have |{{% comment %}} Experimental feature
+| No fastText model support | Classification of text tokens and finding similar tokens using supervised **fastText word embedding models** |
+{{% /comment %}}
+| Only regular cluster deployments | **OneShard** deployment option to store all collections of a database on a single cluster node, to combine the performance of a single server and ACID semantics with a fault-tolerant cluster setup |
+| ACID transactions for multi-document / multi-collection queries on single servers, for single document operations in clusters, and for multi-document queries in clusters for collections with a single shard | In addition, ACID transactions for multi-collection queries using the OneShard feature |
+| Always read from leader shards in clusters | Optionally allow dirty reads to **read from followers** to scale reads |
+| TLS key and certificate rotation | In addition, **key rotation for JWT secrets** and **server name indication** (SNI) |
+| Built-in user management and authentication | Additional **LDAP authentication** option |
+| Only server logs | **Audit log** of server interactions |
+| No on-disk encryption | **Encryption at Rest** with hardware-accelerated on-disk encryption and key rotation |
+| Only regular backups | **Datacenter-to-Datacenter Replication** for disaster recovery |
+| Only unencrypted backups and basic data masking for backups | **Hot Backups**, **encrypted backups**, and **enhanced data masking** for backups |
diff --git a/site/content/arangodb/oem/about/features/community-edition.md b/site/content/arangodb/oem/about/features/community-edition.md
new file mode 100644
index 0000000000..9953335cf1
--- /dev/null
+++ b/site/content/arangodb/oem/about/features/community-edition.md
@@ -0,0 +1,283 @@
+---
+title: Community Edition Features
+menuTitle: Community Edition
+weight: 5
+description: >-
+  The open-source version of ArangoDB is available under the permissive
+  Apache 2.0 license and offers an extensive feature set including cluster
+  support for free
+aliases:
+  - ../../introduction/features/community-edition
+---
+The Community Edition features are outlined below. For additional information,
+see [arangodb.com/community-server/](https://www.arangodb.com/community-server/).
+
+## General
+
+- [**Graph Database**](../../concepts/data-models.md#graph-model):
+  Native support for storing and querying graphs composed of vertices and edges.
+  You can model complex domains because edges are documents without any
+  restrictions in complexity.
+
+- [**Document Database**](../../concepts/data-models.md#document-model):
+  A modern document database system that allows you to model data intuitively
+  and evolve the data model easily. Documents can be organized in collections,
+  and collections in databases for multi-tenancy.
+
+{{% comment %}}
+  TODO: Add a bullet point for multi-model? (unified query language, lower TCO, ...)
+{{% /comment %}}
+
+- [**Data Format**](../../concepts/data-structure/_index.md#documents):
+  JSON, internally stored in a binary format invented by ArangoDB called
+  VelocyPack.
+
+- **Schema-free**:
+  Flexible data modeling without having to define a schema upfront.
+  Model your data as a combination of key-value pairs,
+  documents, or graphs - perfect for social relations. Optional document
+  validation using JSON Schema (draft-4, without remote schema support).
+
+- [**Data Storage**](../../components/arangodb-server/storage-engine.md):
+  RocksDB storage engine to persist data and indexes on disk, with a hot set in
+  memory. It uses journaling (write-ahead logging) and can take advantage of
+  modern storage hardware, like SSDs and large caches.
+
+- [**Computed Values**](../../concepts/data-structure/documents/computed-values.md):
+  Persistent document attributes that are generated when documents are created
+  or modified, using an AQL expression.
+
+- [**In the cloud or on-prem**](../features/_index.md#on-premises-versus-cloud):
+  Use ArangoDB as a [fully managed service](https://dashboard.arangodb.cloud/home?utm_source=docs&utm_medium=cluster_pages&utm_campaign=docs_traffic),
+  self-managed in the cloud, or on-premises.
+
+- [**Multiple Environments**](../../operations/installation/_index.md#supported-platforms-and-architectures):
+  Develop and test with ArangoDB on Linux, macOS, and Windows, and run it in
+  production on Linux. ArangoDB is available for 64-bit ARM chips on macOS and
+  Linux for evaluation, and is production-ready for the x86-64 architecture.
+
+## Scalability & High Availability
+
+- [**Hash-based sharding**](../../deploy/architecture/data-sharding.md):
+  Spread bigger datasets across multiple servers using consistent hashing on
+  the default or custom shard keys.
+
+- [**Synchronous Replication**](../../deploy/cluster/_index.md#synchronous-replication):
+  Data changes are propagated to other cluster nodes immediately as part of an
+  operation, and are only considered successful when the configured number of writes
+  is reached.
+  Synchronous replication works on a per-shard basis. For each
+  collection, you can configure how many copies of each shard are kept in the cluster.
+
+- [**Active Failover**](../../deploy/active-failover/_index.md):
+  Run a single server with asynchronous replication to one or more passive
+  single servers for automatic failover.
+
+- [**Automatic Failover Cluster**](../../deploy/cluster/_index.md#automatic-failover):
+  If a node goes down, another node takes over to avoid any downtime.
+
+{{% comment %}}
+  TODO: - **Master/Master Conflict Resolution**: What does this refer to? How does it work? MVCC?
+{{% /comment %}}
+
+- **Load-Balancer Support**:
+  Round-robin load-balancer support for cloud environments.
+
+- **High-performance Request Handling**:
+  Low-latency request handling using a Boost.Asio server infrastructure.
+
+## Querying
+
+- [**Declarative Query Language for All Data Models**](../../aql/_index.md):
+  Powerful query language (AQL) to retrieve and modify data.
+  Graph traversals, full-text searches, geo-spatial queries, and aggregations
+  can be composed in a single query.
+  Support for sliding window queries to aggregate adjacent documents, value
+  ranges, and time intervals.
+  Cluster-distributed aggregation queries.
+
+- [**Query Optimizer**](../../aql/execution-and-performance/query-optimization.md):
+  Cost-based query optimizer that takes index selectivity estimates into account.
+
+- [**Query Profiling**](../../aql/execution-and-performance/query-profiling.md):
+  Show detailed runtime information for AQL queries.
+
+- [**Upsert Operations**](../../aql/examples-and-query-patterns/upsert-repsert-guide.md):
+  Support for insert-or-update (upsert), insert-or-replace (repsert), and
+  insert-or-ignore requests that result in one or the other operation depending
+  on whether the target document exists already.
+
+- **Graph Relations**:
+  Edges can connect vertex documents and even other edge documents to express
+  complex m:n relations of any depth, creating graphs and hyper-graphs.
+
+- [**Relational Joins**](../../aql/examples-and-query-patterns/joins.md):
+  Joins similar to those in relational database systems can be leveraged to
+  match up documents from different collections, allowing normalized data models.
+
+- **Advanced Path-Finding with Multiple Algorithms**:
+  Graphs can be [traversed](../../aql/graphs/traversals-explained.md) with AQL to
+  retrieve direct and indirect neighbor nodes using a fixed or variable depth.
+  The [traversal order](../../aql/graphs/traversals.md) can be
+  depth-first, breadth-first, or in order of increasing edge weights
+  ("Weighted Traversals"). Stop conditions for pruning paths are supported.
+  Traversal algorithms to get a [shortest path](../../aql/graphs/shortest-path.md),
+  [all shortest paths](../../aql/graphs/all-shortest-paths.md), paths in order of
+  increasing length ("[k Shortest Paths](../../aql/graphs/k-shortest-paths.md)"),
+  and to enumerate all paths between two vertices
+  ("[k Paths](../../aql/graphs/k-paths.md)") are available, too.
+
+- [**Pregel**](../../data-science/pregel/_index.md):
+  Iterative graph processing for single servers with pre-built algorithms like
+  PageRank, Connected Components, and Label Propagation. Cluster support
+  requires the Enterprise Edition.
+
+- [**ArangoSearch for Text Search and Ranking**](../../index-and-search/arangosearch/_index.md):
+  A built-in search engine for full-text, complex data structures, and more.
+  Exact value matching, range queries, prefix matching, case-insensitive and
+  accent-insensitive search. Token, phrase, wildcard, and fuzzy search support
+  for full-text. Result ranking using Okapi BM25 and TF-IDF.
+  Geo-spatial search that can be combined with full-text search.
+  Flexible data field pre-processing with custom queries and the ability to
+  chain built-in and custom Analyzers. Language-agnostic tokenization of text.
+
+- [**GeoJSON Support**](../../aql/functions/geo.md#geojson):
+  Geographic data encoded in the popular GeoJSON format can be stored and used
+  for geo-spatial queries.
+
+{{% comment %}} Experimental feature
+- [**Query result spillover**](../../aql/how-to-invoke-aql/with-arangosh.md#spilloverthresholdmemoryusage)
+  AQL queries can store intermediate and final results temporarily on disk
+  (also known as external result sets) to decrease memory usage when a specified
+  threshold is reached.
+{{% /comment %}}
+
+## Transactions
+
+- [**AQL Queries**](../../aql/data-queries.md#transactional-execution):
+  AQL queries are executed transactionally (with exceptions), either committing
+  or rolling back data modifications automatically.
+
+- [**Stream Transactions**](../../develop/http-api/transactions/stream-transactions.md):
+  Transactions with individual begin and commit / abort commands that can span
+  multiple AQL queries and API calls of supported APIs.
+
+- [**JavaScript Transactions**](../../develop/http-api/transactions/javascript-transactions.md):
+  Single-request transactions written in JavaScript that leverage ArangoDB's
+  JavaScript API.
+
+- **Multi-Document Transactions**:
+  Transactions are not limited to single documents, but can involve many
+  documents of a collection.
+
+- **Multi-Collection Transactions**:
+  A single transaction can modify the documents of multiple collections.
+  There is automatic deadlock detection for single servers.
+
+- **ACID Transactions**:
+  Using single servers, multi-document / multi-collection queries are guaranteed
+  to be fully ACID (atomic, consistent, isolated, durable).
+  Using cluster deployments, single-document operations are fully ACID, too.
+  Multi-document queries in a cluster are not ACID, except for collections with
+  a single shard. Multi-collection queries require the OneShard
+  feature of the Enterprise Edition to be ACID.
+
+## Performance
+
+- [**Persistent Indexes**](../../index-and-search/indexing/basics.md#persistent-index):
+  Indexes are stored on disk to enable fast server restarts. You can create
+  secondary indexes over one or multiple fields, optionally with a uniqueness
+  constraint. A "sparse" option to only index non-null values is also available.
+  The elements of an array can be indexed individually.
+
+- [**Inverted indexes**](../../index-and-search/indexing/working-with-indexes/inverted-indexes.md):
+  An eventually consistent index type that can accelerate a broad range of
+  queries from simple to complex, including full-text search.
+
+- [**Vertex-centric Indexes**](../../index-and-search/indexing/basics.md#vertex-centric-indexes):
+  Secondary indexes for more efficient graph traversals with filter conditions.
+
+- [**Time-to-Live (TTL) Indexes**](../../index-and-search/indexing/basics.md#ttl-time-to-live-index):
+  Time-based removal of expired documents.
+
+- [**Geo-spatial Indexes**](../../index-and-search/indexing/basics.md#geo-index):
+  Accelerated geo-spatial queries for locations and GeoJSON objects, based on
+  the S2 library.
+  Support for composable, distance-based geo-queries ("geo cursors").
+
+{{% comment %}} Experimental feature
+- [**Multi-dimensional indexes**](../../index-and-search/indexing/working-with-indexes/multi-dimensional-indexes.md):
+  An index type to efficiently intersect multiple range queries, like finding
+  all appointments that intersect a time range.
+{{% /comment %}}
+
+- [**Background Indexing**](../../index-and-search/indexing/basics.md#creating-indexes-in-background):
+  Indexes can be created in the background to not block queries in the meantime.
+
+- [**Index cache refilling**](../../release-notes/version-3.11/whats-new-in-3-11.md#index-cache-refilling):
+  In-memory index caches are automatically repopulated after writes that affect
+  an edge index or cache-enabled persistent indexes to maximize cache hits and
+  thus query performance.
+
+- [**Extensive Query Optimization**](../../aql/execution-and-performance/query-optimization.md):
+  Late document materialization to only fetch the relevant documents from
+  SORT/LIMIT queries. Early pruning of non-matching documents in full
+  collection scans. Inlining of certain subqueries to improve execution time.
+
+- [**Parallel gather**](../../release-notes/version-3.11/whats-new-in-3-11.md#parallel-gather):
+  Fast, memory-efficient processing of cluster queries by combining
+  results in parallel.
+
+## Extensibility
+
+- [**Microservice Support with ArangoDB Foxx**](../../develop/foxx-microservices/_index.md):
+  Use ArangoDB as an application server and fuse your application and database
+  together for maximal throughput.
+  With fault-tolerant cluster support.
+
+- [**Server-Side Functions**](../../aql/user-defined-functions.md):
+  You can extend AQL with user-defined functions written in JavaScript.
+
+## Security
+
+- [**Authentication**](../../operations/administration/user-management/_index.md):
+  Built-in user management with password- and token-based authentication.
+
+- **Role-based Access Control**:
+  ArangoDB supports all basic security requirements. By using ArangoDB's Foxx
+  microservice framework, users can achieve very high security standards
+  that fit individual needs.
+
+- [**TLS Encryption**](../../components/arangodb-server/options.md#ssl):
+  Internal and external communication over encrypted network connections with
+  TLS (formerly SSL).
+  [TLS key and certificate rotation](../../release-notes/version-3.7/whats-new-in-3-7.md#tls-key-and-certificate-rotation)
+  is supported.
+
+## Administration
+
+- [**Web-based User Interface**](../../components/web-interface/_index.md):
+  Graphical UI for your browser to work with ArangoDB. It allows you to
+  view, create, and modify databases, collections, documents, graphs, etc.
+  You can also run, explain, and profile AQL queries. Includes a graph viewer
+  with WebGL support.
+
+- **Cluster-friendly User Interface**:
+  View the status of your cluster and its individual nodes, and move and
+  rebalance shards via the web interface.
+
+- **[Backup](../../components/tools/arangodump/_index.md) and [Restore](../../components/tools/arangorestore/_index.md) Tools**:
+  Multi-threaded dumping and restoring of collection settings and data
+  in JSON format. Data masking capabilities for attributes containing sensitive
+  data / PII when creating backups.
+
+- **[Import](../../components/tools/arangoimport/_index.md) and [Export](../../components/tools/arangoexport/_index.md) Tools**:
+  CLI utilities to load and export data in multiple text-based formats.
+  You can import from JSON, JSONL, CSV, and TSV files, and export to JSON, JSONL,
+  CSV, TSV, XML, and XGMML files.
+
+- [**Metrics**](../../develop/http-api/monitoring/metrics.md):
+  Monitor the healthiness and performance of ArangoDB servers using the metrics
+  exported in the Prometheus format.
diff --git a/site/content/arangodb/oem/about/features/enterprise-edition.md b/site/content/arangodb/oem/about/features/enterprise-edition.md
new file mode 100644
index 0000000000..8e962a4a34
--- /dev/null
+++ b/site/content/arangodb/oem/about/features/enterprise-edition.md
@@ -0,0 +1,123 @@
+---
+title: Enterprise Edition Features
+menuTitle: Enterprise Edition
+weight: 10
+description: >-
+  The commercial version of ArangoDB offers performance, compliance, and
+  security features for larger or more sensitive datasets, as well as additional
+  query capabilities
+aliases:
+  - ../../introduction/features/enterprise-edition
+---
+The Enterprise Edition has all the features of the
+[Community Edition](community-edition.md) and, on top of that, the
+features outlined below. For additional information, see
+[arangodb.com/enterprise-server/](https://www.arangodb.com/enterprise-server/).
+
+## Performance
+
+- [**SmartGraphs**](../../graphs/smartgraphs/_index.md):
+  Value-based sharding of large graph datasets for better data locality when
+  traversing graphs.
+
+- [**EnterpriseGraphs**](../../graphs/enterprisegraphs/_index.md):
+  A specialized version of SmartGraphs, with an automatic sharding key selection.
+
+- [**SmartGraphs using SatelliteCollections**](../../graphs/smartgraphs/_index.md):
+  Collections replicated on all cluster nodes can be combined with graphs
+  sharded by document attributes to enable more local execution of graph queries.
+
+- [**SatelliteGraphs**](../../graphs/satellitegraphs/_index.md):
+  Graphs replicated on all cluster nodes to execute graph traversals locally.
+
+- [**SatelliteCollections**](../../develop/satellitecollections.md):
+  Collections replicated on all cluster nodes to execute joins with sharded
+  data locally.
+
+- [**SmartJoins**](../../develop/smartjoins.md):
+  Co-located joins in a cluster using identically sharded collections.
+
+- [**OneShard**](../../deploy/oneshard.md):
+  Option to store all collections of a database on a single cluster node, to
+  combine the performance of a single server and ACID semantics with a
+  fault-tolerant cluster setup.
+
+- [**Traversal**](../../release-notes/version-3.7/whats-new-in-3-7.md#traversal-parallelization-enterprise-edition)
+  [**Parallelization**](../../release-notes/version-3.10/whats-new-in-3-10.md#parallelism-for-sharded-graphs-enterprise-edition):
+  Parallel execution of traversal queries with many start vertices, leading to
+  faster results.
+
+- [**Traversal Projections**](../../release-notes/version-3.10/whats-new-in-3-10.md#traversal-projections-enterprise-edition):
+  Optimized data loading for AQL traversal queries if only a few document
+  attributes are accessed.
+
+- [**Parallel index creation**](../../release-notes/version-3.10/whats-new-in-3-10.md#parallel-index-creation-enterprise-edition):
+  Non-unique indexes can be created with multiple threads in parallel.
+
+- [**`minhash` Analyzer**](../../index-and-search/analyzers.md#minhash):
+  Jaccard similarity approximation for entity resolution, such as for finding
+  duplicate records, based on how many elements they have in common.
+
+- [**`geo_s2` Analyzer**](../../index-and-search/analyzers.md#geo_s2):
+  Efficiently index geo-spatial data using different binary formats, tuning the
+  size on disk, the precision, and query performance.
+
+- [**ArangoSearch column cache**](../../release-notes/version-3.10/whats-new-in-3-10.md#arangosearch-column-cache-enterprise-edition):
+  Always cache field normalization values, Geo Analyzer auxiliary data,
+  stored values, primary sort columns, and primary key columns in memory to
+  improve the performance of Views and inverted indexes.
+
+- [**Read from followers in clusters**](../../develop/http-api/documents.md#read-from-followers):
+  Allow dirty reads so that Coordinators can read from any shard replica and not
+  only from the leader, for scaling reads.
+
+## Querying
+
+- [**Pregel in Cluster**](../../data-science/pregel/_index.md#prerequisites):
+  Distributed iterative graph analytics for cluster deployments.
+
+- [**Search highlighting**](../../index-and-search/arangosearch/search-highlighting.md):
+  Get the substring positions of matched terms, phrases, or _n_-grams.
+
+- [**Nested search**](../../index-and-search/arangosearch/nested-search.md):
+  Match arrays of objects with all the conditions met by a single sub-object,
+  and define for how many of the elements this must be true.
+
+{{% comment %}} Experimental feature
+- **[`classification`](../../index-and-search/analyzers.md#classification) and [`nearest_neighbors` Analyzers](../../index-and-search/analyzers.md#nearest_neighbors)**:
+  Classification of text tokens and finding similar tokens using supervised
+  fastText word embedding models.
+{{% /comment %}}
+
+- [**Skip inaccessible collections**](../../aql/how-to-invoke-aql/with-arangosh.md#skipinaccessiblecollections):
+  Let AQL queries like graph traversals pretend that collections are empty if
+  the user has no access to them, instead of failing the query.
+
+## Security
+
+- [**DC2DC**](../../deploy/arangosync/_index.md):
+  Datacenter-to-Datacenter Replication for disaster recovery.
+
+- [**Auditing**](../../operations/security/audit-logging.md):
+  Audit logs of all server interactions.
+
+- [**LDAP Authentication**](../../components/arangodb-server/ldap.md):
+  ArangoDB user authentication with an LDAP server.
+
+- [**Encryption at Rest**](../../operations/security/encryption-at-rest.md):
+  Hardware-accelerated on-disk encryption for your data.
+
+- [**Encrypted Backups**](../../components/tools/arangodump/examples.md#encryption):
+  Data dumps can be encrypted using a strong 256-bit AES block cipher.
+
+- [**Hot Backups**](../../operations/backup-and-restore.md#hot-backups):
+  Consistent, incremental data backups without downtime for single servers and clusters.
+
+- [**Enhanced Data Masking**](../../components/tools/arangodump/maskings.md#masking-functions):
+  Extended data masking capabilities for attributes containing sensitive data
+  / PII when creating backups.
+
+- **Advanced Encryption and Security Configuration**:
+  Key rotation for [JWT secrets](../../develop/http-api/authentication.md#hot-reload-jwt-secrets)
+  and [on-disk encryption](../../develop/http-api/security.md#encryption-at-rest),
+  as well as [Server Name Indication (SNI)](../../components/arangodb-server/options.md#--sslserver-name-indication).
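+
+As a hedged illustration of the nested search capability listed under Querying
+above, a query could look like the following sketch. It assumes a `v_inventory`
+View configured to index the `parts` field as nested; the View, field, and
+value names are made up for this example:
+
+```aql
+FOR doc IN v_inventory
+  // match only documents where a single parts object
+  // satisfies both conditions at once
+  SEARCH doc.parts[? FILTER CURRENT.name == "bolt" AND CURRENT.size >= 10]
+  RETURN doc
+```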
diff --git a/site/content/arangodb/oem/about/features/highlights-by-version.md b/site/content/arangodb/oem/about/features/highlights-by-version.md
new file mode 100644
index 0000000000..db66538908
--- /dev/null
+++ b/site/content/arangodb/oem/about/features/highlights-by-version.md
@@ -0,0 +1,448 @@
+---
+title: Highlights by Version
+menuTitle: Highlights by Version
+weight: 15
+description: >-
+  The most notable features in the Community and Enterprise Edition of ArangoDB,
+  grouped by version
+aliases:
+  - ../../introduction/features/highlights-by-version
+---
+## Version 3.11
+
+**All Editions**
+
+- [**Parallel gather**](../../release-notes/version-3.11/whats-new-in-3-11.md#parallel-gather):
+  Faster, more memory-efficient processing of cluster queries by combining
+  results on Coordinators in parallel.
+
+- [**Index cache refilling**](../../release-notes/version-3.11/whats-new-in-3-11.md#index-cache-refilling):
+  Automatically repopulate in-memory index caches after writes that affect an
+  edge index or cache-enabled persistent indexes to maximize cache hits and thus
+  query performance.
+
+**Enterprise Edition**
+
+- [**ArangoSearch column cache**](../../release-notes/version-3.10/whats-new-in-3-10.md#arangosearch-column-cache-enterprise-edition):
+  Always cache field normalization values, Geo Analyzer auxiliary data,
+  stored values, primary sort columns, and primary key columns in memory to
+  improve the performance of Views and inverted indexes.
+
+- [**`geo_s2` Analyzer**](../../index-and-search/analyzers.md#geo_s2):
+  Efficiently index geo-spatial data using different binary formats, tuning the
+  size on disk, the precision, and query performance.
+
+Also see [What's New in 3.11](../../release-notes/version-3.11/whats-new-in-3-11.md).
+
+## Version 3.10
+
+**All Editions**
+
+- [**Native ARM Support**](../../release-notes/version-3.10/whats-new-in-3-10.md#native-arm-support):
+  Packages for the ARM architecture are now available, including native support
+  for Apple silicon.
+
+- [**Computed Values**](../../concepts/data-structure/documents/computed-values.md):
+  Persistent document attributes that are generated when documents are created
+  or modified, using an AQL expression.
+
+- [**Inverted indexes**](../../index-and-search/indexing/working-with-indexes/inverted-indexes.md):
+  A new, eventually consistent index type that can accelerate a broad range of
+  queries, providing search capabilities similar to `arangosearch` Views, but
+  defined per collection and simpler to use.
+
+- [**`search-alias` Views**](../../release-notes/version-3.10/whats-new-in-3-10.md#search-alias-views):
+  Add inverted indexes to `search-alias` Views for searching multiple collections
+  at once, with ranking and search highlighting capabilities, as a lightweight
+  alternative to `arangosearch` Views.
+
+- **Persistent indexes**:
+  An optional [**In-memory Cache**](../../index-and-search/indexing/working-with-indexes/persistent-indexes.md#caching-of-index-values)
+  for faster lookups and [**Stored Values**](../../index-and-search/indexing/working-with-indexes/persistent-indexes.md#storing-additional-values-in-indexes)
+  to let persistent indexes cover additional attributes of projections.
+
+- **AQL Graph Traversals**:
+  [All Shortest Paths](../../aql/graphs/all-shortest-paths.md) allows you to query
+  for all paths of shortest length between two documents.
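+
+  A minimal sketch of such a query (the graph name and vertex IDs are
+  hypothetical):
+
+  ```aql
+  FOR path IN OUTBOUND ALL_SHORTEST_PATHS "places/A" TO "places/F"
+    GRAPH "connections"
+    // return the vertex keys along each equally short path
+    RETURN path.vertices[*]._key
+  ```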
+
+**Enterprise Edition**
+
+- [**EnterpriseGraphs**](../../graphs/enterprisegraphs/_index.md): A new specialized version of
+  SmartGraphs, with an automatic sharding key selection.
+
+- [**Search highlighting**](../../index-and-search/arangosearch/search-highlighting.md):
+  Get the substring positions of matched terms, phrases, or _n_-grams.
+
+- [**Nested search**](../../index-and-search/arangosearch/nested-search.md):
+  Match arrays of objects with all the conditions met by a single sub-object,
+  and define for how many of the elements this must be true.
+
+- **ArangoSearch**:
+  New [`minhash` Analyzer](../../index-and-search/analyzers.md#minhash) for locality-sensitive hashing
+  to approximate the Jaccard similarity, with inverted index and
+  `arangosearch` View support that allows you to implement entity resolution.
+
+- [**Parallelism for sharded graphs**](../../release-notes/version-3.10/whats-new-in-3-10.md#parallelism-for-sharded-graphs-enterprise-edition):
+  Parallel execution of AQL traversal queries with many start vertices for all
+  types of sharded graphs, leading to faster results.
+
+- [**Traversal Projections**](../../release-notes/version-3.10/whats-new-in-3-10.md#traversal-projections-enterprise-edition):
+  Optimized data loading for AQL traversal queries if only a few document
+  attributes are accessed.
+
+- [**Read from followers in clusters**](../../develop/http-api/documents.md#read-from-followers):
+  Allow dirty reads so that Coordinators can read from any shard replica and not
+  only from the leader, for scaling reads.
+
+Also see [What's New in 3.10](../../release-notes/version-3.10/whats-new-in-3-10.md).
+
+## Version 3.9
+
+**All Editions**
+
+- **ArangoSearch**:
+  New [**Segmentation Analyzer**](../../index-and-search/analyzers.md#segmentation)
+  for language-agnostic tokenization of text.
+  A [**Collation Analyzer**](../../index-and-search/analyzers.md#collation)
+  to honor the alphabetical order of the specified language in range queries.
+
+**Enterprise Edition**
+
+- [**(Disjoint) SmartGraphs using SatelliteCollections**](../../graphs/smartgraphs/_index.md):
+  SatelliteCollections can be used in (Disjoint) SmartGraphs to enable more
+  local execution of graph queries.
+
+Also see [What's New in 3.9](../../release-notes/version-3.9/whats-new-in-3-9.md).
+
+## Version 3.8
+
+**All Editions**
+
+- [**Weighted traversals**](../../release-notes/version-3.8/whats-new-in-3-8.md#weighted-traversals)
+  and [**k Paths**](../../release-notes/version-3.8/whats-new-in-3-8.md#k-paths):
+  Two new AQL graph traversal methods to emit paths in order of increasing
+  weights and to enumerate all paths between a source and a target vertex that
+  match a given length.
+
+- **ArangoSearch**:
+  New [**Pipeline Analyzer**](../../index-and-search/analyzers.md#pipeline)
+  that allows you to combine multiple Analyzers, enabling case-insensitive
+  _n_-gram-based fuzzy search and more. New
+  [**AQL Analyzer**](../../index-and-search/analyzers.md#aql)
+  so that you can use an AQL query to pre-process and filter your data for
+  indexing. Support for **geo-spatial queries** through new
+  [Geo](../../index-and-search/analyzers.md#geojson)
+  [Analyzers](../../index-and-search/analyzers.md#geopoint) and
+  [ArangoSearch Geo functions](../../aql/functions/arangosearch.md#geo-functions).
+  A new [**Stop words Analyzer**](../../index-and-search/analyzers.md#stopwords) that
+  can be used standalone or in an Analyzer pipeline.
+
+- A [**`WINDOW` operation**](../../aql/high-level-operations/window.md) for aggregations over
+  adjacent rows, value ranges or time windows.
+
+**Enterprise Edition**
+
+- **Encryption at Rest** utilizes
+  [hardware acceleration](../../release-notes/version-3.8/whats-new-in-3-8.md#encryption-at-rest)
+  capabilities of modern CPUs.
+
+Also see [What's New in 3.8](../../release-notes/version-3.8/whats-new-in-3-8.md).
+
+## Version 3.7
+
+**All Editions**
+
+- **ArangoSearch**:
+  [Wildcard](../../aql/functions/arangosearch.md#like) and fuzzy search
+  ([Levenshtein distance](../../aql/functions/arangosearch.md#levenshtein_match) and
+  [_n_-gram based](../../aql/functions/arangosearch.md#ngram_match)),
+  enhanced [phrase and proximity search](../../aql/functions/arangosearch.md#phrase),
+  improved late document materialization and
+  [Views covering queries](../../release-notes/version-3.7/whats-new-in-3-7.md#covering-indexes)
+  using their indexes without touching the storage engine, as well as a new
+  SIMD-based index format for faster processing and
+  [stemming support](../../release-notes/version-3.7/whats-new-in-3-7.md#stemming-support-for-more-languages)
+  for 15 additional languages.
+
+- [**Schema Validation**](../../concepts/data-structure/documents/schema-validation.md):
+  Enforce a JSON Schema for documents on collection level. Invalid documents
+  can be rejected automatically by the database system, making it easy to
+  maintain data quality.
+
+- [**Insert-Update** and **Insert-Ignore**](../../release-notes/version-3.7/whats-new-in-3-7.md#insert-update-and-insert-ignore):
+  New document API operations to upsert documents and to efficiently insert
+  documents while skipping the creation if the document exists already.
+
+- **AQL**:
+  Improved [subquery](../../release-notes/version-3.7/whats-new-in-3-7.md#subquery-optimizations) and
+  [graph traversal performance](../../release-notes/version-3.7/whats-new-in-3-7.md#traversal-optimizations),
+  among many optimizations and enhancements.
+
+- [**HTTP/2 support**](../../release-notes/version-3.7/whats-new-in-3-7.md#http2-support):
+  Better load-balancer and Kubernetes compatibility, improved request throughput.
+
+**Enterprise Edition**
+
+- [**SatelliteGraphs**](../../release-notes/version-3.7/whats-new-in-3-7.md#satellitegraphs):
+  Synchronously replicated graphs with local traversal execution.
+
+- [**Disjoint SmartGraphs**](../../release-notes/version-3.7/whats-new-in-3-7.md#disjoint-smartgraphs):
+  Improve traversal execution times for SmartGraphs without edges between
+  vertices with different SmartGraph attributes.
+
+- [**Traversal parallelization**](../../release-notes/version-3.7/whats-new-in-3-7.md#traversal-parallelization-enterprise-edition):
+  Optional parallel execution of nested traversals for single servers and
+  OneShard clusters.
+
+- **Security**:
+  Added support for multiple
+  [JWT Secrets](../../release-notes/version-3.7/whats-new-in-3-7.md#jwt-secret-rotation-enterprise-edition)
+  and the ability to hot-reload them from disk,
+  [TLS key and certificate rotation](../../release-notes/version-3.7/whats-new-in-3-7.md#tls-key-and-certificate-rotation),
+  [Encryption at rest key rotation](../../release-notes/version-3.7/whats-new-in-3-7.md#encryption-at-rest-key-rotation-enterprise-edition)
+  and [Server Name Indication (SNI)](../../release-notes/version-3.7/whats-new-in-3-7.md#server-name-indication-enterprise-edition).
+
+Also see [What's New in 3.7](../../release-notes/version-3.7/whats-new-in-3-7.md).
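+
+As a hedged illustration of the 3.7 fuzzy search capabilities, a query could
+combine `LEVENSHTEIN_MATCH` with BM25 ranking like the following sketch (the
+View name `v_articles` and its fields are made up for this example):
+
+```aql
+FOR doc IN v_articles
+  // match titles within an edit distance of 2 from the search term
+  SEARCH ANALYZER(LEVENSHTEIN_MATCH(doc.title, "arangodb", 2), "text_en")
+  SORT BM25(doc) DESC
+  RETURN doc.title
+```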
+
+## Version 3.6
+
+**All Editions**
+
+- **AQL**:
+  Improved query performance thanks to
+  [early pruning](../../release-notes/version-3.6/whats-new-in-3-6.md#early-pruning-of-non-matching-documents),
+  [subquery splicing](../../release-notes/version-3.6/whats-new-in-3-6.md#subquery-splicing-optimization),
+  [late document materialization](../../release-notes/version-3.6/whats-new-in-3-6.md#late-document-materialization-rocksdb),
+  [parallelization](../../release-notes/version-3.6/whats-new-in-3-6.md#parallelization-of-cluster-aql-queries) for certain cluster queries
+  and more. New server-side [`maxRuntime`](../../aql/how-to-invoke-aql/with-arangosh.md#maxruntime)
+  option for queries.
+
+- **ArangoSearch**:
+  New [Analyzer options](../../release-notes/version-3.6/whats-new-in-3-6.md#analyzers) for
+  edge _n_-grams (`text` Analyzer), UTF-8 encoded _n_-gram input and optional
+  start/end markers (`ngram` Analyzer). Support for
+  [dynamic expressions](../../release-notes/version-3.6/whats-new-in-3-6.md#dynamic-search-expressions-with-arrays)
+  using arrays (array comparison operators in `SEARCH` queries and the
+  `TOKENS()` / `PHRASE()` functions accept arrays). Views can benefit from the
+  SmartJoins optimization.
+
+**Enterprise Edition**
+
+- [**OneShard**](../../deploy/oneshard.md)
+  deployments offer a practicable solution that enables significant performance
+  improvements by massively reducing cluster-internal communication. A database
+  created with OneShard enabled is limited to a single DB-Server node but still
+  replicated synchronously to ensure resilience. This configuration allows
+  running transactions with ACID guarantees on shard leaders.
+
+Also see [What's New in 3.6](../../release-notes/version-3.6/whats-new-in-3-6.md).
+
+## Version 3.5
+
+**All Editions**
+
+- **ArangoSearch**:
+  The search and ranking engine received an upgrade and now features
+  [Configurable Analyzers](../../index-and-search/analyzers.md),
+  [Sorted Views](../../index-and-search/arangosearch/performance.md#primary-sort-order)
+  and several improvements to the
+  [AQL integration](../../release-notes/version-3.5/whats-new-in-3-5.md#arangosearch).
+
+- **AQL Graph Traversals**:
+  [k Shortest Paths](../../aql/graphs/k-shortest-paths.md) allows you to query not
+  just for one shortest path between two documents but multiple, sorted by
+  length or weight. With [PRUNE](../../aql/graphs/traversals.md#pruning) you can
+  stop walking down certain paths early in a graph traversal to improve its
+  efficiency.
+
+- [**Stream Transaction API**](../../develop/http-api/transactions/stream-transactions.md):
+  Perform multi-document transactions with individual begin and commit / abort
+  commands using the new HTTP endpoints or via a supported driver.
+
+- [**Time-to-Live**](../../index-and-search/indexing/basics.md#ttl-time-to-live-index)
+  [**Indexes**](../../index-and-search/indexing/working-with-indexes/ttl-indexes.md):
+  TTL indexes can be used to automatically remove documents in collections for
+  use cases like expiring sessions or automatic purging of statistics or logs.
+
+- [**Index Hints**](../../aql/high-level-operations/for.md#indexhint) &
+  [**Named Indexes**](https://www.arangodb.com/learn/development/index-hints-named-indices/):
+  Indexes can be given names, and an optional AQL inline query option
+  `indexHint` was added to override the internal optimizer decision on which
+  index to utilize.
+
+- [**Data Masking**](../../components/tools/arangodump/maskings.md):
+  arangodump provides a convenient way to extract production data but mask
+  critical information that should not be visible.
+
+**Enterprise Edition**
+
+- [**Hot Backups**](../../operations/backup-and-restore.md#hot-backups):
+  Create automatic, consistent backups of your cluster without noticeable
+  impact on your production systems. In contrast to _arangodump_, hot backups
+  are taken on the level of the underlying storage engine and hence both backup
+  and restore are considerably faster.
+
+- [**SmartJoins**](../../develop/smartjoins.md):
+  Run joins between identically sharded collections with performance close to
+  that of a local join operation.
+
+- **Advanced Data Masking**:
+  There are additional
+  [data masking functions](../../components/tools/arangodump/maskings.md#masking-functions)
+  available in the Enterprise Edition, such as for substituting email addresses
+  and phone numbers with similar looking pseudo-data.
+
+Also see [What's New in 3.5](../../release-notes/version-3.5/whats-new-in-3-5.md).
+
+## Version 3.4
+
+**All Editions**
+
+- [**ArangoSearch**](../../index-and-search/arangosearch/_index.md):
+  Search and similarity ranking engine integrated natively into ArangoDB and
+  AQL. ArangoSearch combines Boolean retrieval capabilities with generalized
+  ranking algorithms (BM25, TF-IDF). Supports relevance-based searching,
+  phrase and prefix matching, complex Boolean searches, and query-time
+  relevance tuning. Search can be combined with all supported data models in a
+  single query. Many specialized language Analyzers are included, e.g. for
+  English, German, French, Chinese, Spanish, and many other languages.
+
+- [**GeoJSON Support**](../../aql/functions/geo.md) and
+  [**S2 Geo Index**](../../index-and-search/indexing/working-with-indexes/geo-spatial-indexes.md): ArangoDB now supports all geo primitives.
+  (Multi-)Point, (Multi-)LineStrings, (Multi-)Polygons or intersections can be
+  defined and queried for. The Google S2 geo index is optimized for RocksDB and
+  enables efficient querying. Geo query results are automatically visualized
+  with an OpenStreetMap integration within the Query Editor of the web interface.
+
+- [**Query Profiler**](../../aql/execution-and-performance/query-profiling.md):
+  Enables the analysis of queries and adds additional information for the user
+  to identify optimization potentials more easily. The profiler can be accessed
+  via _arangosh_ with `db._profileQuery(...)` or via the *Profile* button in the
+  Query Editor of the web interface.
+
+- [**Streaming Cursors**](../../aql/how-to-invoke-aql/with-arangosh.md#stream):
+  Cursors requested with the stream option enabled calculate query results on
+  the fly and make them available to the client as soon as possible, in a
+  streaming fashion.
+
+- **RocksDB as Default Storage Engine**: With ArangoDB 3.4, the default
+  [storage engine](../../components/arangodb-server/storage-engine.md) for fresh installations
+  switched from MMFiles to RocksDB. Many optimizations have been made to RocksDB
+  since the first release in 3.2. For 3.4 we optimized the binary storage
+  format for improved insertion, implemented "optional caching", reduced the
+  replication catch-up time and much more.
+
+Also see [What's New in 3.4](../../release-notes/version-3.4/whats-new-in-3-4.md).
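+
+For instance, the S2-backed geo support added in 3.4 enables distance queries
+like the following sketch (the collection name, field, and coordinates are made
+up for this example):
+
+```aql
+FOR loc IN locations
+  // distance in meters from a GeoJSON point (longitude, latitude)
+  FILTER GEO_DISTANCE(loc.geometry, GEO_POINT(6.96, 50.94)) < 5000
+  RETURN loc.name
+```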
+
+## Version 3.3
+
+**Enterprise Edition**
+
+- [**Datacenter-to-Datacenter Replication**](../../deploy/arangosync/deployment/_index.md):
+  Replicate the entire structure and content of an ArangoDB cluster
+  asynchronously to another cluster in a different datacenter with ArangoSync.
+  Multi-datacenter support means you can fall back to a replica of your cluster
+  in case of a disaster in one datacenter.
+
+- [**Encrypted Backups**](../../components/tools/arangodump/examples.md#encryption):
+  _arangodump_ can create backups encrypted with a secret key using the AES-256
+  block cipher.
+
+**All Editions**
+
+- [**Server-level Replication**](../../release-notes/version-3.3/whats-new-in-3-3.md#server-level-replication):
+  In addition to per-database replication, there is now an additional
+  `globalApplier`. Start the global replication on the Follower once and all
+  current and future databases will be replicated from the Leader to the
+  Follower automatically.
+
+- [**Asynchronous Failover**](../../release-notes/version-3.3/whats-new-in-3-3.md#asynchronous-failover):
+  Make a single server instance resilient with a second server instance, one
+  as Leader and the other as asynchronously replicating Follower, with automatic
+  failover to the Follower if the Leader goes down.
+
+Also see [What's New in 3.3](../../release-notes/version-3.3/whats-new-in-3-3.md).
+
+## Version 3.2
+
+**All Editions**
+
+- [**RocksDB Storage Engine**](../../components/arangodb-server/storage-engine.md): You can now use
+  as much data in ArangoDB as you can fit on your disk. Plus, you can enjoy
+  performance boosts on writes by having only document-level locks.
+
+- [**Pregel**](../../data-science/pregel/_index.md):
+  We implemented distributed graph processing with Pregel to discover hidden
+  patterns, identify communities and perform in-depth analytics of large graph
+  data sets.
+
+- [**Fault-Tolerant Foxx**](../../develop/http-api/foxx.md): The Foxx management
+  internals have been rewritten from the ground up to make sure
+  multi-coordinator cluster setups always keep their services in sync and
+  new Coordinators are fully initialized even when all existing Coordinators
+  are unavailable.
+
+**Enterprise Edition**
+
+- [**LDAP integration**](../../components/arangodb-server/ldap.md): Users and permissions
+  can be managed from outside ArangoDB with an LDAP server in different
+  authentication configurations.
+
+- [**Encryption at Rest**](../../operations/security/encryption-at-rest.md): Let the server
+  persist your sensitive data strongly encrypted to protect it even if the
+  physical storage medium gets stolen.
+
+- [**SatelliteCollections**](../../develop/satellitecollections.md): Faster join operations when
+  working with sharded datasets by synchronously replicating selected
+  collections to all DB-Servers in a cluster, so that joins can be
+  executed locally.
+
+Also see [What's New in 3.2](../../release-notes/version-3.2/whats-new-in-3-2.md).
+
+## Version 3.1
+
+**All Editions**
+
+- [**Vertex-centric indexes**](../../index-and-search/indexing/working-with-indexes/vertex-centric-indexes.md):
+  AQL traversal queries can utilize secondary edge collection
+  indexes for better performance on graphs with supernodes.
+ +- [**VelocyPack over HTTP**](https://www.arangodb.com/2016/10/updated-java-drivers-with-arangodb-3-1/): + In addition to JSON, the binary storage format VelocyPack can now also be + used in transport over the HTTP protocol, as well as streamed using the new + bi-directional asynchronous binary protocol **VelocyStream**. + +**Enterprise Edition** + +- [**SmartGraphs**](../../graphs/smartgraphs/_index.md): Scale with graphs to a + cluster and stay performant. With SmartGraphs you can use the "smartness" + of your application layer to shard your graph efficiently to your machines + and let traversals run locally. + +- **Encryption Control**: Choose your level of [SSL encryption](../../components/arangodb-server/options.md#ssl) + +- [**Auditing**](../../operations/security/audit-logging.md): Keep a detailed log + of all the important things that happened in ArangoDB. + +Also see [What's New in 3.1](../../release-notes/version-3.1/whats-new-in-3-1.md). + +## Version 3.0 + +- [**self-organizing cluster**](../../deploy/cluster/_index.md) with + synchronous replication, master/master setup, shared nothing + architecture, cluster management Agency. + +- Deeply integrated, native [**AQL graph traversal**](../../aql/graphs/_index.md) + +- [**VelocyPack**](https://github.com/arangodb/velocypack) as new internal + binary storage format as well as for intermediate AQL values. + +- [**Persistent indexes**](../../index-and-search/indexing/working-with-indexes/persistent-indexes.md) via RocksDB suitable + for sorting and range queries. + +- [**Foxx 3.0**](../../develop/foxx-microservices/_index.md): overhauled JS framework for data-centric + microservices + +- Significantly improved [**Web Interface**](../../components/web-interface/_index.md) + +Also see [What's New in 3.0](../../release-notes/version-3.0/whats-new-in-3-0.md). diff --git a/site/content/arangodb/oem/about/use-cases.md b/site/content/arangodb/oem/about/use-cases.md new file mode 100644 index 0000000000..0128025595 --- /dev/null +++ b/site/content/arangodb/oem/about/use-cases.md @@ -0,0 +1,164 @@ +--- +title: ArangoDB Use Cases +menuTitle: Use Cases +weight: 15 +description: >- + ArangoDB is a database system with a large solution space because it combines + graphs, documents, key-value, search engine, and machine learning all in one +pageToc: + maxHeadlineLevel: 2 +aliases: + - ../introduction/use-cases +--- +## ArangoDB as a Graph Database + +ArangoDB as a graph database is a great fit for use cases like fraud detection, +knowledge graphs, recommendation engines, identity and access management, +network and IT operations, social media management, traffic management, and many +more. + +### Fraud Detection + +{{< image src="../../../images/icon-fraud-detection.png" alt="Fraud Detection icon" style="float: right; padding: 0 20px; margin-bottom: 20px;">}} + +Uncover illegal activities by discovering difficult-to-detect patterns. +ArangoDB lets you look beyond individual data points in disparate data sources, +allowing you to integrate and harmonize data to analyze activities and +relationships all together, for a broader view of connection patterns, to detect +complex fraudulent behavior such as fraud rings. + +### Recommendation Engine + +{{< image src="../../../images/icon-recommendation-engine.png" alt="Recommendation Engine icon" style="float: right; padding: 0 20px; margin-bottom: 20px;">}} + +Suggest products, services, and information to users based on data relationships. 
+For example, you can use ArangoDB together with PyTorch Geometric to build a +[movie recommendation system](https://www.arangodb.com/2022/04/integrate-arangodb-with-pytorch-geometric-to-build-recommendation-systems/), +by analyzing the movies users watched and then predicting links between the two +with a graph neural network (GNN). + +### Network Management + +{{< image src="../../../images/icon-network-management.png" alt="Network Management icon" style="float: right; padding: 0 20px; margin-bottom: 20px;">}} + +Reduce downtime by connecting and visualizing network, infrastructure, and code. +Network devices and how they interconnect can naturally be modeled as a graph. +Traversal algorithms let you explore the routes between different nodes, with the +option to stop at subnet boundaries or to take things like the connection +bandwidth into account when path-finding. + +### Customer 360 + +{{< image src="../../../images/icon-customer-360.png" alt="Customer 360 icon" style="float: right; padding: 0 20px; margin-bottom: 20px;">}} + +Gain a complete understanding of your customers by integrating multiple data +sources and code. ArangoDB can act as the platform to merge and consolidate +information in any shape, with the added ability to link related records and to +track data origins using graph features. + +### Identity and Access Management + +{{< image src="../../../images/icon-identity-management.png" alt="Identity Management icon" style="float: right; padding: 0 20px; margin-bottom: 20px;">}} + +Increase security and compliance by managing data access based on role and +position. You can map out an organization chart as a graph and use ArangoDB to +determine who is authorized to see which information. Put ArangoDB's graph +capabilities to work to implement access control lists and permission +inheritance. + +### Supply Chain + +{{< image src="../../../images/icon-supply-chain.png" alt="Supply Chain icon" style="float: right; padding: 0 20px; margin-bottom: 20px;">}} + +Speed shipments by monitoring and optimizing the flow of goods through a +supply chain. You can represent your inventory, supplier, and delivery +information as a graph to understand what the possible sources of delays and +disruptions are. + +## ArangoDB as a Document Database + +ArangoDB can be used as the backend for heterogeneous content management, +e-commerce systems, Internet of Things applications, and more generally as a +persistence layer for a broad range of services that benefit from an agile +and scalable data store. + +### Content Management + +{{< image src="../../../images/icon-content-management.png" alt="Content management icon" style="float: right; padding: 0 20px; margin-bottom: 20px;">}} + +Store information of any kind without upfront schema declaration. ArangoDB is +schema-free, storing every data record as a self-contained document, allowing +you to manage heterogeneous content with ease. Build the next (headless) +content management system on top of ArangoDB. + +### E-Commerce Systems + +{{< image src="../../../images/icon-e-commerce.png" alt="E-commerce icon" style="float: right; padding: 0 20px; margin-bottom: 20px;">}} + +ArangoDB combines data modeling freedom with strong consistency and resilience +features to power online shops and ordering systems. Handle product catalog data +with ease using any combination of free text and structured data, and process +checkouts with the necessary transactional guarantees. 
+ +### Internet of Things + +{{< image src="../../../images/icon-internet-of-things.png" alt="Internet of things icon" style="float: right; padding: 0 20px; margin-bottom: 20px;">}} + +Collect sensor readings and other IoT data in ArangoDB for a single view of +everything. Store all data points in the same system that also lets you run +aggregation queries using sliding windows for efficient data analysis. + +## ArangoDB as a Key-Value Database + +{{< image src="../../../images/icon-key-value.png" alt="Key value icon" style="float: right; padding: 0 20px; margin-bottom: 20px;">}} + +Key-value stores are the simplest kind of database systems. Each record is +stored as a block of data under a key that uniquely identifies the record. +The data is opaque, which means the system doesn't know anything about the +contained information, it simply stores it and can retrieve it for you via +the identifiers. + +This paradigm is used at the heart of ArangoDB and allows it to scale well, +but without the limitations of a pure key-value store. Every document has a +`_key` attribute, which is either user-provided or automatically generated. +You can create additional indexes and work with subsets of attributes as +needed, requiring the system to be aware of the stored data structures - unlike +pure key-value stores. + +While ArangoDB can store binary data, it is not designed for +binary large objects (BLOBs) and works best with small to medium-sized +JSON objects. + +For more information about how ArangoDB persists data, see +[Storage Engine](../components/arangodb-server/storage-engine.md). + +## ArangoDB as a Search Engine + +{{< image src="../../../images/icon-search-engine.png" alt="Search engine icon" style="float: right; padding: 0 20px; margin-bottom: 20px;">}} + +ArangoDB has a natively integrated search engine for a broad range of +information retrieval needs. It is powered by inverted indexes and can index +full-text, GeoJSON, as well as arbitrary JSON data. It supports various +kinds of search patterns (tokens, phrases, wildcard, fuzzy, geo-spatial, etc.) +and it can rank results by relevance and similarity using popular +scoring algorithms. + +It also features natural language processing (NLP) capabilities. +{{% comment %}} Experimental feature +and can classify or find similar terms using word embedding models. +{{% /comment %}} + +For more information about the search engine, see [ArangoSearch](../index-and-search/arangosearch/_index.md). + +## ArangoDB for Machine Learning + +You can use ArangoDB as the foundation for machine learning based on graphs +at enterprise scale. You can use it as a metadata store for model training +parameters, run analytical algorithms in the database, or serve operative +queries using data that you computed. + +ArangoDB integrates well into existing data infrastructures and provides +connectors for popular machine learning frameworks and data processing +ecosystems. 
+
+![Machine Learning Architecture of ArangoDB](../../../images/machine-learning-architecture.png)
diff --git a/site/content/arangodb/oem/aql/_index.md b/site/content/arangodb/oem/aql/_index.md
new file mode 100644
index 0000000000..688215f3dc
--- /dev/null
+++ b/site/content/arangodb/oem/aql/_index.md
@@ -0,0 +1,36 @@
+---
+title: AQL Documentation
+menuTitle: AQL
+weight: 70
+description: >-
+  The ArangoDB Query Language (AQL) lets you store, retrieve, and modify data
+  in various ways in ArangoDB
+---
+AQL is mainly a declarative language, meaning that a query expresses what result
+should be achieved but not how it should be achieved. AQL aims to be
+human-readable and therefore uses keywords from the English language. Another
+design goal of AQL was client independence, meaning that the language and syntax
+are the same for all clients, no matter what programming language the clients
+may use. Further design goals of AQL were the support of complex query patterns
+and the different data models ArangoDB offers.
+
+In its purpose, AQL is similar to the Structured Query Language (SQL). AQL supports
+reading and modifying collection data, but it doesn't support data-definition
+operations such as creating and dropping databases, collections and indexes.
+It is a pure data manipulation language (DML), not a data definition language
+(DDL) or a data control language (DCL).
+
+The syntax of AQL queries is different from SQL, even though some keywords overlap.
+Nevertheless, AQL should be easy to understand for anyone with an SQL background.
+
+The general workflow when executing a query is as follows:
+
+1. A client application ships an AQL query to the ArangoDB server. The query text
+   contains everything ArangoDB needs to compute the result set.
+2. ArangoDB parses the query, executes it, and compiles the results. If the
+   query is invalid or cannot be executed, the server returns an error that
+   the client can process and react to. If the query can be executed
+   successfully, the server returns the query results (if any) to the client.
+
+For example queries, see the [Data Queries](data-queries.md) and
+[Examples & Query Patterns](examples-and-query-patterns/_index.md) chapters.
diff --git a/site/content/arangodb/oem/aql/common-errors.md b/site/content/arangodb/oem/aql/common-errors.md
new file mode 100644
index 0000000000..352c3ae8e2
--- /dev/null
+++ b/site/content/arangodb/oem/aql/common-errors.md
@@ -0,0 +1,420 @@
+---
+title: Common Errors in AQL
+menuTitle: Common Errors
+weight: 55
+description: >-
+  Avoid injection vulnerabilities and pitfalls like incorrect operator usage
+  and performance issues when using ArangoDB's query language
+---
+## Trailing semicolons in query strings
+
+Many SQL databases allow sending multiple queries at once. In this case, the
+queries are separated using the semicolon character. Executing a single query
+with a trailing semicolon is often supported as well.
+
+AQL does not support this, and it is a parse error to use a semicolon at the end
+of an AQL query string.
+
+## String concatenation
+
+In AQL, strings must be concatenated using the [`CONCAT()`](functions/string.md#concat)
+function. Joining them together with the `+` operator is not supported.
Especially
+as a JavaScript programmer, it is easy to fall into this trap:
+
+```aql
+RETURN "foo" + "bar" // [ 0 ]
+RETURN "foo" + 123   // [ 123 ]
+RETURN "123" + 200   // [ 323 ]
+```
+
+The arithmetic plus operator expects numbers as operands and tries to implicitly
+cast them to numbers if they are of a different type. `"foo"` and `"bar"` are cast
+to `0` and then added together (still zero). If an actual number is added, that
+number is returned (adding zero doesn't change the result). If the string is a
+valid string representation of a number, then it is cast to a number. Thus, adding
+`"123"` and `200` results in the two numbers being added up to `323`.
+
+To concatenate elements (with implicit casting to string for non-string values), do:
+
+```aql
+RETURN CONCAT("foo", "bar") // [ "foobar" ]
+RETURN CONCAT("foo", 123)   // [ "foo123" ]
+RETURN CONCAT("123", 200)   // [ "123200" ]
+```
+
+## Parameter injection vulnerability
+
+Parameter injection means that potentially malicious content is inserted into a
+query which may change its meaning. It is a security issue that may allow an
+attacker to execute arbitrary queries on the database data.
+
+It often occurs when applications trustingly insert user-provided input into a
+query string and filter it incompletely or incorrectly. It also often occurs
+when applications build queries naively, without using the security mechanisms
+that database software or query-building facilities provide.
+
+AQL is not vulnerable to parameter injection in itself, but queries might be
+constructed on the client side, on an application server or in a Foxx service.
+Assembling query strings with simple **string concatenation** looks trivial,
+but is potentially **unsafe**. You should use
+[bind parameters](fundamentals/bind-parameters.md) instead whenever possible,
+use query building functionality if provided by a driver (see
+[arangojs AQL Helpers](https://arangodb.github.io/arangojs/latest/functions/aql.aql.html)
+for example) or at least sanitize user input with great care.
+
+### Parameter injection examples
+
+Below you find a simple query using the [JavaScript API](../develop/javascript-api/_index.md)
+that is fed with some dynamic input value, pretending it comes from a web form.
+This could be the case in a Foxx service. The route happily picks up the input
+value, and puts it into a query:
+
+```js
+// evil!
+var what = req.params("searchValue"); // user input value from web form
+// ...
+var query = "FOR doc IN collection FILTER doc.value == " + what + " RETURN doc";
+db._query(query).toArray();
+```
+
+The above will probably work fine for numeric input values.
+
+What could an attacker do to this query? Here are a few suggestions to use for
+the `searchValue` parameter:
+
+- for returning all documents in the collection:\
+  `1 || true`
+- for removing all documents:\
+  `1 || true REMOVE doc IN collection //`
+- for inserting new documents:\
+  `1 || true INSERT { foo: "bar" } IN collection //`
+
+It should have become obvious that this is extremely unsafe and should be
+avoided. A pattern often seen to counteract this is trying to quote and escape
+potentially unsafe input values before putting them into query strings.
+This may work in some situations, but it is easy to overlook something or get
+it subtly wrong:
+
+```js
+// We are sanitizing now, but it is still evil!
+var value = req.params("searchValue").replace(/'/g, '');
+// ...
+var query = "FOR doc IN collection FILTER doc.value == '" + value + "' RETURN doc";
+db._query(query).toArray();
+```
+
+The above example uses single quotes for enclosing the potentially unsafe user
+input, and also replaces all single quotes in the input value beforehand.
+Not only may that change the user input (leading to subtle errors such as
+_"why does my search for `O'Brien` not return any results?"_), but it is
+also still unsafe. If the user input contains a backslash at the end
+(e.g. `foo bar\`), that backslash will escape the closing single quote,
+allowing the user input to break out of the string fence again.
+
+It gets worse if user input is inserted into the query at multiple places.
+Let us assume we have a query with two dynamic values:
+
+```js
+query = "FOR doc IN collection FILTER doc.value == '" + value +
+        "' && doc.type == '" + type + "' RETURN doc";
+```
+
+If an attacker inserted `\` for parameter `value` and
+` || true REMOVE doc IN collection //` for parameter `type`, then the effective
+query would become:
+
+```aql
+FOR doc IN collection
+  FILTER doc.value == '\' && doc.type == ' || true
+  REMOVE doc IN collection //' RETURN doc
+```
+
+… which is highly undesirable. The backslash escapes the closing single quote,
+turning the `doc.type` condition into a string, which gets compared to
+`doc.value`. Furthermore, an always-true OR condition as well as a remove
+operation are injected, changing the query's purpose entirely. The original
+return operation gets commented out and the query will truncate the collection
+instead of returning a few documents.
+
+### Avoiding parameter injection
+
+Instead of mixing query string fragments with user inputs naively via string
+concatenation, use either **bind parameters** or a **query builder**. Both can
+help to avoid the problem of injection, because they allow separating the actual
+query operations (like `FOR`, `INSERT`, `REMOVE`) from (user input) values.
+
+Below, the focus is on bind parameters. This is not to say that query builders
+shouldn't be used. They were simply omitted here for the sake of simplicity.
+
+#### What bind parameters are
+
+Bind parameters in AQL queries are special tokens that act as placeholders for
+actual values. Here's an example:
+
+```aql
+FOR doc IN collection
+  FILTER doc.value == @what
+  RETURN doc
+```
+
+In the above query, `@what` is a bind parameter. In order to execute this query,
+a value for bind parameter `@what` must be specified. Otherwise, query execution
+fails with error 1551 (*no value specified for declared bind parameter*). If a value
+for `@what` gets specified, the query can be executed. However, the query string
+and the bind parameter values (i.e. the contents of the `@what` bind parameter) will
+be handled separately. What's in the bind parameter will always be treated as a value,
+and it can't get out of its sandbox and change the semantic meaning of a query.
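+
+For instance, executing the above query without supplying a value for `@what`
+fails right away. A minimal arangosh sketch (it assumes a collection named
+`collection` exists):
+
+```js
+var query = "FOR doc IN collection FILTER doc.value == @what RETURN doc";
+try {
+  db._query(query); // no value for @what supplied
+} catch (err) {
+  // fails with error 1551 (no value specified for declared bind parameter)
+  print(err.errorNum + ": " + err.errorMessage);
+}
+```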
+
+#### How bind parameters are used
+
+To execute a query with bind parameters, the query string (containing the bind
+parameters) and the bind parameter values are specified separately (note that when
+the bind parameter value is assigned, the prefix `@` needs to be omitted):
+
+```js
+// query string with bind parameter
+var query = "FOR doc IN collection FILTER doc.value == @what RETURN doc";
+
+// actual value for bind parameter
+var params = { what: 42 };
+
+// run query, specifying query string and bind parameter separately
+db._query(query, params).toArray();
+```
+
+If a malicious user set `@what` to a value of `1 || true`, this wouldn't do
+any harm. AQL would treat the contents of `@what` as a single string token, and
+the meaning of the query would remain unchanged. The actually executed query would be:
+
+```aql
+FOR doc IN collection
+  FILTER doc.value == "1 || true"
+  RETURN doc
+```
+
+Thanks to bind parameters it is also impossible to turn a selection (i.e. read-only)
+query into a data deletion query.
+
+#### Using JavaScript variables as bind parameters
+
+There is also a template string generator function `aql` that can be used to safely
+(and conveniently) build AQL queries using JavaScript variables and expressions. It
+can be invoked as follows:
+
+```js
+const aql = require('@arangodb').aql; // not needed in arangosh
+
+var value = "some input value";
+var query = aql`FOR doc IN collection
+  FILTER doc.value == ${value}
+  RETURN doc`;
+var result = db._query(query).toArray();
+```
+
+Note that an ES6 template string is used for populating the `query` variable.
+The string is assembled using the `aql` generator function which is bundled
+with ArangoDB. The template string can contain references to JavaScript
+variables or expressions via `${...}`. In the above example, the query
+references a variable named `value`. The `aql` function generates an object
+with two separate attributes: the query string, containing references to
+bind parameters, and the actual bind parameter values.
+
+Bind parameter names are automatically generated by the `aql` function:
+
+```js
+var value = "some input value";
+aql`FOR doc IN collection FILTER doc.value == ${value} RETURN doc`;
+
+{
+  "query" : "FOR doc IN collection FILTER doc.value == @value0 RETURN doc",
+  "bindVars" : {
+    "value0" : "some input value"
+  }
+}
+```
+
+#### Using bind parameters in dynamic queries
+
+Bind parameters are helpful, so it makes sense to use them for handling the
+dynamic values. You can even use them for queries that themselves are highly
+dynamic, for example with conditional `FILTER` and `LIMIT` parts.
+Here's how to do this:
+
+```js
+// Note: this example has a slight issue... hang on reading
+var query = "FOR doc IN collection";
+var params = { };
+
+if (useFilter) {
+  query += " FILTER doc.value == @what";
+  params.what = req.params("searchValue");
+}
+
+if (useLimit) {
+  // not quite right, see below
+  query += " LIMIT @offset, @count";
+  params.offset = req.params("offset");
+  params.count = req.params("count");
+}
+
+query += " RETURN doc";
+db._query(query, params).toArray();
+```
+
+Note that in this example we're back to string concatenation, but without the
+problem of the query being vulnerable to arbitrary modifications.
+
+#### Input value validation and sanitation
+
+Still, you should be paranoid and try to detect invalid input values
+as early as possible, at least before executing a query with them.
This is
+because some input parameters may affect the runtime behavior of queries
+negatively or, when modified, may lead to queries throwing runtime errors
+instead of returning valid results. This isn't something you should let an
+attacker achieve.
+
+`LIMIT` is a good example of this: if used with a single argument, the
+argument should be numeric. When `LIMIT` is given a string value, executing
+the query will fail. You may want to detect this early and not return an
+HTTP 500 error (as this would signal attackers that they succeeded in breaking
+your application).
+
+Another problem with `LIMIT` is that high `LIMIT` values are likely more
+expensive than low ones, and you may want to disallow using `LIMIT` values
+exceeding a certain threshold.
+
+Here is what you could do in such cases:
+
+```js
+var query = "FOR doc IN collection LIMIT @count RETURN doc";
+
+// some default value for limit
+var params = { count: 100 };
+
+if (useLimit) {
+  var count = req.params("count");
+
+  // abort if value does not look like an integer
+  if (! /^\d+$/.test(count)) {
+    throw "invalid count value!";
+  }
+
+  // actually turn it into an integer
+  params.count = parseInt(count, 10); // turn into numeric value
+}
+
+if (params.count < 1 || params.count > 1000) {
+  // value is outside of accepted thresholds
+  throw "invalid count value!";
+}
+
+db._query(query, params).toArray();
+```
+
+This is a bit more complex, but that is a price you are likely willing to pay
+for a bit of extra safety. In reality, you may want to use a framework for
+validation (such as [joi](https://www.npmjs.com/package/joi)
+which comes bundled with ArangoDB) instead of writing your own checks all over
+the place.
+
+#### Bind parameter types
+
+There are two types of bind parameters in AQL:
+
+- Bind parameters for **values**:\
+  Those are prefixed with a single `@` in AQL queries, and are specified
+  without the prefix when they get their value assigned. These bind parameters
+  can contain any valid JSON value.
+
+  Examples: `@what`, `@searchValue`
+
+- Bind parameters for **collections**:\
+  These are prefixed with `@@` in AQL queries, and are replaced with the name
+  of a collection. When the bind parameter value is assigned, the parameter
+  itself must be specified with a single `@` prefix. Only string values are
+  allowed for this type of bind parameter.
+
+  Examples: `@@collection`, `@@edgeColl`
+
+The latter type of bind parameter is probably not used as often, and it should
+not be used together with user input. Otherwise, users may freely determine
+which collection your AQL queries will operate on (this might be a valid
+use case, but normally it is extremely undesired).
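+
+A minimal sketch that combines both types (assuming a collection named
+`collection` exists). Note that the collection bind parameter is written as
+`@@coll` in the query string, but its value is assigned under the key `@coll`:
+
+```js
+var query = "FOR doc IN @@coll FILTER doc.value == @what RETURN doc";
+db._query(query, { "@coll": "collection", what: 42 }).toArray();
+```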
+
+## Unexpected long running queries
+
+Slow queries can have various causes and be legitimate for queries with a high
+computational complexity or if they touch a lot of data. Use the *Explain*
+feature to inspect execution plans and verify that appropriate indexes are
+utilized. Also check for mistakes such as references to the wrong variables.
+
+A literal collection name, which is not part of constructs like `FOR`,
+`UPDATE ... IN` etc., stands for an array of all documents of that collection
+and can cause an entire collection to be materialized before further
+processing. It should thus be avoided.
+
+Check the execution plan for `/* all collection documents */` and verify that
+it is intended. You should also see a warning if you execute such a query:
+
+> collection 'coll' used as expression operand
+
+For example, instead of:
+
+```aql
+RETURN coll[* LIMIT 1]
+```
+
+... with the execution plan ...
+
+```aql
+Execution plan:
+ Id   NodeType          Est.   Comment
+  1   SingletonNode        1   * ROOT
+  2   CalculationNode      1     - LET #2 = coll   /* all collection documents */[* LIMIT  0, 1]   /* v8 expression */
+  3   ReturnNode           1     - RETURN #2
+```
+
+... you can use the following equivalent query:
+
+```aql
+FOR doc IN coll
+  LIMIT 1
+  RETURN doc
+```
+
+... with the (better) execution plan:
+
+```aql
+Execution plan:
+ Id   NodeType                  Est.   Comment
+  1   SingletonNode                1   * ROOT
+  2   EnumerateCollectionNode     44     - FOR doc IN coll   /* full collection scan */
+  3   LimitNode                    1     - LIMIT 0, 1
+  4   ReturnNode                   1     - RETURN doc
+```
+
+Similarly, make sure you have not confused any variable names with collection
+names by accident:
+
+```aql
+LET names = ["John", "Mary", ...]
+// supposed to refer to variable "names", not collection "Names"
+FOR name IN Names
+  ...
+```
+
+You can set the startup option `--query.allow-collections-in-expressions` to
+*false* to disallow collection names in arbitrary places in AQL expressions
+to prevent such mistakes. Also see
+[ArangoDB Server Query Options](../components/arangodb-server/options.md#--queryallow-collections-in-expressions).
+
+{{% comment %}}
+Rename to Error Sources?
+
+Quote marks around bind parameter placeholders
+https://github.com/arangodb/arangodb/issues/1634#issuecomment-167808660
+
+FILTER HAS(doc, "attr") instead of FILTER doc.attr / FILTER doc.attr != null
+
+collection ... not found error, e.g. access of variable after COLLECT (no longer existing)
+{{% /comment %}}
diff --git a/site/content/arangodb/oem/aql/data-queries.md b/site/content/arangodb/oem/aql/data-queries.md
new file mode 100644
index 0000000000..1b07b4214a
--- /dev/null
+++ b/site/content/arangodb/oem/aql/data-queries.md
@@ -0,0 +1,554 @@
+---
+title: AQL Data Queries
+menuTitle: Data Queries
+weight: 20
+description: >-
+  With AQL queries, you can read and write data in the form of documents
+---
+There are two fundamental types of AQL queries:
+- queries which access data (read documents)
+- queries which modify data (create, update, replace, delete documents)
+
+## Data Access Queries
+
+Retrieving data from the database with AQL always includes a **RETURN**
+operation. It can be used to return a static value, such as a string:
+
+```aql
+RETURN "Hello ArangoDB!"
+```
+
+The query result is always an array of elements, even if only a single element
+is returned. In that case, the array contains a single element: `["Hello ArangoDB!"]`
+
+The function `DOCUMENT()` can be called to retrieve a single document via
+its document identifier, for instance:
+
+```aql
+RETURN DOCUMENT("users/phil")
+```
+
+`RETURN` is usually accompanied by a **FOR** loop to iterate over the
+documents of a collection. The following query executes the loop body for all
+documents of a collection called `users`. Each document is returned unchanged
+in this example:
+
+```aql
+FOR doc IN users
+  RETURN doc
+```
+
+Instead of returning the raw `doc`, one can easily create a projection:
+
+```aql
+FOR doc IN users
+  RETURN { user: doc, newAttribute: true }
+```
+
+For every user document, an object with two attributes is returned. The value
+of the attribute `user` is set to the content of the user document, and
+`newAttribute` is a static attribute with the boolean value `true`.
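+
+A projection does not have to wrap the full document either. For instance, to
+return only selected attributes (this assumes the documents have `name` and
+`status` attributes):
+
+```aql
+FOR doc IN users
+  RETURN { name: doc.name, status: doc.status }
+```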
+
+Operations like **FILTER**, **SORT** and **LIMIT** can be added to the loop body
+to narrow and order the result. Instead of the call to `DOCUMENT()` shown above,
+one can also retrieve the document that describes user `phil` like so:
+
+```aql
+FOR doc IN users
+  FILTER doc._key == "phil"
+  RETURN doc
+```
+
+The document key is used in this example, but any other attribute could equally
+be used for filtering. Since the document key is guaranteed to be unique, no
+more than a single document can match this filter. For other attributes this
+may not be the case. To return a subset of active users (determined by an
+attribute called `status`), sorted by name in ascending order, you can do:
+
+```aql
+FOR doc IN users
+  FILTER doc.status == "active"
+  SORT doc.name
+  LIMIT 10
+  RETURN doc
+```
+
+Note that operations do not have to occur in a fixed order and that their order
+can influence the result significantly. Limiting the number of documents
+before a filter is usually not what you want, because it easily misses a lot
+of documents that would fulfill the filter criterion, but are ignored because
+of a premature `LIMIT` clause. For these reasons, `LIMIT`
+is usually put at the very end, after `FILTER`, `SORT` and other operations.
+
+See the [High Level Operations](high-level-operations/_index.md) chapter for more details.
+
+## Data Modification Queries
+
+AQL supports the following data modification operations:
+
+- **INSERT**: insert new documents into a collection
+- **UPDATE**: partially update existing documents in a collection
+- **REPLACE**: completely replace existing documents in a collection
+- **REMOVE**: remove existing documents from a collection
+- **UPSERT**: conditionally insert or update documents in a collection
+
+You can use them to modify the data of one or multiple documents with a single
+query. This is superior to fetching and updating the documents individually with
+multiple queries. However, if only a single document needs to be modified,
+ArangoDB's specialized data modification operations for single documents might
+execute faster.
+
+Below you find some simple example queries that use these operations.
+The operations are detailed in the chapter [High Level Operations](high-level-operations/_index.md).
+
+### Modifying a single document
+
+Let's start with the basics: `INSERT`, `UPDATE` and `REMOVE` operations on single documents.
+Here is an example that inserts a document into a collection called `users` with
+the [`INSERT` operation](high-level-operations/insert.md):
+
+```aql
+INSERT {
+  firstName: "Anna",
+  name: "Pavlova",
+  profession: "artist"
+} INTO users
+```
+
+The collection needs to exist before executing the query. AQL queries cannot
+create collections.
+
+If you run the above query, the result is an empty array because we did
+not specify what to return using a `RETURN` keyword. It is optional in
+modification queries, but mandatory in data access queries. Despite the empty
+result, the above query still creates a new user document.
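+
+If the `users` collection does not exist yet, you can create it beforehand,
+for example in arangosh (a minimal sketch, outside of AQL):
+
+```js
+db._create("users");
+```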
+
+You may provide a key for the new document; if not provided, ArangoDB creates
+one for you:
+
+```aql
+INSERT {
+  _key: "GilbertoGil",
+  firstName: "Gilberto",
+  name: "Gil",
+  city: "Fortalezza"
+} INTO users
+```
+
+As ArangoDB is schema-free, attributes of the documents may vary:
+
+```aql
+INSERT {
+  _key: "PhilCarpenter",
+  firstName: "Phil",
+  name: "Carpenter",
+  middleName: "G.",
+  status: "inactive"
+} INTO users
+```
+
+```aql
+INSERT {
+  _key: "NatachaDeclerck",
+  firstName: "Natacha",
+  name: "Declerck",
+  location: "Antwerp"
+} INTO users
+```
+
+The [`UPDATE` operation](high-level-operations/update.md) lets you add or change
+attributes of existing documents. The following query modifies a previously
+created user, changing the `status` attribute and adding a `location` attribute:
+
+```aql
+UPDATE "PhilCarpenter" WITH {
+  status: "active",
+  location: "Beijing"
+} IN users
+```
+
+The [`REPLACE` operation](high-level-operations/replace.md) is an alternative to the
+`UPDATE` operation that lets you replace all attributes of a document
+(except for attributes that cannot be changed, like `_key`):
+
+```aql
+REPLACE {
+  _key: "NatachaDeclerck",
+  firstName: "Natacha",
+  name: "Leclerc",
+  status: "active",
+  level: "premium"
+} IN users
+```
+
+You can delete a document with the [`REMOVE` operation](high-level-operations/remove.md),
+only requiring the document key to identify it:
+
+```aql
+REMOVE "GilbertoGil" IN users
+```
+
+### Modifying multiple documents
+
+Data modification operations are normally combined with `FOR` loops to
+iterate over a given list of documents. They can optionally be combined with
+`FILTER` statements and the like.
+
+To create multiple new documents, use the `INSERT` operation together with `FOR`.
+You can also use `INSERT` to generate copies of existing documents from other
+collections, or to create synthetic documents (e.g. for testing purposes).
+The following query creates 1000 test users with some attributes and stores
+them in the `users` collection:
+
+```aql
+FOR i IN 1..1000
+  INSERT {
+    id: 100000 + i,
+    age: 18 + FLOOR(RAND() * 25),
+    name: CONCAT('test', TO_STRING(i)),
+    status: i % 2 == 0 ? "active" : "not active",
+    active: false,
+    gender: i % 3 == 0 ? "male" : i % 3 == 1 ? "female" : "diverse"
+  } IN users
+```
+
+Let's modify existing documents that match some condition:
+
+```aql
+FOR u IN users
+  FILTER u.status == "not active"
+  UPDATE u WITH { status: "inactive" } IN users
+```
+
+You can also update existing attributes based on their previous value:
+
+```aql
+FOR u IN users
+  FILTER u.active == true
+  UPDATE u WITH { numberOfLogins: u.numberOfLogins + 1 } IN users
+```
+
+The above query only works if there is already a `numberOfLogins` attribute
+present in the document. If it is unclear whether there is a `numberOfLogins`
+attribute in the document, the increase must be made conditional:
+
+```aql
+FOR u IN users
+  FILTER u.active == true
+  UPDATE u WITH {
+    numberOfLogins: HAS(u, "numberOfLogins") ? u.numberOfLogins + 1 : 1
+  } IN users
+```
+
+Updates of multiple attributes can be combined in a single query:
+
+```aql
+FOR u IN users
+  FILTER u.active == true
+  UPDATE u WITH {
+    lastLogin: DATE_NOW(),
+    numberOfLogins: HAS(u, "numberOfLogins") ? u.numberOfLogins + 1 : 1
+  } IN users
+```
+
+Note that an update query might fail during execution, for example, because a
+document to be updated does not exist. In this case, the query aborts at
+the first error. In single server mode, all modifications done by the query
+are rolled back as if they never happened.
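+
+For instance, a single-document update like the following aborts with a
+"document not found" error if no document with this (hypothetical) key exists:
+
+```aql
+UPDATE "nonExistingKey" WITH { status: "active" } IN users
+```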
+
+You can copy documents from one collection to another by reading from one
+collection but writing to another.
+Let's copy the contents of the `users` collection into the `backup` collection:
+
+```aql
+FOR u IN users
+  INSERT u IN backup
+```
+
+Note that both collections must already exist when the query is executed.
+The query might fail if the `backup` collection already contains documents,
+as executing the insert might attempt to insert the same document (identified
+by the `_key` attribute) again. This triggers a unique key constraint violation
+and aborts the query. In single server mode, all changes made by the query
+are also rolled back.
+To make such a copy operation work in all cases, the target collection can
+be emptied beforehand, using a `REMOVE` query or by truncating it by other means.
+
+To completely replace existing documents rather than just partially updating
+them, use the `REPLACE` operation.
+The following query replaces all documents in the `backup` collection with
+the documents found in the `users` collection. Documents common to both
+collections are replaced. All other documents remain unchanged.
+Documents are compared using their `_key` attributes:
+
+```aql
+FOR u IN users
+  REPLACE u IN backup
+```
+
+The above query fails if there are documents in the `users` collection that are
+not in the `backup` collection yet. In this case, the query would attempt to replace
+documents that do not exist. If such a case is detected while executing the query,
+the query is aborted. In single server mode, all changes made by the query are
+rolled back.
+
+To make the query succeed regardless of such errors, use the `ignoreErrors`
+query option:
+
+```aql
+FOR u IN users
+  REPLACE u IN backup OPTIONS { ignoreErrors: true }
+```
+
+This continues the query execution if errors occur during a `REPLACE`, `UPDATE`,
+`INSERT`, or `REMOVE` operation.
+
+Finally, let's find some documents in collection `users` and remove them
+from collection `backup`. The link between the documents in both collections is
+established via the documents' keys:
+
+```aql
+FOR u IN users
+  FILTER u.status == "deleted"
+  REMOVE u IN backup
+```
+
+The following example removes all documents from both `users` and `backup`:
+
+```aql
+LET r1 = (FOR u IN users REMOVE u IN users)
+LET r2 = (FOR u IN backup REMOVE u IN backup)
+RETURN true
+```
+
+### Altering substructures
+
+To modify lists in documents, for example, to update specific attributes of
+objects in an array, you can compute a new array and then update the document
+attribute in question. This may involve the use of subqueries and temporary
+variables.
+
+Create a collection named `complexCollection` and run the following query:
+
+```aql
+FOR doc IN [
+  {
+    "topLevelAttribute": "a",
+    "subList": [
+      {
+        "attributeToAlter": "value to change",
+        "filterByMe": true
+      },
+      {
+        "attributeToAlter": "another value to change",
+        "filterByMe": true
+      },
+      {
+        "attributeToAlter": "keep this value",
+        "filterByMe": false
+      }
+    ]
+  },
+  {
+    "topLevelAttribute": "b",
+    "subList": [
+      {
+        "attributeToAlter": "keep this value",
+        "filterByMe": false
+      }
+    ]
+  }
+] INSERT doc INTO complexCollection
+```
+
+The following query updates the `subList` top-level attribute of documents.
+The `attributeToAlter` values in the nested objects are changed if the adjacent
+`filterByMe` attribute is `true`:
+
+```aql
+FOR doc IN complexCollection
+  LET alteredList = (
+    FOR element IN doc.subList
+      RETURN element.filterByMe
+      ? MERGE(element, { attributeToAlter: "new value" })
+      : element
+  )
+  UPDATE doc WITH { subList: alteredList } IN complexCollection
+  RETURN NEW
+```
+
+```json
+[
+  {
+    "_key": "2607",
+    "_id": "complexCollection/2607",
+    "_rev": "_fWb_iOO---",
+    "topLevelAttribute": "a",
+    "subList": [
+      {
+        "attributeToAlter": "new value",
+        "filterByMe": true
+      },
+      {
+        "attributeToAlter": "new value",
+        "filterByMe": true
+      },
+      {
+        "attributeToAlter": "keep this value",
+        "filterByMe": false
+      }
+    ]
+  },
+  {
+    "_key": "2608",
+    "_id": "complexCollection/2608",
+    "_rev": "_fWb_iOO--_",
+    "topLevelAttribute": "b",
+    "subList": [
+      {
+        "attributeToAlter": "keep this value",
+        "filterByMe": false
+      }
+    ]
+  }
+]
+```
+
+To improve the query's performance, you can update documents only if the
+`subList` actually changes. Instead of comparing the current and the
+altered list directly, you may compare their hash values using the
+[`HASH()` function](functions/miscellaneous.md#hash), which is faster for
+larger objects and arrays. You can also replace the subquery with an
+[inline expression](operators.md#inline-expressions):
+
+```aql
+FOR doc IN complexCollection
+  LET alteredList = doc.subList[*
+    RETURN CURRENT.filterByMe
+    ? MERGE(CURRENT, { attributeToAlter: "new value" })
+    : CURRENT
+  ]
+  FILTER HASH(doc.subList) != HASH(alteredList)
+  UPDATE doc WITH { subList: alteredList } IN complexCollection
+  RETURN NEW
+```
+
+### Returning documents
+
+Data modification queries can optionally return documents. In order to reference
+the inserted, removed or modified documents in a `RETURN` statement, data modification
+statements introduce the `OLD` and/or `NEW` pseudo-values:
+
+```aql
+FOR i IN 1..100
+  INSERT { value: i } IN test
+  RETURN NEW
+```
+
+```aql
+FOR u IN users
+  FILTER u.status == "deleted"
+  REMOVE u IN users
+  RETURN OLD
+```
+
+```aql
+FOR u IN users
+  FILTER u.status == "not active"
+  UPDATE u WITH { status: "inactive" } IN users
+  RETURN NEW
+```
+
+`NEW` refers to the inserted or modified document revision, and `OLD` refers
+to the document revision before update or removal. `INSERT` statements can
+only refer to the `NEW` pseudo-value, and `REMOVE` operations only to `OLD`.
+`UPDATE`, `REPLACE` and `UPSERT` can refer to either.
+
+In all cases, the full documents are returned with all their attributes,
+including the potentially auto-generated attributes, such as `_id`, `_key`, and `_rev`,
+and the attributes not specified in the update expression of a partial update.
+
+#### Projections of OLD and NEW
+
+It is possible to return a projection of the documents with `OLD` or `NEW` instead of
+returning the entire documents. This can be used to reduce the amount of data returned
+by queries.
+
+For example, the following query returns only the keys of the inserted documents:
+
+```aql
+FOR i IN 1..100
+  INSERT { value: i } IN test
+  RETURN NEW._key
+```
+
+#### Using OLD and NEW in the same query
+
+For `UPDATE`, `REPLACE`, and `UPSERT` operations, both `OLD` and `NEW` can be used
+to return the previous revision of a document together with the updated revision:
+
+```aql
+FOR u IN users
+  FILTER u.status == "not active"
+  UPDATE u WITH { status: "inactive" } IN users
+  RETURN { old: OLD, new: NEW }
+```
+
+#### Calculations with OLD or NEW
+
+It is also possible to run additional calculations with `LET` statements between the
+data modification part and the final `RETURN` of an AQL query.
+For example, the following query performs an upsert operation and returns
+whether an existing document was updated or a new document was inserted. It
+does so by checking the `OLD` variable after the `UPSERT` and using a `LET`
+statement to store a temporary string for the operation type:
+
+```aql
+UPSERT { name: "test" }
+  INSERT { name: "test" }
+  UPDATE { } IN users
+LET opType = IS_NULL(OLD) ? "insert" : "update"
+RETURN { _key: NEW._key, type: opType }
+```
+
+### Restrictions
+
+The name of the modified collection (`users` and `backup` in the above cases)
+must be known to the AQL executor at query-compile time and cannot change at
+runtime. Using a bind parameter to specify the
+[collection name](../concepts/data-structure/collections.md#collection-names) is allowed.
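+
+For example, the collection name can be supplied as a collection bind parameter,
+which is written with a `@@` prefix in the query string. A sketch using arangosh,
+reusing the `users` collection from above:
+
+```js
+var query = "FOR u IN @@coll FILTER u.status == 'deleted' REMOVE u IN @@coll";
+db._query(query, { "@coll": "users" });
+```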
+
+It is not possible to use multiple data modification operations for the same
+collection in the same query, or follow up a data modification operation for a
+specific collection with a read operation for the same collection. Neither is
+it possible to follow up any data modification operation with a traversal query
+(which may read from arbitrary collections not necessarily known at the start of
+the traversal).
+
+That means you may not place several `REMOVE` or `UPDATE` statements for the same
+collection into the same query. It is however possible to modify different collections
+by using multiple data modification operations for different collections in the
+same query.
+In case you have a query with several places that need to remove documents from the
+same collection, it is recommended to collect these documents or their keys in an array
+and have the documents from that array removed using a single `REMOVE` operation.
+
+Data modification operations can optionally be followed by `LET` operations to
+perform further calculations and a `RETURN` operation to return data.
+
+### Transactional Execution
+
+On a single server, data modification operations are executed transactionally.
+If a data modification operation fails, any changes made by it are rolled
+back automatically as if they never happened.
+
+A query may execute intermediate transaction commits in case the running
+transaction (AQL query) hits the specified size thresholds. In this case, the
+query's operations carried out so far are committed and not rolled back in case
+of a later abort/rollback. This behavior can be controlled by adjusting the
+intermediate commit settings for the RocksDB engine. See
+[Known limitations for AQL queries](fundamentals/limitations.md#storage-engine-properties).
+
+In a cluster, AQL data modification queries are not executed transactionally.
+Additionally, AQL queries with `UPDATE`, `REPLACE`, `UPSERT`, or `REMOVE`
+operations require the `_key` attribute to be specified for all documents that
+should be modified or removed, even if a shard key attribute other than `_key`
+is chosen for the collection.
diff --git a/site/content/arangodb/oem/aql/examples-and-query-patterns/_index.md b/site/content/arangodb/oem/aql/examples-and-query-patterns/_index.md
new file mode 100644
index 0000000000..bd78af7eee
--- /dev/null
+++ b/site/content/arangodb/oem/aql/examples-and-query-patterns/_index.md
@@ -0,0 +1,115 @@
+---
+title: AQL Query Patterns and Examples
+menuTitle: Examples & Query Patterns
+weight: 40
+description: >-
+  Create test data, count documents, use joins, group attributes, traverse
+  graphs, and other examples
+---
+These pages contain some common query patterns with examples. For better
+comprehension, the query results are also included directly below each query.
+
+Normally, you would want to run queries on data stored in collections.
+This section provides several examples of that.
+
+## Things to consider when running queries on collections
+
+Note that all documents created in any collections will automatically get the
+following server-generated attributes:
+
+- `_id`: A unique id, consisting of [collection name](../../concepts/data-structure/collections.md#collection-names)
+  and a server-side sequence value
+- `_key`: The server sequence value
+- `_rev`: The document's revision id
+
+Whenever you run queries on the documents in collections, don't be surprised if
+these additional attributes are returned as well.
+
+Please also note that with real-world data, you might want to create additional
+indexes on the data (left out here for brevity). Adding indexes on attributes that are
+used in `FILTER` statements may considerably speed up queries. Furthermore, instead of
+using attributes such as `id`, `from` and `to`, you might want to use the built-in
+`_id`, `_from` and `_to` attributes. Finally, [edge collections](../../concepts/data-models.md#graph-model)
+provide a nice way of establishing references/links between documents.
+These features have been left out here for brevity as well.
+
+## Example data
+
+Some of the following example queries are executed on a collection *users*
+with the following initial data:
+
+```json
+[
+  { "id": 100, "name": "John", "age": 37, "active": true, "gender": "m" },
+  { "id": 101, "name": "Fred", "age": 36, "active": true, "gender": "m" },
+  { "id": 102, "name": "Jacob", "age": 35, "active": false, "gender": "m" },
+  { "id": 103, "name": "Ethan", "age": 34, "active": false, "gender": "m" },
+  { "id": 104, "name": "Michael", "age": 33, "active": true, "gender": "m" },
+  { "id": 105, "name": "Alexander", "age": 32, "active": true, "gender": "m" },
+  { "id": 106, "name": "Daniel", "age": 31, "active": true, "gender": "m" },
+  { "id": 107, "name": "Anthony", "age": 30, "active": true, "gender": "m" },
+  { "id": 108, "name": "Jim", "age": 29, "active": true, "gender": "m" },
+  { "id": 109, "name": "Diego", "age": 28, "active": true, "gender": "m" },
+  { "id": 200, "name": "Sophia", "age": 37, "active": true, "gender": "f" },
+  { "id": 201, "name": "Emma", "age": 36, "active": true, "gender": "f" },
+  { "id": 202, "name": "Olivia", "age": 35, "active": false, "gender": "f" },
+  { "id": 203, "name": "Madison", "age": 34, "active": true, "gender": "x" },
+  { "id": 204, "name": "Chloe", "age": 33, "active": true, "gender": "f" },
+  { "id": 205, "name": "Eva", "age": 32, "active": false, "gender": "f" },
+  { "id": 206, "name": "Abigail", "age": 31, "active": true, "gender": "f" },
+  { "id": 207, "name": "Isabella", "age": 30, "active": true, "gender": "f" },
+  { "id": 208, "name": "Mary", "age": 29, "active": true, "gender": "f" },
+  { "id": 209, "name": "Mariah", "age": 28, "active": true, "gender": "f" }
+]
+```
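+
+To follow along, you can insert this example data yourself, for instance from
+arangosh. A sketch, assuming the `users` collection does not exist yet:
+
+```js
+var users = db._create("users");
+[
+  { "id": 100, "name": "John", "age": 37, "active": true, "gender": "m" },
+  // ... the remaining documents from the list above
+].forEach(function (doc) {
+  users.save(doc);
+});
+```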
+
+For some of the examples, we'll also use a collection *relations* to store
+relationships between users. The example data for *relations* are as follows:
+
+```json
+[
+  { "from": 209, "to": 205, "type": "friend" },
+  { "from": 206, "to": 108, "type": "friend" },
+  { "from": 202, "to": 204, "type": "friend" },
+  { "from": 200, "to": 100, "type": "friend" },
+  { "from": 205, "to": 101, "type": "friend" },
+  { "from": 209, "to": 203, "type": "friend" },
+  { "from": 200, "to": 203, "type": "friend" },
+  { "from": 100, "to": 208, "type": "friend" },
+  { "from": 101, "to": 209, "type": "friend" },
+  { "from": 206, "to": 102, "type": "friend" },
+  { "from": 104, "to": 100, "type": "friend" },
+  { "from": 104, "to": 108, "type": "friend" },
+  { "from": 108, "to": 209, "type": "friend" },
+  { "from": 206, "to": 106, "type": "friend" },
+  { "from": 204, "to": 105, "type": "friend" },
+  { "from": 208, "to": 207, "type": "friend" },
+  { "from": 102, "to": 108, "type": "friend" },
+  { "from": 207, "to": 203, "type": "friend" },
+  { "from": 203, "to": 106, "type": "friend" },
+  { "from": 202, "to": 108, "type": "friend" },
+  { "from": 201, "to": 203, "type": "friend" },
+  { "from": 105, "to": 100, "type": "friend" },
+  { "from": 100, "to": 109, "type": "friend" },
+  { "from": 207, "to": 109, "type": "friend" },
+  { "from": 103, "to": 203, "type": "friend" },
+  { "from": 208, "to": 104, "type": "friend" },
+  { "from": 105, "to": 104, "type": "friend" },
+  { "from": 103, "to": 208, "type": "friend" },
+  { "from": 203, "to": 107, "type": "boyfriend" },
+  { "from": 107, "to": 203, "type": "girlfriend" },
+  { "from": 208, "to": 109, "type": "boyfriend" },
+  { "from": 109, "to": 208, "type": "girlfriend" },
+  { "from": 106, "to": 205, "type": "girlfriend" },
+  { "from": 205, "to": 106, "type": "boyfriend" },
+  { "from": 103, "to": 209, "type": "girlfriend" },
+  { "from": 209, "to": 103, "type": "boyfriend" },
+  { "from": 201, "to": 102, "type": "boyfriend" },
+  { "from": 102, "to": 201, "type": "girlfriend" },
+  { "from": 206, "to": 100, "type": "boyfriend" },
+  { "from": 100, "to": 206, "type": "girlfriend" }
+]
+```
diff --git a/site/content/arangodb/oem/aql/examples-and-query-patterns/actors-and-movies-dataset-queries.md b/site/content/arangodb/oem/aql/examples-and-query-patterns/actors-and-movies-dataset-queries.md
new file mode 100644
index 0000000000..ef1c1f17d5
--- /dev/null
+++ b/site/content/arangodb/oem/aql/examples-and-query-patterns/actors-and-movies-dataset-queries.md
@@ -0,0 +1,859 @@
+---
+title: AQL Example Queries on an Actors and Movies Dataset
+menuTitle: Actors & Movies Dataset Queries
+weight: 35
+description: >-
+  Example queries showing different AQL query features and combinations of them
+---
+Given a graph `[actors] – actsIn → [movies]` with two vertex collections
+**actors** and **movies** and an edge collection **actsIn** with edges pointing
+from actor to movie, plenty of interesting queries are possible:
+
+- All actors who acted in "movie1" OR "movie2"
+- All actors who acted in both "movie1" AND "movie2"
+- All common movies between "actor1" and "actor2"
+- All actors who acted in 3 or more movies
+- All movies in which exactly 6 actors acted
+- The number of actors by movie
+- The number of movies by actor
+- The number of movies acted in between two years by actor
+- The years and number of movies by actor with actor name
+
+## Dataset
+
+We will be using _arangosh_ to create and query the data. All AQL queries are
+strings and can simply be copied over to the web interface or your favorite
+driver as well.
+
+```js
+var actors = db._create("actors");
+var movies = db._create("movies");
+var actsIn = db._createEdgeCollection("actsIn");
+
+var TheMatrix = movies.save({ _key: "TheMatrix", title: "The Matrix", released: 1999, tagline: "Welcome to the Real World" })._id;
+var Keanu = actors.save({ _key: "Keanu", name: "Keanu Reeves", born: 1964 })._id;
+var Carrie = actors.save({ _key: "Carrie", name: "Carrie-Anne Moss", born: 1967 })._id;
+var Laurence = actors.save({ _key: "Laurence", name: "Laurence Fishburne", born: 1961 })._id;
+var Hugo = actors.save({ _key: "Hugo", name: "Hugo Weaving", born: 1960 })._id;
+var Emil = actors.save({ _key: "Emil", name: "Emil Eifrem", born: 1978 })._id;
+
+actsIn.save(Keanu, TheMatrix, { roles: ["Neo"], year: 1999 });
+actsIn.save(Carrie, TheMatrix, { roles: ["Trinity"], year: 1999 });
+actsIn.save(Laurence, TheMatrix, { roles: ["Morpheus"], year: 1999 });
+actsIn.save(Hugo, TheMatrix, { roles: ["Agent Smith"], year: 1999 });
+actsIn.save(Emil, TheMatrix, { roles: ["Emil"], year: 1999 });
+
+var TheMatrixReloaded = movies.save({ _key: "TheMatrixReloaded", title: "The Matrix Reloaded", released: 2003, tagline: "Free your mind" })._id;
+actsIn.save(Keanu, TheMatrixReloaded, { roles: ["Neo"], year: 2003 });
+actsIn.save(Carrie, TheMatrixReloaded, { roles: ["Trinity"], year: 2003 });
+actsIn.save(Laurence, TheMatrixReloaded, { roles: ["Morpheus"], year: 2003 });
+actsIn.save(Hugo, TheMatrixReloaded, { roles: ["Agent Smith"], year: 2003 });
+
+var TheMatrixRevolutions = movies.save({ _key: "TheMatrixRevolutions", title: "The Matrix Revolutions", released: 2003, tagline: "Everything that has a beginning has an end" })._id;
+actsIn.save(Keanu, TheMatrixRevolutions, { roles: ["Neo"], year: 2003 });
+actsIn.save(Carrie, TheMatrixRevolutions, { roles: ["Trinity"], year: 2003 });
+actsIn.save(Laurence, TheMatrixRevolutions, { roles: ["Morpheus"], year: 2003 });
+actsIn.save(Hugo, TheMatrixRevolutions, { roles: ["Agent Smith"], year: 2003 });
+
+var TheDevilsAdvocate = movies.save({ _key: "TheDevilsAdvocate", title: "The Devil's Advocate", released: 1997, tagline: "Evil has its winning ways" })._id;
+var Charlize = actors.save({ _key: "Charlize", name: "Charlize Theron", born: 1975 })._id;
+var Al = actors.save({ _key: "Al", name: "Al Pacino", born: 1940 })._id;
+actsIn.save(Keanu, TheDevilsAdvocate, { roles: ["Kevin Lomax"], year: 1997 });
+actsIn.save(Charlize, TheDevilsAdvocate, { roles: ["Mary Ann Lomax"], year: 1997 });
+actsIn.save(Al, TheDevilsAdvocate, { roles: ["John Milton"], year: 1997 });
+
+var AFewGoodMen = movies.save({ _key: "AFewGoodMen", title: "A Few Good Men", released: 1992, tagline: "In the heart of the nation's capital, in a courthouse of the U.S. government, one man will stop at nothing to keep his honor, and one will stop at nothing to find the truth."
})._id; +var TomC = actors.save({ _key: "TomC", name: "Tom Cruise", born: 1962 })._id; +var JackN = actors.save({ _key: "JackN", name: "Jack Nicholson", born: 1937 })._id; +var DemiM = actors.save({ _key: "DemiM", name: "Demi Moore", born: 1962 })._id; +var KevinB = actors.save({ _key: "KevinB", name: "Kevin Bacon", born: 1958 })._id; +var KieferS = actors.save({ _key: "KieferS", name: "Kiefer Sutherland", born: 1966 })._id; +var NoahW = actors.save({ _key: "NoahW", name: "Noah Wyle", born: 1971 })._id; +var CubaG = actors.save({ _key: "CubaG", name: "Cuba Gooding Jr.", born: 1968 })._id; +var KevinP = actors.save({ _key: "KevinP", name: "Kevin Pollak", born: 1957 })._id; +var JTW = actors.save({ _key: "JTW", name: "J.T. Walsh", born: 1943 })._id; +var JamesM = actors.save({ _key: "JamesM", name: "James Marshall", born: 1967 })._id; +var ChristopherG = actors.save({ _key: "ChristopherG", name: "Christopher Guest", born: 1948 })._id; +actsIn.save(TomC, AFewGoodMen, { roles: ["Lt. Daniel Kaffee"], year: 1992 }); +actsIn.save(JackN, AFewGoodMen, { roles: ["Col. Nathan R. Jessup"], year: 1992 }); +actsIn.save(DemiM, AFewGoodMen, { roles: ["Lt. Cdr. JoAnne Galloway"], year: 1992 }); +actsIn.save(KevinB, AFewGoodMen, { roles: ["Capt. Jack Ross"], year: 1992 }); +actsIn.save(KieferS, AFewGoodMen, { roles: ["Lt. Jonathan Kendrick"], year: 1992 }); +actsIn.save(NoahW, AFewGoodMen, { roles: ["Cpl. Jeffrey Barnes"], year: 1992 }); +actsIn.save(CubaG, AFewGoodMen, { roles: ["Cpl. Carl Hammaker"], year: 1992 }); +actsIn.save(KevinP, AFewGoodMen, { roles: ["Lt. Sam Weinberg"], year: 1992 }); +actsIn.save(JTW, AFewGoodMen, { roles: ["Lt. Col. Matthew Andrew Markinson"], year: 1992 }); +actsIn.save(JamesM, AFewGoodMen, { roles: ["Pfc. Louden Downey"], year: 1992 }); +actsIn.save(ChristopherG, AFewGoodMen, { roles: ["Dr. Stone"], year: 1992 }); + +var TopGun = movies.save({ _key: "TopGun", title: "Top Gun", released: 1986, tagline: "I feel the need, the need for speed." })._id; +var KellyM = actors.save({ _key: "KellyM", name: "Kelly McGillis", born: 1957 })._id; +var ValK = actors.save({ _key: "ValK", name: "Val Kilmer", born: 1959 })._id; +var AnthonyE = actors.save({ _key: "AnthonyE", name: "Anthony Edwards", born: 1962 })._id; +var TomS = actors.save({ _key: "TomS", name: "Tom Skerritt", born: 1933 })._id; +var MegR = actors.save({ _key: "MegR", name: "Meg Ryan", born: 1961 })._id; +actsIn.save(TomC, TopGun, { roles: ["Maverick"], year: 1986 }); +actsIn.save(KellyM, TopGun, { roles: ["Charlie"], year: 1986 }); +actsIn.save(ValK, TopGun, { roles: ["Iceman"], year: 1986 }); +actsIn.save(AnthonyE, TopGun, { roles: ["Goose"], year: 1986 }); +actsIn.save(TomS, TopGun, { roles: ["Viper"], year: 1986 }); +actsIn.save(MegR, TopGun, { roles: ["Carole"], year: 1986 }); + +var JerryMaguire = movies.save({ _key: "JerryMaguire", title: "Jerry Maguire", released: 2000, tagline: "The rest of his life begins now." 
})._id; +var ReneeZ = actors.save({ _key: "ReneeZ", name: "Renee Zellweger", born: 1969 })._id; +var KellyP = actors.save({ _key: "KellyP", name: "Kelly Preston", born: 1962 })._id; +var JerryO = actors.save({ _key: "JerryO", name: "Jerry O'Connell", born: 1974 })._id; +var JayM = actors.save({ _key: "JayM", name: "Jay Mohr", born: 1970 })._id; +var BonnieH = actors.save({ _key: "BonnieH", name: "Bonnie Hunt", born: 1961 })._id; +var ReginaK = actors.save({ _key: "ReginaK", name: "Regina King", born: 1971 })._id; +var JonathanL = actors.save({ _key: "JonathanL", name: "Jonathan Lipnicki", born: 1996 })._id; +actsIn.save(TomC, JerryMaguire, { roles: ["Jerry Maguire"], year: 2000 }); +actsIn.save(CubaG, JerryMaguire, { roles: ["Rod Tidwell"], year: 2000 }); +actsIn.save(ReneeZ, JerryMaguire, { roles: ["Dorothy Boyd"], year: 2000 }); +actsIn.save(KellyP, JerryMaguire, { roles: ["Avery Bishop"], year: 2000 }); +actsIn.save(JerryO, JerryMaguire, { roles: ["Frank Cushman"], year: 2000 }); +actsIn.save(JayM, JerryMaguire, { roles: ["Bob Sugar"], year: 2000 }); +actsIn.save(BonnieH, JerryMaguire, { roles: ["Laurel Boyd"], year: 2000 }); +actsIn.save(ReginaK, JerryMaguire, { roles: ["Marcee Tidwell"], year: 2000 }); +actsIn.save(JonathanL, JerryMaguire, { roles: ["Ray Boyd"], year: 2000 }); + +var StandByMe = movies.save({ _key: "StandByMe", title: "Stand By Me", released: 1986, tagline: "For some, it's the last real taste of innocence, and the first real taste of life. But for everyone, it's the time that memories are made of." })._id; +var RiverP = actors.save({ _key: "RiverP", name: "River Phoenix", born: 1970 })._id; +var CoreyF = actors.save({ _key: "CoreyF", name: "Corey Feldman", born: 1971 })._id; +var WilW = actors.save({ _key: "WilW", name: "Wil Wheaton", born: 1972 })._id; +var JohnC = actors.save({ _key: "JohnC", name: "John Cusack", born: 1966 })._id; +var MarshallB = actors.save({ _key: "MarshallB", name: "Marshall Bell", born: 1942 })._id; +actsIn.save(WilW, StandByMe, { roles: ["Gordie Lachance"], year: 1986 }); +actsIn.save(RiverP, StandByMe, { roles: ["Chris Chambers"], year: 1986 }); +actsIn.save(JerryO, StandByMe, { roles: ["Vern Tessio"], year: 1986 }); +actsIn.save(CoreyF, StandByMe, { roles: ["Teddy Duchamp"], year: 1986 }); +actsIn.save(JohnC, StandByMe, { roles: ["Denny Lachance"], year: 1986 }); +actsIn.save(KieferS, StandByMe, { roles: ["Ace Merrill"], year: 1986 }); +actsIn.save(MarshallB, StandByMe, { roles: ["Mr. Lachance"], year: 1986 }); + +var AsGoodAsItGets = movies.save({ _key: "AsGoodAsItGets", title: "As Good as It Gets", released: 1997, tagline: "A comedy from the heart that goes for the throat." })._id; +var HelenH = actors.save({ _key: "HelenH", name: "Helen Hunt", born: 1963 })._id; +var GregK = actors.save({ _key: "GregK", name: "Greg Kinnear", born: 1963 })._id; +actsIn.save(JackN, AsGoodAsItGets, { roles: ["Melvin Udall"], year: 1997 }); +actsIn.save(HelenH, AsGoodAsItGets, { roles: ["Carol Connelly"], year: 1997 }); +actsIn.save(GregK, AsGoodAsItGets, { roles: ["Simon Bishop"], year: 1997 }); +actsIn.save(CubaG, AsGoodAsItGets, { roles: ["Frank Sachs"], year: 1997 }); + +var WhatDreamsMayCome = movies.save({ _key: "WhatDreamsMayCome", title: "What Dreams May Come", released: 1998, tagline: "After life there is more. The end is just the beginning." 
})._id; +var AnnabellaS = actors.save({ _key: "AnnabellaS", name: "Annabella Sciorra", born: 1960 })._id; +var MaxS = actors.save({ _key: "MaxS", name: "Max von Sydow", born: 1929 })._id; +var WernerH = actors.save({ _key: "WernerH", name: "Werner Herzog", born: 1942 })._id; +var Robin = actors.save({ _key: "Robin", name: "Robin Williams", born: 1951 })._id; +actsIn.save(Robin, WhatDreamsMayCome, { roles: ["Chris Nielsen"], year: 1998 }); +actsIn.save(CubaG, WhatDreamsMayCome, { roles: ["Albert Lewis"], year: 1998 }); +actsIn.save(AnnabellaS, WhatDreamsMayCome, { roles: ["Annie Collins-Nielsen"], year: 1998 }); +actsIn.save(MaxS, WhatDreamsMayCome, { roles: ["The Tracker"], year: 1998 }); +actsIn.save(WernerH, WhatDreamsMayCome, { roles: ["The Face"], year: 1998 }); + +var SnowFallingonCedars = movies.save({ _key: "SnowFallingonCedars", title: "Snow Falling on Cedars", released: 1999, tagline: "First loves last. Forever." })._id; +var EthanH = actors.save({ _key: "EthanH", name: "Ethan Hawke", born: 1970 })._id; +var RickY = actors.save({ _key: "RickY", name: "Rick Yune", born: 1971 })._id; +var JamesC = actors.save({ _key: "JamesC", name: "James Cromwell", born: 1940 })._id; +actsIn.save(EthanH, SnowFallingonCedars, { roles: ["Ishmael Chambers"], year: 1999 }); +actsIn.save(RickY, SnowFallingonCedars, { roles: ["Kazuo Miyamoto"], year: 1999 }); +actsIn.save(MaxS, SnowFallingonCedars, { roles: ["Nels Gudmundsson"], year: 1999 }); +actsIn.save(JamesC, SnowFallingonCedars, { roles: ["Judge Fielding"], year: 1999 }); + +var YouveGotMail = movies.save({ _key: "YouveGotMail", title: "You've Got Mail", released: 1998, tagline: "At odds in life... in love on-line." })._id; +var ParkerP = actors.save({ _key: "ParkerP", name: "Parker Posey", born: 1968 })._id; +var DaveC = actors.save({ _key: "DaveC", name: "Dave Chappelle", born: 1973 })._id; +var SteveZ = actors.save({ _key: "SteveZ", name: "Steve Zahn", born: 1967 })._id; +var TomH = actors.save({ _key: "TomH", name: "Tom Hanks", born: 1956 })._id; +actsIn.save(TomH, YouveGotMail, { roles: ["Joe Fox"], year: 1998 }); +actsIn.save(MegR, YouveGotMail, { roles: ["Kathleen Kelly"], year: 1998 }); +actsIn.save(GregK, YouveGotMail, { roles: ["Frank Navasky"], year: 1998 }); +actsIn.save(ParkerP, YouveGotMail, { roles: ["Patricia Eden"], year: 1998 }); +actsIn.save(DaveC, YouveGotMail, { roles: ["Kevin Jackson"], year: 1998 }); +actsIn.save(SteveZ, YouveGotMail, { roles: ["George Pappas"], year: 1998 }); + +var SleeplessInSeattle = movies.save({ _key: "SleeplessInSeattle", title: "Sleepless in Seattle", released: 1993, tagline: "What if someone you never met, someone you never saw, someone you never knew was the only someone for you?" 
})._id; +var RitaW = actors.save({ _key: "RitaW", name: "Rita Wilson", born: 1956 })._id; +var BillPull = actors.save({ _key: "BillPull", name: "Bill Pullman", born: 1953 })._id; +var VictorG = actors.save({ _key: "VictorG", name: "Victor Garber", born: 1949 })._id; +var RosieO = actors.save({ _key: "RosieO", name: "Rosie O'Donnell", born: 1962 })._id; +actsIn.save(TomH, SleeplessInSeattle, { roles: ["Sam Baldwin"], year: 1993 }); +actsIn.save(MegR, SleeplessInSeattle, { roles: ["Annie Reed"], year: 1993 }); +actsIn.save(RitaW, SleeplessInSeattle, { roles: ["Suzy"], year: 1993 }); +actsIn.save(BillPull, SleeplessInSeattle, { roles: ["Walter"], year: 1993 }); +actsIn.save(VictorG, SleeplessInSeattle, { roles: ["Greg"], year: 1993 }); +actsIn.save(RosieO, SleeplessInSeattle, { roles: ["Becky"], year: 1993 }); + +var JoeVersustheVolcano = movies.save({ _key: "JoeVersustheVolcano", title: "Joe Versus the Volcano", released: 1990, tagline: "A story of love, lava and burning desire." })._id; +var Nathan = actors.save({ _key: "Nathan", name: "Nathan Lane", born: 1956 })._id; +actsIn.save(TomH, JoeVersustheVolcano, { roles: ["Joe Banks"], year: 1990 }); +actsIn.save(MegR, JoeVersustheVolcano, { roles: ["DeDe", "Angelica Graynamore", "Patricia Graynamore"], year: 1990 }); +actsIn.save(Nathan, JoeVersustheVolcano, { roles: ["Baw"], year: 1990 }); + +var WhenHarryMetSally = movies.save({ _key: "WhenHarryMetSally", title: "When Harry Met Sally", released: 1998, tagline: "At odds in life... in love on-line." })._id; +var BillyC = actors.save({ _key: "BillyC", name: "Billy Crystal", born: 1948 })._id; +var CarrieF = actors.save({ _key: "CarrieF", name: "Carrie Fisher", born: 1956 })._id; +var BrunoK = actors.save({ _key: "BrunoK", name: "Bruno Kirby", born: 1949 })._id; +actsIn.save(BillyC, WhenHarryMetSally, { roles: ["Harry Burns"], year: 1998 }); +actsIn.save(MegR, WhenHarryMetSally, { roles: ["Sally Albright"], year: 1998 }); +actsIn.save(CarrieF, WhenHarryMetSally, { roles: ["Marie"], year: 1998 }); +actsIn.save(BrunoK, WhenHarryMetSally, { roles: ["Jess"], year: 1998 }); +``` + +## Example queries + +### All actors who acted in "movie1" OR "movie2" + +Say we want to find all actors who acted in "TheMatrix" OR "TheDevilsAdvocate". +First lets try to get all actors for one movie: + +```js +db._query(` + FOR x IN ANY 'movies/TheMatrix' actsIn + OPTIONS { order: 'bfs', uniqueVertices: 'global' } + RETURN x._id +`).toArray(); +``` + +Result: + +```json +[ + [ + "actors/Keanu", + "actors/Hugo", + "actors/Emil", + "actors/Carrie", + "actors/Laurence" + ] +] +``` + +Now we continue to form a `UNION_DISTINCT` of two neighbor queries which will +be the solution: + +```js +db._query(` + FOR x IN UNION_DISTINCT( + (FOR y IN ANY 'movies/TheMatrix' actsIn + OPTIONS { order: 'bfs', uniqueVertices: 'global' } + RETURN y._id), + (FOR y IN ANY 'movies/TheDevilsAdvocate' actsIn + OPTIONS { order: 'bfs', uniqueVertices: 'global' } + RETURN y._id) + ) RETURN x +`).toArray(); +``` + +```json +[ + [ + "actors/Emil", + "actors/Hugo", + "actors/Carrie", + "actors/Laurence", + "actors/Keanu", + "actors/Al", + "actors/Charlize" + ] +] +``` + +### All actors who acted in both "movie1" AND "movie2" + +This is almost identical to the question above. 
+But this time we are not interested in a `UNION` but in an `INTERSECTION`:
+
+```js
+db._query(`
+  FOR x IN INTERSECTION(
+    (FOR y IN ANY 'movies/TheMatrix' actsIn
+      OPTIONS { order: 'bfs', uniqueVertices: 'global' }
+      RETURN y._id),
+    (FOR y IN ANY 'movies/TheDevilsAdvocate' actsIn
+      OPTIONS { order: 'bfs', uniqueVertices: 'global' }
+      RETURN y._id)
+  ) RETURN x
+`).toArray();
+```
+
+```json
+[
+  [
+    "actors/Keanu"
+  ]
+]
+```
+
+### All common movies between "actor1" and "actor2"
+
+This is actually identical to the question about common actors in movie1 and
+movie2. We just have to change the starting vertices. As an example, let us find
+all movies where Hugo Weaving and Keanu Reeves are co-starring:
+
+```js
+db._query(`
+  FOR x IN INTERSECTION(
+    (FOR y IN ANY 'actors/Hugo' actsIn
+      OPTIONS { order: 'bfs', uniqueVertices: 'global' }
+      RETURN y._id),
+    (FOR y IN ANY 'actors/Keanu' actsIn
+      OPTIONS { order: 'bfs', uniqueVertices: 'global' }
+      RETURN y._id)
+  ) RETURN x
+`).toArray();
+```
+
+```json
+[
+  [
+    "movies/TheMatrixRevolutions",
+    "movies/TheMatrixReloaded",
+    "movies/TheMatrix"
+  ]
+]
+```
+
+### All actors who acted in 3 or more movies
+
+We will make use of the edge index and AQL's `COLLECT` statement for
+grouping. The basic idea is to group all edges by their start vertex
+(which in this dataset is always the actor). Then we remove all actors with
+fewer than 3 movies from the result. The query below also returns the computed
+number of movies each actor has acted in:
+
+```js
+db._query(`
+  FOR x IN actsIn
+    COLLECT actor = x._from WITH COUNT INTO counter
+    FILTER counter >= 3
+    RETURN { actor: actor, movies: counter }
+`).toArray();
+```
+
+```json
+[
+  {
+    "actor" : "actors/Carrie",
+    "movies" : 3
+  },
+  {
+    "actor" : "actors/CubaG",
+    "movies" : 4
+  },
+  {
+    "actor" : "actors/Hugo",
+    "movies" : 3
+  },
+  {
+    "actor" : "actors/Keanu",
+    "movies" : 4
+  },
+  {
+    "actor" : "actors/Laurence",
+    "movies" : 3
+  },
+  {
+    "actor" : "actors/MegR",
+    "movies" : 5
+  },
+  {
+    "actor" : "actors/TomC",
+    "movies" : 3
+  },
+  {
+    "actor" : "actors/TomH",
+    "movies" : 3
+  }
+]
+```
+
+### All movies in which exactly 6 actors acted
+
+The same idea as in the previous query, but with an equality filter. However,
+we now need the movie instead of the actor, so we return the `_to` attribute:
+
+```js
+db._query(`
+  FOR x IN actsIn
+    COLLECT movie = x._to WITH COUNT INTO counter
+    FILTER counter == 6
+    RETURN movie
+`).toArray();
+```
+
+```json
+[
+  "movies/SleeplessInSeattle",
+  "movies/TopGun",
+  "movies/YouveGotMail"
+]
+```
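+
+If you would rather see the movie titles than their `_id` values, you could
+look up each document with the `DOCUMENT()` function (a sketch based on the
+query above):
+
+```js
+db._query(`
+  FOR x IN actsIn
+    COLLECT movie = x._to WITH COUNT INTO counter
+    FILTER counter == 6
+    RETURN DOCUMENT(movie).title
+`).toArray();
+```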
+
+### The number of actors by movie
+
+We remember that in our dataset, `_to` on the edge corresponds to the movie, so
+we count how often the same `_to` appears. This is the number of actors. The
+query is almost identical to the ones before but without the `FILTER` after
+`COLLECT`:
+
+```js
+db._query(`
+  FOR x IN actsIn
+    COLLECT movie = x._to WITH COUNT INTO counter
+    RETURN { movie: movie, actors: counter }
+`).toArray();
+```
+
+```json
+[
+  {
+    "movie" : "movies/AFewGoodMen",
+    "actors" : 11
+  },
+  {
+    "movie" : "movies/AsGoodAsItGets",
+    "actors" : 4
+  },
+  {
+    "movie" : "movies/JerryMaguire",
+    "actors" : 9
+  },
+  {
+    "movie" : "movies/JoeVersustheVolcano",
+    "actors" : 3
+  },
+  {
+    "movie" : "movies/SleeplessInSeattle",
+    "actors" : 6
+  },
+  {
+    "movie" : "movies/SnowFallingonCedars",
+    "actors" : 4
+  },
+  {
+    "movie" : "movies/StandByMe",
+    "actors" : 7
+  },
+  {
+    "movie" : "movies/TheDevilsAdvocate",
+    "actors" : 3
+  },
+  {
+    "movie" : "movies/TheMatrix",
+    "actors" : 5
+  },
+  {
+    "movie" : "movies/TheMatrixReloaded",
+    "actors" : 4
+  },
+  {
+    "movie" : "movies/TheMatrixRevolutions",
+    "actors" : 4
+  },
+  {
+    "movie" : "movies/TopGun",
+    "actors" : 6
+  },
+  {
+    "movie" : "movies/WhatDreamsMayCome",
+    "actors" : 5
+  },
+  {
+    "movie" : "movies/WhenHarryMetSally",
+    "actors" : 4
+  },
+  {
+    "movie" : "movies/YouveGotMail",
+    "actors" : 6
+  }
+]
+```
+
+### The number of movies by actor
+
+The `_from` attribute on the edge corresponds to the actor, so we group by it
+and count with `COLLECT`. As a bonus, we can add sorting to return the actors
+with the most movies first:
+
+```js
+db._query(`
+  FOR x IN actsIn
+    COLLECT actor = x._from WITH COUNT INTO counter
+    SORT counter DESC
+    RETURN { actor: actor, movies: counter }
+`).toArray();
+```
+
+```json
+[
+  {
+    "actor" : "actors/MegR",
+    "movies" : 5
+  },
+  {
+    "actor" : "actors/Keanu",
+    "movies" : 4
+  },
+  {
+    "actor" : "actors/CubaG",
+    "movies" : 4
+  },
+  {
+    "actor" : "actors/Carrie",
+    "movies" : 3
+  },
+  {
+    "actor" : "actors/Laurence",
+    "movies" : 3
+  },
+  {
+    "actor" : "actors/Hugo",
+    "movies" : 3
+  },
+  {
+    "actor" : "actors/TomC",
+    "movies" : 3
+  },
+  {
+    "actor" : "actors/TomH",
+    "movies" : 3
+  },
+  {
+    "actor" : "actors/JerryO",
+    "movies" : 2
+  },
+  {
+    "actor" : "actors/GregK",
+    "movies" : 2
+  },
+  {
+    "actor" : "actors/MaxS",
+    "movies" : 2
+  },
+  {
+    "actor" : "actors/JackN",
+    "movies" : 2
+  },
+  {
+    "actor" : "actors/KieferS",
+    "movies" : 2
+  },
+  {
+    "actor" : "actors/JamesM",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/JayM",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/ReneeZ",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/JamesC",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/TomS",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/AnthonyE",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/ValK",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/KellyM",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/ChristopherG",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/Al",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/JTW",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/KevinP",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/Emil",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/NoahW",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/Charlize",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/KevinB",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/DemiM",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/WernerH",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/CarrieF",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/BillyC",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/Nathan",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/RosieO",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/VictorG",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/BillPull",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/RitaW",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/SteveZ",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/DaveC",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/ParkerP",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/RickY",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/EthanH",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/KellyP",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/AnnabellaS",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/Robin",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/HelenH",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/MarshallB",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/JohnC",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/CoreyF",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/RiverP",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/WilW",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/JonathanL",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/ReginaK",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/BonnieH",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/BrunoK",
+    "movies" : 1
+  }
+]
+```
+
+### The number of movies acted in between two years by actor
+
+This query is where a multi-model database actually shines. Since we want to
+use the query in production, we first create a persistent index on `year`.
+This allows us to execute fast range queries, such as between 1990 and 1995.
+
+```js
+db.actsIn.ensureIndex({ type: "persistent", fields: ["year"] });
+```
+
+Now we slightly modify our movies by actor query:
+
+```js
+db._query(`
+  FOR x IN actsIn
+    FILTER x.year >= 1990 && x.year <= 1995
+    COLLECT actor = x._from WITH COUNT INTO counter
+    RETURN { actor: actor, movies: counter }
+`).toArray();
+```
+
+```json
+[
+  {
+    "actor" : "actors/BillPull",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/ChristopherG",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/CubaG",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/DemiM",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/JackN",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/JamesM",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/JTW",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/KevinB",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/KevinP",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/KieferS",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/MegR",
+    "movies" : 2
+  },
+  {
+    "actor" : "actors/Nathan",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/NoahW",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/RitaW",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/RosieO",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/TomC",
+    "movies" : 1
+  },
+  {
+    "actor" : "actors/TomH",
+    "movies" : 2
+  },
+  {
+    "actor" : "actors/VictorG",
+    "movies" : 1
+  }
+]
+```
+
+### The years and number of movies by actor with actor name
+
+If we want to return a list of years and not just the number of movies an actor
+acted in, then we can't use `COLLECT WITH COUNT INTO` because we can only access
+`actor` and `counter` after grouping. Instead, we can use `COLLECT … INTO` to
+keep track of the movie years per actor. The number of years equals the number
+of movies.
+
+The example query is limited to two actors for simplicity. As an added extra,
+it looks up the actor `name` using the `DOCUMENT()` function:
+
+```js
+db._query(`
+  FOR x IN actsIn
+    FILTER x._from IN [ "actors/TomH", "actors/Keanu" ]
+    COLLECT actor = x._from INTO years = x.year
+    RETURN {
+      name: DOCUMENT(actor).name,
+      movies: COUNT(years),
+      years
+    }`
+).toArray();
+```
+
+```json
+[
+  {
+    "name" : "Keanu Reeves",
+    "movies" : 4,
+    "years" : [
+      1999,
+      2003,
+      2003,
+      1997
+    ]
+  },
+  {
+    "name" : "Tom Hanks",
+    "movies" : 3,
+    "years" : [
+      1998,
+      1993,
+      1990
+    ]
+  }
+]
+```
diff --git a/site/content/arangodb/oem/aql/examples-and-query-patterns/counting.md b/site/content/arangodb/oem/aql/examples-and-query-patterns/counting.md
new file mode 100644
index 0000000000..11079180c2
--- /dev/null
+++ b/site/content/arangodb/oem/aql/examples-and-query-patterns/counting.md
@@ -0,0 +1,28 @@
+---
+title: Counting in AQL
+menuTitle: Counting
+weight: 10
+description: >-
+  You can count the documents of a collection in different ways
+---
+## Number of documents in a collection
+
+To return the count of documents that currently exist in a collection,
+you can call the [`LENGTH()` function](../functions/array.md#length):
+
+```aql
+RETURN LENGTH(collection)
+```
+
+This type of call is optimized since version 2.8 (no unnecessary intermediate
+result is built up in memory) and is therefore the preferred way to determine
+the count. Internally, [`COLLECTION_COUNT()`](../functions/miscellaneous.md#collection_count)
+is called.
+
+In versions before 2.8 that already support `COLLECT ... WITH COUNT INTO`
+(available since 2.4), you may use the following code instead of `LENGTH()`
+for better performance:
+
+```aql
+FOR doc IN collection
+  COLLECT WITH COUNT INTO length
+  RETURN length
+```
diff --git a/site/content/arangodb/oem/aql/examples-and-query-patterns/create-test-data.md b/site/content/arangodb/oem/aql/examples-and-query-patterns/create-test-data.md
new file mode 100644
index 0000000000..90a27a2bd8
--- /dev/null
+++ b/site/content/arangodb/oem/aql/examples-and-query-patterns/create-test-data.md
@@ -0,0 +1,95 @@
+---
+title: Create Test Data with AQL
+menuTitle: Create Test Data
+weight: 5
+description: >-
+  How to fill a collection with dummy documents
+---
+The example queries below assume that there is already a collection called
+`myCollection` to hold the documents.
+
+One of the easiest ways to fill a collection with test data is to use an AQL
+query that iterates over a range.
+
+Run the following AQL query, for example, from the _AQL Editor_ in the web
+interface to insert 1,000 documents into the collection:
+
+```aql
+FOR i IN 1..1000
+  INSERT { name: CONCAT("test", i) } IN myCollection
+```
+
+The number of documents to create can easily be modified by adjusting the
+range boundary values.
+
+If you want to inspect the result immediately, add `RETURN NEW` at the end of
+the query.
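+
+For example (a sketch that inserts just ten documents to keep the output
+short):
+
+```aql
+FOR i IN 1..10
+  INSERT { name: CONCAT("test", i) } IN myCollection
+  RETURN NEW
+```
+
+This returns each inserted document, including its server-generated attributes.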
+
+To create more complex test data, adjust the AQL query. Let us say we also want
+a `status` attribute, and fill it with integer values between `1` and `5`
+(inclusive), with equal distribution. A good way to achieve this is to use
+the modulo operator (`%`):
+
+```aql
+FOR i IN 1..1000
+  INSERT {
+    name: CONCAT("test", i),
+    status: 1 + (i % 5)
+  } IN myCollection
+```
+
+To create pseudo-random values, use the `RAND()` function. It creates
+pseudo-random numbers between `0` and `1`. Use some factor to scale the random
+numbers, and `FLOOR()` to convert the scaled number back to an integer.
+
+For example, the following query populates the `value` attribute with numbers
+between 100 and 150 (inclusive):
+
+```aql
+FOR i IN 1..1000
+  INSERT {
+    name: CONCAT("test", i),
+    value: 100 + FLOOR(RAND() * (150 - 100 + 1))
+  } IN myCollection
+```
+
+After the test data has been created, it is often helpful to verify it. The
+`RAND()` function is also a good candidate for retrieving a random sample of
+the documents in the collection. This query retrieves 10 random documents:
+
+```aql
+FOR doc IN myCollection
+  SORT RAND()
+  LIMIT 10
+  RETURN doc
+```
+
+The `COLLECT` clause is an easy mechanism to run an aggregate analysis on some
+attribute. Let us say we wanted to verify the data distribution inside the
+`value` attribute. In this case we could run:
+
+```aql
+FOR doc IN myCollection
+  COLLECT value = doc.value WITH COUNT INTO count
+  RETURN {
+    value: value,
+    count: count
+  }
+```
+
+The above query provides the number of documents per distinct `value`.
+
+We can make the JSON result a bit more compact by using the value as the
+attribute key, the count as the attribute value, and merging everything into a
+single result object. Note that attribute keys can only be strings, but for
+our purposes here it is acceptable.
+
+```aql
+RETURN MERGE(
+  FOR doc IN myCollection
+    COLLECT value = doc.value WITH COUNT INTO count
+    RETURN {
+      [value]: count
+    }
+)
+```
diff --git a/site/content/arangodb/oem/aql/examples-and-query-patterns/diffing-two-documents.md b/site/content/arangodb/oem/aql/examples-and-query-patterns/diffing-two-documents.md
new file mode 100644
index 0000000000..14dbc7d3d8
--- /dev/null
+++ b/site/content/arangodb/oem/aql/examples-and-query-patterns/diffing-two-documents.md
@@ -0,0 +1,128 @@
+---
+title: Diffing Two Documents in AQL
+menuTitle: Diffing Two Documents
+weight: 55
+description: >-
+  How to determine the differences in attributes of two documents
+---
+There is no built-in AQL function to compare the attributes of two documents,
+but it is easy to build a query that does:
+
+```aql
+// input document 1
+LET doc1 = {
+  "foo": "bar",
+  "a": 1,
+  "b": 2
+}
+
+// input document 2
+LET doc2 = {
+  "foo": "baz",
+  "a": 2,
+  "c": 3
+}
+
+// collect attributes present in doc1, but missing in doc2
+LET missing = (
+  FOR key IN ATTRIBUTES(doc1)
+    FILTER ! HAS(doc2, key)
+    RETURN {
+      [ key ]: doc1[key]
+    }
+)
+
+// collect attributes present in both docs, but that have different values
+LET changed = (
+  FOR key IN ATTRIBUTES(doc1)
+    FILTER HAS(doc2, key) && doc1[key] != doc2[key]
+    RETURN {
+      [ key ] : {
+        old: doc1[key],
+        new: doc2[key]
+      }
+    }
+)
+
+// collect attributes present in doc2, but missing in doc1
+LET added = (
+  FOR key IN ATTRIBUTES(doc2)
+    FILTER ! HAS(doc1, key)
+    RETURN {
+      [ key ]: doc2[key]
+    }
+)
+
+// return final result
+RETURN {
+  "missing": missing,
+  "changed": changed,
+  "added": added
+}
+```
+
+The query may look a bit lengthy, but much of that is due to formatting.
+A more terse version can be found below.
+
+The above query will return a document with three attributes:
+
+- `missing`:
+  Contains all attributes only present in the first document
+  (i.e. missing in the second document)
+
+- `changed`:
+  Contains all attributes present in both documents that have different values
+
+- `added`:
+  Contains all attributes only present in the second document
+  (i.e. missing in the first document)
+
+For the two example documents it will return:
+
+```json
+[
+  {
+    "missing" : [
+      {
+        "b" : 2
+      }
+    ],
+    "changed" : [
+      {
+        "foo" : {
+          "old" : "bar",
+          "new" : "baz"
+        }
+      },
+      {
+        "a" : {
+          "old" : 1,
+          "new" : 2
+        }
+      }
+    ],
+    "added" : [
+      {
+        "c" : 3
+      }
+    ]
+  }
+]
+```
+
+You may adjust the query to produce a different output format.
+
+Following is a version of the same query that can easily be invoked from
+JavaScript. It passes the two documents as bind parameters and calls
+`db._query`. The query is now a one-liner (less readable but easier to
+copy & paste):
+
+```js
+bindVariables = {
+  doc1 : { "foo" : "bar", "a" : 1, "b" : 2 },
+  doc2 : { "foo" : "baz", "a" : 2, "c" : 3 }
+};
+
+query = "LET doc1 = @doc1, doc2 = @doc2, missing = (FOR key IN ATTRIBUTES(doc1) FILTER ! HAS(doc2, key) RETURN { [ key ]: doc1[key] }), changed = (FOR key IN ATTRIBUTES(doc1) FILTER HAS(doc2, key) && doc1[key] != doc2[key] RETURN { [ key ] : { old: doc1[key], new: doc2[key] } }), added = (FOR key IN ATTRIBUTES(doc2) FILTER ! HAS(doc1, key) RETURN { [ key ] : doc2[key] }) RETURN { missing : missing, changed : changed, added : added }";
+
+result = db._query(query, bindVariables).toArray();
+```
diff --git a/site/content/arangodb/oem/aql/examples-and-query-patterns/dynamic-attribute-names.md b/site/content/arangodb/oem/aql/examples-and-query-patterns/dynamic-attribute-names.md
new file mode 100644
index 0000000000..59efe9f163
--- /dev/null
+++ b/site/content/arangodb/oem/aql/examples-and-query-patterns/dynamic-attribute-names.md
@@ -0,0 +1,202 @@
+---
+title: Dynamic Attribute Names in AQL
+menuTitle: Dynamic Attribute Names
+weight: 15
+description: >-
+  You can use expressions as attribute names or use subqueries and `ZIP()` to
+  create objects with varying attribute names
+---
+You might want an AQL query to return results with attribute names assembled
+by a function, or with a variable number of attributes.
+
+This will not work by specifying the result using a regular object literal,
+as object literals require the names and numbers of attributes to be fixed at
+query compile time.
+
+There are two solutions to getting dynamic attribute names to work:
+
+- Using expressions as attribute names (fixed number of attributes)
+- Using subqueries and the `ZIP()` function (variable number of attributes)
+
+## Using expressions as attribute names
+
+This solution works in cases where the number of dynamic attributes to return
+is known in advance, and only the attribute names need to be calculated using
+an expression.
+
+Using expressions as attribute names instead of fixed attribute names in object
+literals requires enclosing the expression in extra `[` and `]` to disambiguate
+them from regular, unquoted attribute names.
+
+Let us create a result that returns the original document data contained in
+a dynamically named attribute. We will be using the expression `doc.type`
+for the attribute name. We will also return some other attributes from the
+original documents, but prefix them with the documents' `_key` attribute values.
+For this we also need attribute name expressions.
+
+Here is a query showing how to do this.
+The attribute name expressions are all required to be enclosed in `[` and `]`
+in order to make this work:
+
+```aql
+LET documents = [
+  { "_key" : "3231748397810", "gender" : "f", "status" : "active", "type" : "user" },
+  { "_key" : "3231754427122", "gender" : "m", "status" : "inactive", "type" : "unknown" }
+]
+
+FOR doc IN documents
+  RETURN {
+    [ doc.type ] : {
+      [ CONCAT(doc._key, "_gender") ] : doc.gender,
+      [ CONCAT(doc._key, "_status") ] : doc.status
+    }
+  }
+```
+
+This will return:
+
+```json
+[
+  {
+    "user": {
+      "3231748397810_gender": "f",
+      "3231748397810_status": "active"
+    }
+  },
+  {
+    "unknown": {
+      "3231754427122_gender": "m",
+      "3231754427122_status": "inactive"
+    }
+  }
+]
+```
+
+Note:
+Attribute name expressions and regular, unquoted attribute names can be mixed.
+
+## Subquery solution
+
+A generalized solution is to let a subquery or another function produce the
+dynamic attribute names, and finally pass them through the `ZIP()` function to
+create an object from them.
+
+Let us assume we want to process the following input documents:
+
+```json
+{ "name": "test", "gender": "f", "status": "active", "type": "user" }
+{ "name": "dummy", "gender": "m", "status": "inactive", "type": "unknown", "magicFlag": 23 }
+```
+
+Let us also assume our goal for each of these documents is to return only the
+attribute names that contain the letter `a`, together with their respective
+values.
+
+To extract the attribute names and values from the original documents, we can
+use a subquery as follows:
+
+```aql
+LET documents = [
+  { "name": "test", "gender": "f", "status": "active", "type": "user" },
+  { "name": "dummy", "gender": "m", "status": "inactive", "type": "unknown", "magicFlag": 23 }
+]
+
+FOR doc IN documents
+  RETURN (
+    FOR name IN ATTRIBUTES(doc)
+      FILTER LIKE(name, '%a%')
+      RETURN {
+        name: name,
+        value: doc[name]
+      }
+  )
+```
+
+The subquery will only let attribute names pass that contain the letter `a`.
+The results of the subquery are then made available to the main query and will
+be returned. But the attribute names in the result are still `name` and `value`,
+so we're not there yet.
+
+So let us also employ AQL's [`ZIP()`](../functions/document-object.md#zip) function,
+which can create an object from two arrays:
+
+- the first parameter to `ZIP()` is an array with the attribute names
+- the second parameter to `ZIP()` is an array with the attribute values
+
+Instead of directly returning the subquery result, we first capture it in a
+variable, and pass the variable's `name` and `value` components into `ZIP()`
+like this:
+
+```aql
+LET documents = [
+  { "name" : "test", "gender" : "f", "status" : "active", "type" : "user" },
+  { "name" : "dummy", "gender" : "m", "status" : "inactive", "type" : "unknown", "magicFlag" : 23 }
+]
+
+FOR doc IN documents
+  LET attributes = (
+    FOR name IN ATTRIBUTES(doc)
+      FILTER LIKE(name, '%a%')
+      RETURN {
+        name: name,
+        value: doc[name]
+      }
+  )
+  RETURN ZIP(attributes[*].name, attributes[*].value)
+```
+
+Note that we have to use the expansion operator (`[*]`) on `attributes` because
+`attributes` itself is an array, and we want either the `name` attribute or the
+`value` attribute of each of its members.
+
+To prove this is working, here is the above query's result:
+
+```json
+[
+  {
+    "name": "test",
+    "status": "active"
+  },
+  {
+    "name": "dummy",
+    "status": "inactive",
+    "magicFlag": 23
+  }
+]
+```
+
+As can be seen, the two results have a different number of result attributes.
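+
+Instead of `ZIP()`, you could also merge the per-attribute objects directly,
+because `MERGE()` alternatively accepts a single array of objects as its sole
+argument (a sketch of the same subquery as above):
+
+```aql
+LET documents = [
+  { "name": "test", "gender": "f", "status": "active", "type": "user" },
+  { "name": "dummy", "gender": "m", "status": "inactive", "type": "unknown", "magicFlag": 23 }
+]
+
+FOR doc IN documents
+  RETURN MERGE(
+    FOR name IN ATTRIBUTES(doc)
+      FILTER LIKE(name, '%a%')
+      RETURN { [name]: doc[name] }
+  )
+```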
+
+We can also make the result a bit more dynamic by prefixing each attribute
+with the value of the `name` attribute:
+
+```aql
+LET documents = [
+  { "name": "test", "gender": "f", "status": "active", "type": "user" },
+  { "name": "dummy", "gender": "m", "status": "inactive", "type": "unknown", "magicFlag": 23 }
+]
+
+FOR doc IN documents
+  LET attributes = (
+    FOR name IN ATTRIBUTES(doc)
+      FILTER LIKE(name, '%a%')
+      RETURN {
+        name: CONCAT(doc.name, '-', name),
+        value: doc[name]
+      }
+  )
+  RETURN ZIP(attributes[*].name, attributes[*].value)
+```
+
+That will give us document-specific attribute names like this:
+
+```json
+[
+  {
+    "test-name": "test",
+    "test-status": "active"
+  },
+  {
+    "dummy-name": "dummy",
+    "dummy-status": "inactive",
+    "dummy-magicFlag": 23
+  }
+]
+```
diff --git a/site/content/arangodb/oem/aql/examples-and-query-patterns/grouping.md b/site/content/arangodb/oem/aql/examples-and-query-patterns/grouping.md
new file mode 100644
index 0000000000..cb54fefcf2
--- /dev/null
+++ b/site/content/arangodb/oem/aql/examples-and-query-patterns/grouping.md
@@ -0,0 +1,359 @@
+---
+title: Grouping and aggregating data in AQL
+menuTitle: Grouping
+weight: 30
+description: >-
+  You can group data by arbitrary criteria with AQL's `COLLECT` operation,
+  with optional aggregation during grouping or using post-aggregation
+---
+To group results by arbitrary criteria, AQL provides the `COLLECT` keyword.
+`COLLECT` performs a grouping, but no aggregation. Aggregation can still be
+added in the query if required.
+
+## Ensuring uniqueness
+
+`COLLECT` can be used to make a result set unique. The following query returns
+each distinct `age` attribute value only once:
+
+```aql
+FOR u IN users
+  COLLECT age = u.age
+  RETURN age
+```
+
+This is grouping without tracking the group values; only the group criterion
+(*age*) value is kept.
+
+Grouping can also be done on multiple levels using `COLLECT`:
+
+```aql
+FOR u IN users
+  COLLECT status = u.status, age = u.age
+  RETURN { status, age }
+```
+
+Alternatively, `RETURN DISTINCT` can be used to make a result set unique.
+`RETURN DISTINCT` supports a single criterion only:
+
+```aql
+FOR u IN users
+  RETURN DISTINCT u.age
+```
+
+`RETURN DISTINCT` does not change the order of results. For the above query,
+this means that the order is undefined because no particular order is
+guaranteed when iterating over a collection without an explicit `SORT`
+operation.
+
+## Fetching group values
+
+To group users by age, and return the names of the users with the highest ages,
+we'll issue a query like this:
+
+```aql
+FOR u IN users
+  FILTER u.active == true
+  COLLECT age = u.age INTO usersByAge
+  SORT age DESC LIMIT 0, 5
+  RETURN {
+    age,
+    users: usersByAge[*].u.name
+  }
+```
+
+```json
+[
+  { "age": 37, "users": [ "John", "Sophia" ] },
+  { "age": 36, "users": [ "Fred", "Emma" ] },
+  { "age": 34, "users": [ "Madison" ] },
+  { "age": 33, "users": [ "Chloe", "Michael" ] },
+  { "age": 32, "users": [ "Alexander" ] }
+]
+```
+
+The query puts all users together by their *age* attribute. There is one
+result document per distinct *age* value (leaving aside the `LIMIT`). For each
+group, we have access to the matching documents via the *usersByAge* variable
+introduced in the `COLLECT` statement.
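+
+If you are only interested in the names anyway, you can also store just the
+name in the group variable by using an expression after `INTO` (a sketch of
+the same query):
+
+```aql
+FOR u IN users
+  FILTER u.active == true
+  COLLECT age = u.age INTO names = u.name
+  SORT age DESC LIMIT 0, 5
+  RETURN {
+    age,
+    users: names
+  }
+```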
+ +## Variable Expansion + +The *usersByAge* variable contains the full documents found, and as we're only +interested in user names, we'll use the expansion operator `[*]` to extract just the +*name* attribute of all user documents in each group: + +```aql +usersByAge[*].u.name +``` + +The `[*]` expansion operator is just a handy short-cut. We could also write +a subquery: + +```aql +( FOR temp IN usersByAge RETURN temp.u.name ) +``` + +## Grouping by multiple criteria + +To group by multiple criteria, we'll use multiple arguments in the `COLLECT` clause. +For example, to group users by *ageGroup* (a derived value we need to calculate first) +and then by *gender*, we'll do: + +```aql +FOR u IN users + FILTER u.active == true + COLLECT ageGroup = FLOOR(u.age / 5) * 5, + gender = u.gender INTO group + SORT ageGroup DESC + RETURN { + ageGroup, + gender + } +``` + +```json +[ + { "ageGroup": 35, "gender": "f" }, + { "ageGroup": 35, "gender": "m" }, + { "ageGroup": 30, "gender": "f" }, + { "ageGroup": 30, "gender": "m" }, + { "ageGroup": 25, "gender": "f" }, + { "ageGroup": 25, "gender": "m" } +] +``` + +## Counting group values + +If the goal is to count the number of values in each group, AQL provides the special +*COLLECT WITH COUNT INTO* syntax. This is a simple variant for grouping with an additional +group length calculation: + +```aql +FOR u IN users + FILTER u.active == true + COLLECT ageGroup = FLOOR(u.age / 5) * 5, + gender = u.gender WITH COUNT INTO numUsers + SORT ageGroup DESC + RETURN { + ageGroup, + gender, + numUsers + } +``` + +```json +[ + { "ageGroup": 35, "gender": "f", "numUsers": 2 }, + { "ageGroup": 35, "gender": "m", "numUsers": 2 }, + { "ageGroup": 30, "gender": "f", "numUsers": 4 }, + { "ageGroup": 30, "gender": "m", "numUsers": 4 }, + { "ageGroup": 25, "gender": "f", "numUsers": 2 }, + { "ageGroup": 25, "gender": "m", "numUsers": 2 } +] +``` + +## Aggregation + +Adding further aggregation is also simple in AQL by using an `AGGREGATE` clause +in the `COLLECT`: + +```aql +FOR u IN users + FILTER u.active == true + COLLECT ageGroup = FLOOR(u.age / 5) * 5, + gender = u.gender + AGGREGATE numUsers = LENGTH(1), + minAge = MIN(u.age), + maxAge = MAX(u.age) + SORT ageGroup DESC + RETURN { + ageGroup, + gender, + numUsers, + minAge, + maxAge + } +``` + +```json +[ + { + "ageGroup": 35, + "gender": "f", + "numUsers": 2, + "minAge": 36, + "maxAge": 39, + }, + { + "ageGroup": 35, + "gender": "m", + "numUsers": 2, + "minAge": 35, + "maxAge": 39, + }, + ... +] +``` + +We have used the aggregate functions *LENGTH* here (it returns the length of an array). +This is the equivalent to SQL's `SELECT g, COUNT(*) FROM ... GROUP BY g`. In addition to +`LENGTH`, AQL also provides `MAX`, `MIN`, `SUM` and `AVERAGE`, `VARIANCE_POPULATION`, +`VARIANCE_SAMPLE`, `STDDEV_POPULATION`, `STDDEV_SAMPLE`, `UNIQUE`, `SORTED_UNIQUE` and +`COUNT_UNIQUE` as basic aggregation functions. + +In AQL all aggregation functions can be run on arrays only. If an aggregation function +is run on anything that is not an array, a warning will be produced and the result will +be `null`. + +Using an `AGGREGATE` clause will ensure the aggregation is run while the groups are built +in the collect operation. This is normally more efficient than collecting all group values +for all groups and then doing a post-aggregation. + +## Post-aggregation + +Aggregation can also be performed after a `COLLECT` operation using other AQL constructs, +though performance-wise this is often inferior to using `COLLECT` with `AGGREGATE`. 
+ +The same query as before can be turned into a post-aggregation query as shown below. Note +that this query will build and pass on all group values for all groups inside the variable +*g*, and perform the aggregation at the latest possible stage: + +```aql +FOR u IN users + FILTER u.active == true + COLLECT ageGroup = FLOOR(u.age / 5) * 5, + gender = u.gender INTO g + SORT ageGroup DESC + RETURN { + ageGroup, + gender, + numUsers: LENGTH(g[*]), + minAge: MIN(g[*].u.age), + maxAge: MAX(g[*].u.age) + } +``` + +```json +[ + { + "ageGroup": 35, + "gender": "f", + "numUsers": 2, + "minAge": 36, + "maxAge": 39, + }, + { + "ageGroup": 35, + "gender": "m", + "numUsers": 2, + "minAge": 35, + "maxAge": 39, + }, + ... +] +``` + +This is in contrast to the previous query that used an `AGGREGATE` clause to perform +the aggregation during the collect operation, at the earliest possible stage. + +## Post-filtering aggregated data + +To filter the results of a grouping or aggregation operation (i.e. something +similar to *HAVING* in SQL), simply add another `FILTER` clause after the `COLLECT` +statement. + +For example, to get the 3 *ageGroup*s with the most users in them: + +```aql +FOR u IN users + FILTER u.active == true + COLLECT ageGroup = FLOOR(u.age / 5) * 5 INTO group + LET numUsers = LENGTH(group) + FILTER numUsers > 2 /* group must contain at least 3 users in order to qualify */ + SORT numUsers DESC + LIMIT 0, 3 + RETURN { + "ageGroup": ageGroup, + "numUsers": numUsers, + "users": group[*].u.name + } +``` + +```json +[ + { + "ageGroup": 30, + "numUsers": 8, + "users": [ + "Abigail", + "Madison", + "Anthony", + "Alexander", + "Isabella", + "Chloe", + "Daniel", + "Michael" + ] + }, + { + "ageGroup": 25, + "numUsers": 4, + "users": [ + "Mary", + "Mariah", + "Jim", + "Diego" + ] + }, + { + "ageGroup": 35, + "numUsers": 4, + "users": [ + "Fred", + "John", + "Emma", + "Sophia" + ] + } +] +``` + +To increase readability, the repeated expression *LENGTH(group)* was put into a variable +*numUsers*. The `FILTER` on *numUsers* is the equivalent an SQL *HAVING* clause. + +## Aggregating data in local time + +If you store datetimes in UTC in your collections and need to group data for +each day in your local timezone, you can use `DATE_UTCTOLOCAL()` and +`DATE_TRUNC()` to adjust for that. + +Note: In the timezone `Europe/Berlin` daylight saving activated on 2020-03-29, +thus 2020-01-31T**23**:00:00Z is 2020-02-01 midnight in Germany and +2020-03-31T**22**:00:00Z is 2020-04-01 midnight in Germany. 
+
+```aql
+---
+name: aqlDateGroupingLocalTime_1
+description: ''
+bindVars:
+  {
+    "activities": [
+      {"startDate": "2020-01-31T23:00:00Z", "endDate": "2020-02-01T03:00:00Z", "duration": 4, "rate": 250},
+      {"startDate": "2020-02-01T09:00:00Z", "endDate": "2020-02-01T17:00:00Z", "duration": 8, "rate": 250},
+      {"startDate": "2020-03-31T21:00:00Z", "endDate": "2020-03-31T22:00:00Z", "duration": 1, "rate": 250},
+      {"startDate": "2020-03-31T22:00:00Z", "endDate": "2020-04-01T03:00:00Z", "duration": 5, "rate": 250},
+      {"startDate": "2020-04-01T13:00:00Z", "endDate": "2020-04-01T16:00:00Z", "duration": 3, "rate": 250}
+    ]
+  }
+---
+FOR a IN @activities
+COLLECT
+  day = DATE_TRUNC(DATE_UTCTOLOCAL(a.startDate, 'Europe/Berlin'), 'day')
+AGGREGATE
+  hours = SUM(a.duration),
+  revenue = SUM(a.duration * a.rate)
+SORT day ASC
+RETURN {
+  day,
+  hours,
+  revenue
+}
+```
diff --git a/site/content/arangodb/oem/aql/examples-and-query-patterns/joins.md b/site/content/arangodb/oem/aql/examples-and-query-patterns/joins.md
new file mode 100644
index 0000000000..ae2ec7b2b5
--- /dev/null
+++ b/site/content/arangodb/oem/aql/examples-and-query-patterns/joins.md
@@ -0,0 +1,892 @@
+---
+title: Using Joins in AQL
+menuTitle: Joins
+weight: 25
+description: >-
+  Query examples for joining documents with one-to-many and many-to-many relationships
+---
+The two common scenarios when you want to join documents of collections are:
+
+- **One-to-Many**:
+  You may have a `users` collection and a `cities` collection. A user lives in
+  a city and you need the city information during a query about the user.
+
+- **Many-To-Many**:
+  You may have an `authors` collection and a `books` collection. An author can
+  write many books and a book can have many authors. You want to return a list
+  of books with their authors. Therefore you need to join the authors and books.
+
+Unlike many NoSQL databases, ArangoDB does support joins in AQL queries. This
+is similar to the way traditional relational databases handle this. However,
+because documents allow for more flexibility, joins are also more flexible.
+The following sections provide solutions for common questions.
+
+So far, we have only dealt with one collection (`users`) at a time. We also have
+a collection `relations` that stores relationships between users. We now use
+this extra collection to create a result from two collections.
+
+First of all, we query a few users together with their friends' IDs. For that,
+we use all `relations` that have a value of `friend` in their `type` attribute.
+Relationships are established by using the `friendOf` and `thisUser` attributes
+in the `relations` collection, which point to the `userId` values in the
+`users` collection.
+
+## One-To-Many
+
+You have a collection called `users`. Users live in a city, and a city is
+identified by its primary key. In principle, you can embed the city document
+into the user document and be happy with it.
+
+```json
+{
+  "_id" : "users/2151975421",
+  "_key" : "2151975421",
+  "_rev" : "2151975421",
+  "name" : {
+    "first" : "John",
+    "last" : "Doe"
+  },
+  "city" : {
+    "name" : "Metropolis"
+  }
+}
+```
+
+This works well for many use cases. Now assume that you have additional
+information about the city, like the number of people living in it. It would be
+impractical to change each and every user document if this number changes.
+Therefore it is a good idea to hold the city information in a separate
+collection.
+ +```js +arangosh> db.cities.document("cities/2241300989"); +``` + +```json +{ + "population" : 1000, + "name" : "Metropolis", + "_id" : "cities/2241300989", + "_rev" : "2241300989", + "_key" : "2241300989" +} +``` + +Instead of embedding the city directly in the user document, you can use +the key of the city. + +```js +arangosh> db.users.document("users/2290649597"); +``` + +```json +{ + "name" : { + "first" : "John", + "last" : "Doe" + }, + "city" : "cities/2241300989", + "_id" : "users/2290649597", + "_rev" : "2290649597", + "_key" : "2290649597" +} +``` + +We can now join these two collections very easily. + +```js +arangosh> db._query( +........>"FOR u IN users " + +........>" FOR c IN cities " + +........>" FILTER u.city == c._id RETURN { user: u, city: c }" +........>).toArray() +``` + +```json +[ + { + "user" : { + "name" : { + "first" : "John", + "last" : "Doe" + }, + "city" : "cities/2241300989", + "_id" : "users/2290649597", + "_rev" : "2290649597", + "_key" : "2290649597" + }, + "city" : { + "population" : 1000, + "name" : "Metropolis", + "_id" : "cities/2241300989", + "_rev" : "2241300989", + "_key" : "2241300989" + } + } +] +``` + +Unlike in SQL, there is no special `JOIN` keyword. The optimizer ensures that the +primary index is used in the above query. + +However, very often it is much more convenient for the client of the query if a +single document would be returned, where the city information is embedded in the +user document - as in the simple example above. With AQL, you do not need +to forgo this simplification. + +```js +arangosh> db._query( +........>"FOR u IN users " + +........>" FOR c IN cities " + +........>" FILTER u.city == c._id RETURN merge(u, {city: c})" +........>).toArray() +``` + +```json +[ + { + "_id" : "users/2290649597", + "_key" : "2290649597", + "_rev" : "2290649597", + "name" : { + "first" : "John", + "last" : "Doe" + }, + "city" : { + "_id" : "cities/2241300989", + "_key" : "2241300989", + "_rev" : "2241300989", + "population" : 1000, + "name" : "Metropolis" + } + } +] +``` + +You can have both: the convenient representation of the result for your +client and the flexibility of joins for your data model. + +## Many-To-Many + +In the relational world, you need a third table to model the many-to-many +relation. In ArangoDB, you have a choice depending on the information you are +going to store and the type of questions you are going to ask. + +Assume that authors are stored in one collection and books in a second. If all +you need is "who are the authors of a book", then you can easily model this as +a list attribute in users. + +If you want to store more information, for example, which author wrote which +page in a conference proceeding, or if you also want to know "which books were +written by which author", you can use edge collections. This is very similar to +the "join table" from the relational world. + +### Embedded Lists + +If you only want to store the authors of a book, you can embed them as list in +the book document. There is no need for a separate collection. 
+ +```js +arangosh> db.authors.toArray() +``` + +```json +[ + { + "_id" : "authors/2661190141", + "_key" : "2661190141", + "_rev" : "2661190141", + "name" : { + "first" : "Maxima", + "last" : "Musterfrau" + } + }, + { + "_id" : "authors/2658437629", + "_key" : "2658437629", + "_rev" : "2658437629", + "name" : { + "first" : "John", + "last" : "Doe" + } + } +] +``` + +You can query books: + +```js +arangosh> db._query("FOR b IN books RETURN b").toArray(); +``` + +```json +[ + { + "_id" : "books/2681506301", + "_key" : "2681506301", + "_rev" : "2681506301", + "title" : "The beauty of JOINS", + "authors" : [ + "authors/2661190141", + "authors/2658437629" + ] + } +] +``` + +And you can join the authors in a very similar manner given in the one-to-many section: + +```js +arangosh> db._query( +........>"FOR b IN books " + +........>" LET a = (FOR x IN b.authors " + +........>" FOR a IN authors FILTER x == a._id RETURN a) " + +........>" RETURN { book: b, authors: a }" +........>).toArray(); +``` + +```json +[ + { + "book" : { + "title" : "The beauty of JOINS", + "authors" : [ + "authors/2661190141", + "authors/2658437629" + ], + "_id" : "books/2681506301", + "_rev" : "2681506301", + "_key" : "2681506301" + }, + "authors" : [ + { + "name" : { + "first" : "Maxima", + "last" : "Musterfrau" + }, + "_id" : "authors/2661190141", + "_rev" : "2661190141", + "_key" : "2661190141" + }, + { + "name" : { + "first" : "John", + "last" : "Doe" + }, + "_id" : "authors/2658437629", + "_rev" : "2658437629", + "_key" : "2658437629" + } + ] + } +] +``` + +Or you can embed the authors directly: + +```js +arangosh> db._query( +........>"FOR b IN books LET a = (" + +........>" FOR x IN b.authors " + +........>" FOR a IN authors FILTER x == a._id RETURN a)" + +........>" RETURN merge(b, { authors: a })" +........>).toArray(); +``` + +```json +[ + { + "_id" : "books/2681506301", + "_key" : "2681506301", + "_rev" : "2681506301", + "title" : "The beauty of JOINS", + "authors" : [ + { + "_id" : "authors/2661190141", + "_key" : "2661190141", + "_rev" : "2661190141", + "name" : { + "first" : "Maxima", + "last" : "Musterfrau" + } + }, + { + "_id" : "authors/2658437629", + "_key" : "2658437629", + "_rev" : "2658437629", + "name" : { + "first" : "John", + "last" : "Doe" + } + } + ] + } +] +``` + +### Using Edge Collections + +If you also want to query which books are written by a given author, embedding authors +in the book document is possible, but it is more efficient to use a edge collections for +speed. + +Or you are publishing a proceeding, then you want to store the pages the author has written +as well. This information can be stored in the edge document. 
+ +First off, create the users: + +```js +arangosh> db._create("authors"); +``` + +``` +[ArangoCollection 2926807549, "authors" (type document, status loaded)] +``` + +```js +arangosh> db.authors.save({ name: { first: "John", last: "Doe" } }) +``` + +```json +{ + "error" : false, + "_id" : "authors/2935261693", + "_rev" : "2935261693", + "_key" : "2935261693" +} +``` + +```js +arangosh> db.authors.save({ name: { first: "Maxima", last: "Musterfrau" } }) +``` + +```json +{ + "error" : false, + "_id" : "authors/2938210813", + "_rev" : "2938210813", + "_key" : "2938210813" +} +``` + +Now, create the books without any author information: + +```js +arangosh> db._create("books"); +``` + +``` +[ArangoCollection 2928380413, "books" (type document, status loaded)] +``` + +```js +arangosh> db.books.save({ title: "The beauty of JOINS" }); +``` + +```json +{ + "error" : false, + "_id" : "books/2980088317", + "_rev" : "2980088317", + "_key" : "2980088317" +} +``` + +An edge collection is now used to link authors and books: + +```js +arangosh> db._createEdgeCollection("written"); +``` + +``` +[ArangoCollection 2931132925, "written" (type edge, status loaded)] +``` + +```js +arangosh> db.written.save("authors/2935261693", +........>"books/2980088317", +........>{ pages: "1-10" }) +``` + +```json +{ + "error" : false, + "_id" : "written/3006237181", + "_rev" : "3006237181", + "_key" : "3006237181" +} +``` + +```js +arangosh> db.written.save("authors/2938210813", +........>"books/2980088317", +........>{ pages: "11-20" }) +``` + +```json +{ + "error" : false, + "_id" : "written/3012856317", + "_rev" : "3012856317", + "_key" : "3012856317" +} +``` + +In order to get all books with their authors, you can use a +[graph traversal](../graphs/traversals.md#working-with-collection-sets): + +```js +arangosh> db._query( +...> "FOR b IN books " + +...> "LET authorsByBook = ( " + +...> " FOR author, writtenBy IN INBOUND b written " + +...> " RETURN { " + +...> " vertex: author, " + +...> " edge: writtenBy " + +...> " } " + +...> ") " + +...> "RETURN { " + +...> " book: b, " + +...> " authors: authorsByBook " + +...> "} " +...> ).toArray(); +``` + +```json +[ + { + "book" : { + "_key" : "2980088317", + "_id" : "books/2980088317", + "_rev" : "2980088317", + "title" : "The beauty of JOINS" + }, + "authors" : [ + { + "vertex" : { + "_key" : "2935261693", + "_id" : "authors/2935261693", + "_rev" : "2935261693", + "name" : { + "first" : "John", + "last" : "Doe" + } + }, + "edge" : { + "_key" : "2935261693", + "_id" : "written/2935261693", + "_from" : "authors/2935261693", + "_to" : "books/2980088317", + "_rev" : "3006237181", + "pages" : "1-10" + } + }, + { + "vertex" : { + "_key" : "2938210813", + "_id" : "authors/2938210813", + "_rev" : "2938210813", + "name" : { + "first" : "Maxima", + "last" : "Musterfrau" + } + }, + "edge" : { + "_key" : "6833274", + "_id" : "written/6833274", + "_from" : "authors/2938210813", + "_to" : "books/2980088317", + "_rev" : "3012856317", + "pages" : "11-20" + } + } + ] + } +] +``` + +Or if you want only the information stored in the vertices, you can use this query: + +```js +arangosh> db._query( +...> "FOR b IN books " + +...> "LET authorsByBook = ( " + +...> " FOR author IN INBOUND b written " + +...> " OPTIONS { " + +...> " order: 'bfs', " + +...> " uniqueVertices: 'global' " + +...> " } " + +...> " RETURN author " + +...> ") " + +...> "RETURN { " + +...> " book: b, " + +...> " authors: authorsByBook " + +...> "} " +...> ).toArray(); +``` + +```json +[ + { + "book" : { + "_key" : 
"2980088317", + "_id" : "books/2980088317", + "_rev" : "2980088317", + "title" : "The beauty of JOINS" + }, + "authors" : [ + { + "_key" : "2938210813", + "_id" : "authors/2938210813", + "_rev" : "2938210813", + "name" : { + "first" : "Maxima", + "last" : "Musterfrau" + } + }, + { + "_key" : "2935261693", + "_id" : "authors/2935261693", + "_rev" : "2935261693", + "name" : { + "first" : "John", + "last" : "Doe" + } + } + ] + } +] +``` + +Or again embed the authors directly into the book document: + +```js +arangosh> db._query( +...> "FOR b IN books " + +...> "LET authors = ( " + +...> " FOR author IN INBOUND b written " + +...> " OPTIONS { " + +...> " order: 'bfs', " + +...> " uniqueVertices: 'global' " + +...> " } " + +...> " RETURN author " + +...> ") " + +...> "RETURN MERGE(b, {authors: authors}) " +...> ).toArray(); +``` + +```json +[ + { + "_id" : "books/2980088317", + "_key" : "2980088317", + "_rev" : "2980088317", + "title" : "The beauty of JOINS", + "authors" : [ + { + "_key" : "2938210813", + "_id" : "authors/2938210813", + "_rev" : "2938210813", + "name" : { + "first" : "Maxima", + "last" : "Musterfrau" + } + }, + { + "_key" : "2935261693", + "_id" : "authors/2935261693", + "_rev" : "2935261693", + "name" : { + "first" : "John", + "last" : "Doe" + } + } + ] + } +] +``` + +If you need the authors and their books, simply reverse the direction: + +```js +> db._query( +...> "FOR a IN authors " + +...> "LET booksByAuthor = ( " + +...> " FOR b IN OUTBOUND a written " + +...> " OPTIONS { " + +...> " order: 'bfs', " + +...> " uniqueVertices: 'global' " + +...> " } " + +...> " RETURN b" + +...> ") " + +...> "RETURN MERGE(a, {books: booksByAuthor}) " +...> ).toArray(); +``` + +```json +[ + { + "_id" : "authors/2935261693", + "_key" : "2935261693", + "_rev" : "2935261693", + "name" : { + "first" : "John", + "last" : "Doe" + }, + "books" : [ + { + "_key" : "2980088317", + "_id" : "books/2980088317", + "_rev" : "2980088317", + "title" : "The beauty of JOINS" + } + ] + }, + { + "_id" : "authors/2938210813", + "_key" : "2938210813", + "_rev" : "2938210813", + "name" : { + "first" : "Maxima", + "last" : "Musterfrau" + }, + "books" : [ + { + "_key" : "2980088317", + "_id" : "books/2980088317", + "_rev" : "2980088317", + "title" : "The beauty of JOINS" + } + ] + } +] +``` + +## More examples + +### Join tuples + +We will start with a SQL-ish result set and return each tuple (user name, friends userId) +separately. The AQL query to generate such result is: + +```aql +--- +name: joinTuples +description: '' +dataset: joinSampleDataset +bindVars: + { + "friend": "friend" + } +--- +FOR u IN users + FILTER u.active == true + LIMIT 0, 4 + FOR f IN relations + FILTER f.type == @friend && f.friendOf == u.userId + RETURN { + "user" : u.name, + "friendId" : f.thisUser + } +``` + +We iterate over the collection users. Only the 'active' users will be examined. +For each of these users we will search for up to 4 friends. We locate friends +by comparing the `userId` of our current user with the `friendOf` attribute of the +`relations` document. For each of those relations found we return the users name +and the userId of the friend. + +### Horizontal lists + +Note that in the above result, a user can be returned multiple times. This is the +SQL way of returning data. If this is not desired, the friends' ids of each user +can be returned in a horizontal list. This will return each user at most once. 
+
+The AQL query for doing so is:
+
+```aql
+FOR u IN users
+  FILTER u.active == true LIMIT 0, 4
+  RETURN {
+    "user" : u.name,
+    "friendIds" : (
+      FOR f IN relations
+        FILTER f.friendOf == u.userId && f.type == "friend"
+        RETURN f.thisUser
+    )
+  }
+```
+
+```json
+[
+  {
+    "user" : "Abigail",
+    "friendIds" : [
+      108,
+      102,
+      106
+    ]
+  },
+  {
+    "user" : "Fred",
+    "friendIds" : [
+      209
+    ]
+  },
+  {
+    "user" : "Mary",
+    "friendIds" : [
+      207,
+      104
+    ]
+  },
+  {
+    "user" : "Mariah",
+    "friendIds" : [
+      203,
+      205
+    ]
+  }
+]
+```
+
+In this query we are still iterating over the users in the `users` collection
+and for each matching user we are executing a subquery to create the matching
+list of related users.
+
+### Self joins
+
+To not only return friend ids but also the names of friends, we could "join" the
+`users` collection once more (something like a "self join"):
+
+```aql
+FOR u IN users
+  FILTER u.active == true
+  LIMIT 0, 4
+  RETURN {
+    "user" : u.name,
+    "friendIds" : (
+      FOR f IN relations
+        FILTER f.friendOf == u.userId && f.type == "friend"
+        FOR u2 IN users
+          FILTER f.thisUser == u2.userId
+          RETURN u2.name
+    )
+  }
+```
+
+```json
+[
+  {
+    "user" : "Abigail",
+    "friendIds" : [
+      "Jim",
+      "Jacob",
+      "Daniel"
+    ]
+  },
+  {
+    "user" : "Fred",
+    "friendIds" : [
+      "Mariah"
+    ]
+  },
+  {
+    "user" : "Mary",
+    "friendIds" : [
+      "Isabella",
+      "Michael"
+    ]
+  },
+  {
+    "user" : "Mariah",
+    "friendIds" : [
+      "Madison",
+      "Eva"
+    ]
+  }
+]
+```
+
+This query in turn fetches the clear-text name of each friend from the users
+collection. So here we iterate over the users collection, for each hit over the
+relations collection, and for each hit once more over the users collection.
+
+### Outer joins
+
+Let's find the lonely people in our database - those without friends.
+
+```aql
+FOR user IN users
+  LET friendList = (
+    FOR f IN relations
+      FILTER f.friendOf == user.userId
+      RETURN 1
+  )
+  FILTER LENGTH(friendList) == 0
+  RETURN { "user" : user.name }
+```
+
+```json
+[
+  {
+    "user" : "Abigail"
+  },
+  {
+    "user" : "Fred"
+  }
+]
+```
+
+So, for each user we pick the list of their friends and count them. The ones where
+the count equals zero are the lonely people. Using `RETURN 1` in the subquery
+saves even more precious CPU cycles and gives the optimizer more alternatives.
+
+### Index usage
+
+For joins in particular, you should make sure indexes can be utilized to
+[speed up your queries](../execution-and-performance/explaining-queries.md).
+
+Note that sparse indexes don't qualify for joins. Often, you also want to join
+documents that do not contain the attribute you join on. However, sparse indexes
+don't contain references to documents that don't contain the indexed
+attributes - thus they would be missing from the join operation. For this reason,
+you should provide non-sparse indexes.
+
+### Pitfalls
+
+Since we're free of schemata, there is by default no way to tell the format of the
+documents. So, if your documents don't contain an attribute, it defaults to
+null. We can however check our data for accuracy like this:
+
+```aql
+RETURN LENGTH(FOR u IN users FILTER u.userId == null RETURN 1)
+```
+
+```json
+[
+  10000
+]
+```
+
+```aql
+RETURN LENGTH(FOR f IN relations FILTER f.friendOf == null RETURN 1)
+```
+
+```json
+[
+  10000
+]
+```
+
+So if the above queries return 10k matches each, the result of the join tuples
+query will become 100,000,000 items larger and use a lot of memory and computation
+time.
So it is generally a good idea to revalidate that the criteria for your +join conditions exist. + +Using indexes on the properties can speed up the operation significantly. +You can use the explain helper to revalidate your query actually uses them. + +If you work with joins on edge collections you would typically aggregate over +the internal fields `_id`, `_from` and `_to` (where `_id` equals `userId`, +`_from` `friendOf` and `_to` would be `thisUser` in our examples). ArangoDB +implicitly creates indexes on them. diff --git a/site/content/arangodb/oem/aql/examples-and-query-patterns/projections-and-filters.md b/site/content/arangodb/oem/aql/examples-and-query-patterns/projections-and-filters.md new file mode 100644 index 0000000000..18284c1362 --- /dev/null +++ b/site/content/arangodb/oem/aql/examples-and-query-patterns/projections-and-filters.md @@ -0,0 +1,136 @@ +--- +title: Projections and Filters in AQL +menuTitle: Projections and filters +weight: 20 +description: >- + Examples of returning documents unaltered and subsets of their attributes, + as well as filtering +--- +## Returning unaltered documents + +To return three complete documents from collection *users*, the following query can be used: + +```aql +FOR u IN users + LIMIT 0, 3 + RETURN u +``` + +```json +[ + { + "_id" : "users/229886047207520", + "_rev" : "229886047207520", + "_key" : "229886047207520", + "active" : true, + "id" : 206, + "age" : 31, + "gender" : "f", + "name" : "Abigail" + }, + { + "_id" : "users/229886045175904", + "_rev" : "229886045175904", + "_key" : "229886045175904", + "active" : true, + "id" : 101, + "age" : 36, + "name" : "Fred", + "gender" : "m" + }, + { + "_id" : "users/229886047469664", + "_rev" : "229886047469664", + "_key" : "229886047469664", + "active" : true, + "id" : 208, + "age" : 29, + "name" : "Mary", + "gender" : "f" + } +] +``` + +Note that there is a `LIMIT` clause but no `SORT` clause. In this case it is not guaranteed +which of the user documents are returned. Effectively the document return order is unspecified +if no `SORT` clause is used, and you should not rely on the order in such queries. + +## Projections + +To return a projection from the collection *users* use a modified `RETURN` instruction: + +```aql +FOR u IN users + LIMIT 0, 3 + RETURN { + "user" : { + "isActive" : u.active ? "yes" : "no", + "name" : u.name + } + } +``` + +```json +[ + { + "user" : { + "isActive" : "yes", + "name" : "John" + } + }, + { + "user" : { + "isActive" : "yes", + "name" : "Anthony" + } + }, + { + "user" : { + "isActive" : "yes", + "name" : "Fred" + } + } +] +``` + +## Filters + +To return a filtered projection from collection *users*, you can use the +`FILTER` keyword. 
Additionally, a `SORT` clause is used to have the result +returned in a specific order: + +```aql +FOR u IN users + FILTER u.active == true && u.age >= 30 + SORT u.age DESC + LIMIT 0, 5 + RETURN { + "age" : u.age, + "name" : u.name + } +``` + +```json +[ + { + "age" : 37, + "name" : "Sophia" + }, + { + "age" : 37, + "name" : "John" + }, + { + "age" : 36, + "name" : "Emma" + }, + { + "age" : 36, + "name" : "Fred" + }, + { + "age" : 34, + "name" : "Madison" + } +] +``` diff --git a/site/content/arangodb/oem/aql/examples-and-query-patterns/queries-without-collections.md b/site/content/arangodb/oem/aql/examples-and-query-patterns/queries-without-collections.md new file mode 100644 index 0000000000..3e1dcd8225 --- /dev/null +++ b/site/content/arangodb/oem/aql/examples-and-query-patterns/queries-without-collections.md @@ -0,0 +1,50 @@ +--- +title: AQL queries without collections +menuTitle: Queries without collections +weight: 50 +description: >- + You can use AQL with only expressions and no document access for some + calculation and testing purposes +--- +AQL queries typically access one or more collections to read from documents +or to modify them. Queries don't necessarily have to involve collections +however. Below are a few examples of that. + +Following is a query that returns a string value. The result string is contained in an array +because the result of every valid query is an array: + +```aql +--- +name: aqlWithoutCollections_1 +description: '' +--- +RETURN "this will be returned" +``` + +You may use variables, call functions and return arbitrarily structured results: + +```aql +--- +name: aqlWithoutCollections_2 +description: '' +--- +LET array = [1, 2, 3, 4] +RETURN { array, sum: SUM(array) } +``` + +Language constructs such as the FOR loop can be used too. Below query +creates the Cartesian product of two arrays and concatenates the value pairs: + +```aql +--- +name: aqlWithoutCollections_3 +description: '' +--- +FOR year IN [ 2011, 2012, 2013 ] + FOR quarter IN [ 1, 2, 3, 4 ] + RETURN { + year, + quarter, + formatted: CONCAT(quarter, " / ", year) + } +``` diff --git a/site/content/arangodb/oem/aql/examples-and-query-patterns/remove-vertex.md b/site/content/arangodb/oem/aql/examples-and-query-patterns/remove-vertex.md new file mode 100644 index 0000000000..e80cf8f390 --- /dev/null +++ b/site/content/arangodb/oem/aql/examples-and-query-patterns/remove-vertex.md @@ -0,0 +1,81 @@ +--- +title: Remove vertices with AQL +menuTitle: Remove vertex +weight: 45 +description: >- + Removing connected edges along with vertex documents directly in AQL is + possible in a limited way +--- +Deleting vertices with associated edges is currently not handled via AQL while +the [graph management interface](../../graphs/general-graphs/management.md#remove-a-vertex) +and the +[REST API for the graph module](../../develop/http-api/graphs/named-graphs.md#remove-a-vertex) +offer a vertex deletion functionality. +However, as shown in this example based on the +[Knows Graph](../../graphs/example-graphs.md#knows-graph), a query for this +use case can be created. + +![Example Graph](../../../../images/knows_graph.png) + +When deleting vertex **eve** from the graph, we also want the edges +`eve -> alice` and `eve -> bob` to be removed. +The involved graph and its only edge collection has to be known. In this case it +is the graph **knows_graph** and the edge collection **knows**. 
+
+This query will delete **eve** with its adjacent edges:
+
+```aql
+---
+name: GRAPHTRAV_removeVertex1
+description: ''
+dataset: knows_graph
+---
+LET edgeKeys = (FOR v, e IN 1..1 ANY 'persons/eve' GRAPH 'knows_graph' RETURN e._key)
+LET r = (FOR key IN edgeKeys REMOVE key IN knows)
+REMOVE 'eve' IN persons
+```
+
+This query executes several actions:
+- use a graph traversal of depth 1 to get the `_key` of **eve's** adjacent edges
+- remove all of these edges from the `knows` collection
+- remove vertex **eve** from the `persons` collection
+
+The following query shows a different design to achieve the same result:
+
+```aql
+---
+name: GRAPHTRAV_removeVertex2
+description: ''
+dataset: knows_graph
+---
+LET edgeKeys = (FOR v, e IN 1..1 ANY 'persons/eve' GRAPH 'knows_graph'
+  REMOVE e._key IN knows)
+REMOVE 'eve' IN persons
+```
+
+**Note**: The query has to be adjusted to match a graph with multiple vertex/edge collections.
+
+For example, the [City Graph](../../graphs/example-graphs.md#city-graph)
+contains several vertex collections - `germanCity` and `frenchCity` - and several
+edge collections - `frenchHighway`, `germanHighway`, and `internationalHighway`.
+
+![Example Graph2](../../../../images/cities_graph.png)
+
+To delete the city **Berlin**, all edge collections (`frenchHighway`, `germanHighway`,
+and `internationalHighway`) have to be considered. The **REMOVE** operation has to be
+applied to all edge collections with `OPTIONS { ignoreErrors: true }`. Not using this
+option stops the query whenever a non-existing key should be removed from a collection.
+
+```aql
+---
+name: GRAPHTRAV_removeVertex3
+description: ''
+dataset: routeplanner
+---
+LET edgeKeys = (FOR v, e IN 1..1 ANY 'germanCity/Berlin' GRAPH 'routeplanner' RETURN e._key)
+LET r = (FOR key IN edgeKeys REMOVE key IN internationalHighway
+  OPTIONS { ignoreErrors: true } REMOVE key IN germanHighway
+  OPTIONS { ignoreErrors: true } REMOVE key IN frenchHighway
+  OPTIONS { ignoreErrors: true })
+REMOVE 'Berlin' IN germanCity
+```
diff --git a/site/content/arangodb/oem/aql/examples-and-query-patterns/traversals.md b/site/content/arangodb/oem/aql/examples-and-query-patterns/traversals.md
new file mode 100644
index 0000000000..08296c64e4
--- /dev/null
+++ b/site/content/arangodb/oem/aql/examples-and-query-patterns/traversals.md
@@ -0,0 +1,118 @@
+---
+title: Combining AQL Graph Traversals
+menuTitle: Traversals
+weight: 40
+description: >-
+  You can combine graph queries with other AQL features like geo-spatial search
+---
+## Finding the start vertex via a geo query
+
+Our first example will locate the start vertex for a graph traversal via [a geo index](../../index-and-search/indexing/working-with-indexes/geo-spatial-indexes.md).
+We use the [City Graph](../../graphs/example-graphs.md#city-graph) and its geo indexes:
+
+![Cities Example Graph](../../../../images/cities_graph.png)
+
+```js
+---
+name: COMBINING_GRAPH_01_create_graph
+description: ''
+---
+var examples = require("@arangodb/graph-examples/example-graph");
+examples.loadGraph("routeplanner");
+~examples.dropGraph("routeplanner");
+```
+
+We search for all German cities within a range of 400 km around the former capital **Bonn** and find **Hamburg** and **Cologne**.
+We won't find **Paris** since it's in the `frenchCity` collection.
+ +```aql +--- +name: COMBINING_GRAPH_02_show_geo +description: '' +dataset: routeplanner +bindVars: + { + "bonn": [7.0998, 50.7340], + "radius": 400000 + } +--- +FOR startCity IN germanCity + FILTER GEO_DISTANCE(@bonn, startCity.geometry) < @radius + RETURN startCity._key +``` + +Let's revalidate that the geo indexes are actually used: + +```aql +--- +name: COMBINING_GRAPH_03_explain_geo +description: '' +dataset: routeplanner +explain: true +bindVars: + { + "bonn": [7.0998, 50.7340], + "radius": 400000 + } +--- +FOR startCity IN germanCity + FILTER GEO_DISTANCE(@bonn, startCity.geometry) < @radius + RETURN startCity._key +``` + +And now combine this with a graph traversal: + +```aql +--- +name: COMBINING_GRAPH_04_combine +description: '' +dataset: routeplanner +bindVars: + { + "bonn": [7.0998, 50.7340], + "radius": 400000 + } +--- +FOR startCity IN germanCity + FILTER GEO_DISTANCE(@bonn, startCity.geometry) < @radius + FOR v, e, p IN 1..1 OUTBOUND startCity + GRAPH 'routeplanner' + RETURN {startcity: startCity._key, traversedCity: v._key} +``` + +The geo index query returns us `startCity` (**Cologne** and **Hamburg**) which we then use as starting point for our graph traversal. +For simplicity we only return their direct neighbours. We format the return result so we can see from which `startCity` the traversal came. + +Alternatively we could use a `LET` statement with a subquery to group the traversals by their `startCity` efficiently: + +```aql +--- +name: COMBINING_GRAPH_05_combine_let +description: '' +dataset: routeplanner +bindVars: + { + "bonn": [7.0998, 50.7340], + "radius": 400000 + } +--- +FOR startCity IN germanCity + FILTER GEO_DISTANCE(@bonn, startCity.geometry) < @radius + LET oneCity = ( + FOR v, e, p IN 1..1 OUTBOUND startCity + GRAPH 'routeplanner' RETURN v._key + ) + RETURN {startCity: startCity._key, connectedCities: oneCity} +``` + +Finally, we clean up again: + +```js +--- +name: COMBINING_GRAPH_06_cleanup +description: '' +--- +~var examples = require("@arangodb/graph-examples/example-graph"); +~var g = examples.loadGraph("routeplanner"); +examples.dropGraph("routeplanner"); +``` diff --git a/site/content/arangodb/oem/aql/examples-and-query-patterns/upsert-repsert-guide.md b/site/content/arangodb/oem/aql/examples-and-query-patterns/upsert-repsert-guide.md new file mode 100644 index 0000000000..7068acebbe --- /dev/null +++ b/site/content/arangodb/oem/aql/examples-and-query-patterns/upsert-repsert-guide.md @@ -0,0 +1,335 @@ +--- +title: Conditionally Inserting and Modifying Documents +menuTitle: Upsert / Repsert Guide +weight: 60 +description: >- + AQL offers an `UPSERT` operation and an `INSERT` operation with different + overwrite modes, and you can alternatively use the Document API, each having + different features and performance characteristics +--- +A common requirement when ingesting data is to ensure that certain documents +exist in a collection. Oftentimes when running a command it is unclear whether +the target documents are already present in the collection or need to be +inserted first. + +Unconditional `INSERT` operations will not work here, because they may run +into errors if the target documents already exist. This will trigger a +"unique constraint violation" error. Unconditional `UPDATE` or `REPLACE` +operations will also fail, because they require that the target documents are +already present. If this is not the case, the operations would run into +"document not found" errors. 
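+
+As a minimal arangosh sketch of these two failure modes (assuming an empty
+collection named `pages`; the document contents are just for illustration):
+
+```js
+db._create("pages");
+
+// The first insert succeeds and creates the document.
+db.pages.save({ _key: "index.html", status: "created" });
+
+// A second insert with the same _key fails with a
+// "unique constraint violated" error.
+db.pages.save({ _key: "index.html", status: "created" });
+
+// Updating a document that does not exist fails with a
+// "document not found" error.
+db.pages.update("missing.html", { status: "updated" });
+```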
+
+So what needs to be run instead are conditional inserts/updates/replaces, also
+called _upserts_ or _repserts_. The behavior of such operations is:
+
+- Check if a document exists, based on some criteria
+- If it does not exist, create the document
+- If it exists, update or replace it with a new version
+
+ArangoDB provides the following options in AQL to achieve this:
+
+- `UPSERT` AQL operation
+- `INSERT` AQL operation with `overwriteMode`
+- Insert operation not using AQL, but the Document REST API
+
+These alternatives have different capabilities and performance characteristics.
+
+## `UPSERT` AQL Operation
+
+Let us start with the [`UPSERT` AQL operation](../high-level-operations/upsert.md),
+which is very generic and flexible.
+
+The purpose of the `UPSERT` AQL operation is to ensure that a specific document
+exists after the operation has finished.
+
+`UPSERT` will look for a specific document, based on user-configurable
+attributes/values, and create the document if it does not yet exist.
+If `UPSERT` finds such a document, it can partially adjust it (`UPDATE`) or fully
+replace it (`REPLACE`).
+
+To recap, the syntaxes of AQL `UPSERT` are, depending on whether you want to
+update or replace a document:
+
+```aql
+UPSERT <search-expression>
+INSERT <insert-expression>
+UPDATE <update-expression>
+IN <collection> OPTIONS <options>
+```
+
+or
+
+```aql
+UPSERT <search-expression>
+INSERT <insert-expression>
+REPLACE <replace-expression>
+IN <collection> OPTIONS <options>
+```
+
+The `OPTIONS` part is optional.
+
+An example `UPSERT` operation looks like this:
+
+```aql
+UPSERT { page: "index.html" }
+INSERT { page: "index.html", status: "inserted" }
+UPDATE { status: "updated" }
+IN pages
+```
+
+This will look for a document in the `pages` collection with the `page`
+attribute having a value of `index.html`. If no such document can be found, the
+`INSERT` part will be executed, which will create a document with the `page` and
+`status` attributes. If the operation finds an existing document with `page`
+being `index.html`, it will execute the `UPDATE` part, which will set the
+document's `status` attribute to `updated`.
+
+### Tracking Modification Dates
+
+The `UPSERT` AQL operation is sometimes used in combination with
+date/time-keeping. For example, the following query keeps track of when a
+document was first created, and when it was last updated:
+
+```aql
+UPSERT { page: "index.html" }
+INSERT { page: "index.html", created: DATE_NOW() }
+UPDATE { updated: DATE_NOW() }
+IN pages
+```
+
+### `OLD` variable
+
+The `UPSERT` AQL operation also provides a pseudo-variable named `OLD` to refer
+to the existing document and its values in the `UPDATE`/`REPLACE` part.
+Following is an example that increments a counter on a document whenever the
+`UPSERT` operation is executed:
+
+```aql
+UPSERT { page: "index.html" }
+INSERT { page: "index.html", hits: 1 }
+UPDATE { hits: OLD.hits + 1 }
+IN pages
+```
+
+### `UPSERT` Caveats
+
+`UPSERT` is a very flexible operation, so some things should be kept in mind to
+use it effectively and efficiently.
+
+#### Repeat the Search Attributes
+
+First of all, the `INSERT` part of an `UPSERT` operation should contain all
+attributes that are used in the search expression. Consider the following
+counter-example:
+
+```aql
+UPSERT { page: "index.html" }
+INSERT { status: "inserted" } /* page attribute missing here! */
+UPDATE { status: "updated" }
+IN pages
+```
+
+Forgetting to specify the search attributes in the `INSERT` part introduces a
+problem: The first time the `UPSERT` is executed and does not find a document
+with `page` being `index.html`, it will branch into the `INSERT` part as
+expected.
However, the `INSERT` part will create a document with only the
+`status` attribute set. The `page` attribute is missing here, so when the
+`INSERT` completes, there is still no document with `page` being `index.html`.
+That means whenever this `UPSERT` statement executes, it will branch into the
+`INSERT` part, and the `UPDATE` part will never be reached. This is likely
+unintentional.
+
+The problem can easily be avoided by adding the search attributes to the
+`INSERT` part:
+
+```aql
+UPSERT { page: "index.html" }
+INSERT { page: "index.html", status: "inserted" }
+UPDATE { status: "updated" }
+IN pages
+```
+
+Note that it is not necessary to repeat the search attributes in the `UPDATE`
+part, because `UPDATE` is a partial update. It will only set the attributes that
+are specified in the `UPDATE` part, and leave all other existing attributes
+alone. However, it is necessary to repeat the search attributes in the `REPLACE`
+part, because `REPLACE` will completely overwrite the existing document with
+what is specified in the `REPLACE` part.
+
+That means when using the `REPLACE` operation, the query should look like:
+
+```aql
+UPSERT { page: "index.html" }
+INSERT { page: "index.html", status: "inserted" }
+REPLACE { page: "index.html", status: "updated" }
+IN pages
+```
+
+#### Use Indexes for Search Attributes
+
+A downside of `UPSERT`'s flexibility is that it can be used on arbitrary
+collection attributes, even if those are not indexed.
+
+When the `UPSERT` looks for an existing document, it _will_ use an index if an
+index exists, but will also continue if no index exists. In the latter case,
+the `UPSERT` executes a full collection scan, which can be expensive for
+large collections. So it is advised to create an index on the search
+attribute(s) used in an `UPSERT`.
+
+#### `UPSERT` is Non-Atomic
+
+The overall `UPSERT` operation does not execute atomically for a single document.
+It is basically a document lookup followed by either a document insert, update,
+or replace operation.
+
+That means if multiple `UPSERT` operations run concurrently with the same search
+values, they may all determine that the target document does not exist - and
+then all decide to create such a document. As a result, you end up with
+multiple instances of the target document afterwards.
+
+To avoid such concurrency issues, a unique index can be created on the search
+attribute(s). Such an index prevents concurrent `UPSERT` operations from
+creating identical documents. Instead, only one of the concurrent `UPSERT`s will
+succeed, whereas the others will fail with a "unique constraint violated" error.
+In that case the client application can either retry the operation (which then
+should go into the `UPDATE`/`REPLACE` branch), or ignore the error if the goal
+was only to ensure the target document exists.
+
+Using a unique index on the search attribute(s) will thus improve lookup
+performance and avoid duplicates.
+
+#### Using Shard Key(s) for Lookups
+
+In a cluster setup, the search expression should contain the shard key(s), as
+this allows the lookup to be sent to a single shard only. This is more
+efficient than having to execute the lookup on all the shards of the collection.
+
+Another benefit of using the shard key(s) in the search expression is that
+unique indexes are only supported if they contain the shard key(s).
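+
+As a sketch of such a unique index, using the `page` search attribute from the
+earlier examples (in arangosh; remember that in a cluster the collection would
+also need to be sharded by `page` for this to be possible):
+
+```js
+// A unique persistent index on the search attribute speeds up the
+// UPSERT lookup and prevents concurrent UPSERTs from creating
+// duplicate documents - all but one fail with a
+// "unique constraint violated" error instead.
+db.pages.ensureIndex({ type: "persistent", fields: ["page"], unique: true });
+```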
+
+## `INSERT` AQL Operation with `overwriteMode`
+
+While the `UPSERT` AQL operation is very powerful and flexible, it is often not
+the ideal choice for high-volume ingestion.
+
+A much more efficient alternative to the `UPSERT` AQL operation is the
+[`INSERT` AQL operation](../high-level-operations/insert.md) with the `overwriteMode`
+attribute set. This operation is not a drop-in replacement for `UPSERT`, but
+rather a fast alternative in case the document key (`_key` attribute) is known
+when the operation is executed, and none of the old values need to be referenced.
+
+The general syntax of the `INSERT` AQL operation is:
+
+```aql
+INSERT <document>
+IN <collection> OPTIONS <options>
+```
+
+As we will deal with the `overwriteMode` option here, we are focusing on
+`INSERT` operations with this option set, for example:
+
+```aql
+INSERT { _key: "index.html", status: "created" }
+IN pages OPTIONS { overwriteMode: "ignore" }
+```
+
+Regardless of the selected `overwriteMode`, the `INSERT` operation will insert
+the document if no document exists in the collection with the specified `_key`.
+In this aspect it behaves like a regular `INSERT` operation.
+
+However, if a document with the specified `_key` already exists in the
+collection, the `INSERT` behavior will be as follows, depending on the selected
+`overwriteMode`:
+
+- `conflict` (default): if a document with the specified `_key` exists, return
+  a "unique constraint violation" error.
+- `ignore`: if a document with the specified `_key` exists, do nothing. In
+  particular, do not report a "unique constraint violation" error.
+- `update`: if a document with the specified `_key` exists, (partially) update
+  the document with the attributes specified.
+- `replace`: if a document with the specified `_key` exists, fully replace the
+  document with the attributes specified.
+
+If no `overwriteMode` is specified, the behavior of an `INSERT` operation is as
+if the `overwriteMode` was set to `conflict`.
+
+The benefit of using `INSERT` with `overwriteMode` set to `ignore`, `update`, or
+`replace` is that the `INSERT` operation is going to be very fast, especially in
+comparison with the `UPSERT` operation. In addition, `INSERT` will do a lookup
+using the `_key` attribute, which is always indexed. So it will always use the
+primary index and never do full collection scans. It also does not require
+setting up additional indexes, because the primary index is automatically
+present for all collections.
+
+There are also a few caveats when working with `INSERT` AQL operations:
+
+- They can only be used when the value of the `_key` attribute is known at the
+  time of insert. That means the client application must be able to provide the
+  document keys in a deterministic way.
+
+- The values that can be used for the `_key` attribute have some character and
+  length restrictions, but alphanumeric keys work well.
+
+- In a cluster setup, the underlying collection must be sharded by `_key`. This
+  is the default shard key, however.
+
+- There is no access to the data of an existing document for arbitrary
+  calculations when going into the `update` or `replace` mode.
+
+Please note that even though the `INSERT` AQL operation cannot refer to existing
+documents to calculate values for updating/replacing, it can still return the
+previous version of the document in case the document is already present.
+This can be achieved by appending a `RETURN OLD` to the `INSERT` operation,
+e.g.
+ +```aql +INSERT { _key: "index.html", status: "created" } +IN pages OPTIONS { overwriteMode: "replace" } +RETURN OLD +``` + +It is also possible to return the new version of the document (the inserted +document if no previous document existed, or the updated/replaced version in +case a document already existed) by using `RETURN NEW`: + +```aql +INSERT { _key: "index.html", status: "created" } +IN pages OPTIONS { overwriteMode: "replace" } +RETURN NEW +``` + +## Insert Operation not Using AQL + +There is the option to execute an insert operation with `overwriteMode` outside +of AQL. The [`POST /_api/document/{collection}`](../../develop/http-api/documents.md#create-multiple-documents) +endpoint is a dedicated REST API for insert operations, which can handle one +document, or multiple documents at once. + +Conceptually this API behaves like the `INSERT` AQL operation, but it can be +called with a batch of documents at once. This is the most efficient solution, +and should be preferred if possible. + +Most ArangoDB drivers also provide a means to insert multiple documents at once, +which will internally call this same REST API. + +The REST API provides the `returnOld` and `returnNew` options to make it return +the previous versions of documents or the insert/updated/replaced documents, in +the same way as the `INSERT` AQL operation can do. + +AQL `INSERT` queries with the `optimize-cluster-multiple-document-operations` +optimization applied perform similarly well in cluster deployments, but it +cannot be applied in all cases (see the list of +[optimizer rules](../execution-and-performance/query-optimization.md#optimize-cluster-multiple-document-operations) +for details). + +## Summary + +The `UPSERT` AQL operation is the most flexible way to conditionally insert or +update/replace documents in ArangoDB, but it is also the least efficient variant. + +The `INSERT` AQL operation with the `overwriteMode` set will outperform +`UPSERT`, but it can only be used for some use cases. + +Using the dedicated REST API for document inserts will be even more efficient, +and is thus the preferred option for bulk document inserts, but AQL `INSERT` +queries can be almost as fast. diff --git a/site/content/arangodb/oem/aql/execution-and-performance/_index.md b/site/content/arangodb/oem/aql/execution-and-performance/_index.md new file mode 100644 index 0000000000..305ecfedb8 --- /dev/null +++ b/site/content/arangodb/oem/aql/execution-and-performance/_index.md @@ -0,0 +1,7 @@ +--- +title: AQL Execution and Performance +menuTitle: Execution and Performance +weight: 50 +description: >- + This chapter describes AQL features related to query execution and query performance +--- diff --git a/site/content/arangodb/oem/aql/execution-and-performance/caching-query-results.md b/site/content/arangodb/oem/aql/execution-and-performance/caching-query-results.md new file mode 100644 index 0000000000..8e76741ee5 --- /dev/null +++ b/site/content/arangodb/oem/aql/execution-and-performance/caching-query-results.md @@ -0,0 +1,228 @@ +--- +title: The AQL query results cache +menuTitle: Caching query results +weight: 30 +description: >- + AQL provides an optional query results cache in single server deployments +--- +The purpose of the query results cache is to avoid repeated calculation of the same +query results. It is useful if data-reading queries repeat a lot and there are +not many write queries. 
+ +The query results cache is transparent so users do not need to manually invalidate +results in it if underlying collection data are modified. + +{{< info >}} +The AQL query results cache is only available for single servers, i.e. servers that +are not part of a cluster setup. +{{< /info >}} + +## Modes + +The cache can be operated in the following modes: + +- `off`: The cache is disabled. No query results are stored. +- `on`: The cache stores the results of all AQL queries unless the `cache` + query option is set to `false`. +- `demand`: The cache stores the results of AQL queries that have the + `cache` query option set to `true` but ignores all others. + +The mode can be set at server startup as well as at runtime, see +[Global configuration](#global-configuration). + +## Query eligibility + +The query results cache considers two queries identical if they have exactly the +same query string and the same bind variables. Any deviation in terms of whitespace, +capitalization etc. is considered a difference. The query string is hashed +and used as the cache lookup key. If a query uses bind parameters, these are also +hashed and used as part of the cache lookup key. + +Even if the query strings of two queries are identical, the query results cache +treats them as different queries if they have different bind parameter +values. Other components that become part of a query's cache key are the +`count`, `fullCount`, and `optimizer` attributes. + +If the cache is enabled, it is checked whether it has a result ready for a +particular query at the very start of processing the query request. If this is +the case, the query result is served directly from the cache, which is normally +very efficient. If the query cannot be found in the cache, it is executed +as usual. + +If the query is eligible for caching and the cache is enabled, the query +result is stored in the query results cache so it can be used for subsequent +executions of the same query. + +A query is eligible for caching only if all of the following conditions are met: + +- The server the query executes on is a single server (i.e. not part of a cluster). +- The query is a read-only query and does not modify data in any collection. +- No warnings were produced while executing the query. +- The query is deterministic and only uses deterministic functions whose results + are marked as cacheable. +- The size of the query result does not exceed the cache's configured maximal + size for individual cache results or cumulated results. +- The query is not executed using a streaming cursor (`"stream": true` query option). + +The usage of non-deterministic functions leads to a query not being cacheable. +This is intentional to avoid caching of function results which should rather +be calculated on each invocation of the query (e.g. `RAND()` or `DATE_NOW()`). + +The query results cache considers all user-defined AQL functions to be non-deterministic +as it has no insight into these functions. + +## Cache invalidation + +The cached results are fully or partially invalidated automatically if +queries modify the data of collections that were used during the computation of +the cached query results. This is to protect users from getting stale results +from the query results cache. + +This also means that if the cache is turned on, then there is an additional +cache invalidation check for each data-modification operation (e.g. insert, update, +remove, truncate operations as well as AQL data-modification queries). 
+
+**Example**
+
+If the result of the following query is present in the query results cache,
+then modifying data in either the `users` or the `organizations` collection
+removes the already computed result from the cache:
+
+```aql
+FOR user IN users
+  FOR organization IN organizations
+    FILTER user.organization == organization._key
+    RETURN { user: user, organization: organization }
+```
+
+Modifying data in other unrelated collections does not lead to this
+query result being removed from the cache.
+
+## Performance considerations
+
+The query results cache is organized as a hash table, so looking up whether a query result
+is present in the cache is fast. Still, the query string and the bind
+parameters used in the query need to be hashed. This is a slight overhead that
+is not present if the cache is disabled or a query is marked as not cacheable.
+
+Additionally, storing query results in the cache and fetching results from the
+cache requires locking via a read/write lock. While many threads can read in parallel from
+the cache, there can only be a single modifying thread at any given time. Modifications
+of the query cache contents are required when a query result is stored in the cache
+or during cache invalidation after data-modification operations. Cache invalidation
+requires time proportional to the number of cached items that need to be invalidated.
+
+There may be workloads in which enabling the query results cache leads to a performance
+degradation. It is not recommended to turn the query results cache on in workloads that only
+modify data, or that modify data more often than reading it. Enabling the cache
+also provides no benefit if queries are very diverse and do not repeat often.
+In read-only or read-mostly workloads, the cache is beneficial if the same
+queries are repeated lots of times.
+
+In general, the query results cache provides the biggest improvements for queries with
+small result sets that take long to calculate. If query results are very big and
+most of the query time is spent on copying the result from the cache to the client,
+then the cache does not provide much benefit.
+
+## Global configuration
+
+The query results cache can be configured at server start with the
+[`--query.cache-mode`](../../components/arangodb-server/options.md#--querycache-mode)
+startup option.
+
+The cache mode can also be changed at runtime using the JavaScript API as follows:
+
+```js
+require("@arangodb/aql/cache").properties({ mode: "on" });
+```
+
+The maximum number of cached results in the cache for each database can be configured
+at server start using the following configuration parameters:
+
+- `--query.cache-entries`: The maximum number of results in the query results cache per database
+- `--query.cache-entries-max-size`: The maximum cumulated size of results in the query results cache per database
+- `--query.cache-entry-max-size`: The maximum size of an individual result entry in the query results cache
+- `--query.cache-include-system-collections`: Whether to include system collection queries in the query results cache
+
+These parameters can be used to put an upper bound on the number and size of query
+results in each database's query cache and thus restrict the cache's memory consumption.
+
+These values can also be adjusted at runtime as follows:
+
+```js
+require("@arangodb/aql/cache").properties({
+  maxResults: 200,
+  maxResultsSize: 8 * 1024 * 1024,
+  maxEntrySize: 1024 * 1024,
+  includeSystem: false
+});
+```
+
+The above settings limit the number of cached results in the query results cache to 200
+results per database, and to 8 MiB cumulated query result size per database. The maximum
+size of each query cache entry is restricted to 1 MiB. Queries that involve system
+collections are excluded from caching.
+
+You can also change the configuration at runtime with the
+[HTTP API](../../develop/http-api/queries/aql-query-results-cache.md).
+
+## Per-query configuration
+
+When a query is sent to the server for execution and the cache is set to `on` or `demand`,
+the query executor checks the query's `cache` option. If the query cache mode is
+`on`, then not setting this query option or setting it to anything but `false` makes the
+query executor consult the query results cache. If the query cache mode is `demand`, then setting
+the `cache` option to `true` makes the executor look for the query in the query results cache.
+When the query cache mode is `off`, the executor does not look for the query in the cache.
+
+The `cache` attribute can be set as follows via the `db._createStatement()` function:
+
+```js
+var stmt = db._createStatement({
+  query: "FOR doc IN users LIMIT 5 RETURN doc",
+  options: {
+    cache: true
+  }
+});
+
+stmt.execute();
+```
+
+When using the `db._query()` function, the `cache` attribute can be set as follows:
+
+```js
+db._query("FOR doc IN users LIMIT 5 RETURN doc", {}, { cache: true });
+```
+
+You can also set the `cache` query option in the
+[HTTP API](../../develop/http-api/queries/aql-queries.md#create-a-cursor).
+
+Each query result returned contains a `cached` attribute. It is set to `true`
+if the result was retrieved from the query results cache, and `false` otherwise. Clients can use
+this attribute to check if a specific query was served from the cache or not.
+
+## Query results cache inspection
+
+The contents of the query results cache can be checked at runtime using the cache's
+`toArray()` function:
+
+```js
+require("@arangodb/aql/cache").toArray();
+```
+
+This returns a list of all query results stored in the current database's query
+results cache.
+
+The query results cache for the current database can be cleared at runtime using the
+cache's `clear()` function:
+
+```js
+require("@arangodb/aql/cache").clear();
+```
+
+## Restrictions
+
+Query results that are returned from the query results cache may contain execution statistics
+stemming from the initial, uncached query execution. This means that for cached query results,
+the `extra.stats` attribute may contain stale data, especially in terms of the `executionTime`
+and `profile` attribute values.
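+
+To tie the above together, a short arangosh sketch (assuming a collection named
+`users` exists) could switch the cache to `demand` mode, opt a query into
+caching, and then inspect the cache:
+
+```js
+var cache = require("@arangodb/aql/cache");
+
+// Enable the cache on demand and run a query that opts in to caching.
+cache.properties({ mode: "demand" });
+db._query("FOR doc IN users LIMIT 5 RETURN doc", {}, { cache: true });
+
+// The result of the first execution is now stored in the cache and
+// is listed by toArray().
+cache.toArray();
+
+// Clear the cache for the current database again.
+cache.clear();
+```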
diff --git a/site/content/arangodb/oem/aql/execution-and-performance/explaining-queries.md b/site/content/arangodb/oem/aql/execution-and-performance/explaining-queries.md
new file mode 100644
index 0000000000..b51e33989d
--- /dev/null
+++ b/site/content/arangodb/oem/aql/execution-and-performance/explaining-queries.md
@@ -0,0 +1,278 @@
+---
+title: Explain AQL Queries
+menuTitle: Explaining queries
+weight: 15
+description: >-
+  You can explain and profile AQL queries to inspect the execution plans and to
+  understand the performance characteristics, as well as create debug packages
+  for reporting issues
+# Undocumented on purpose:
+# require("@arangodb/aql/explainer").explainRegisters(data, options, shouldPrint);
+# require("@arangodb/aql/explainer").debug(query, bindVars, options);
+---
+If it is unclear how a given query will perform, clients can retrieve a query's execution plan
+from the AQL query optimizer without actually executing the query. Getting the query execution
+plan from the optimizer is called *explaining*.
+
+An explain throws an error if the given query is syntactically invalid. Otherwise, it
+returns the execution plan and some information about what optimizations could be applied to
+the query. The query is not executed.
+
+You can explain a query using the [HTTP REST API](../../develop/http-api/queries/aql-queries.md#explain-an-aql-query)
+or via _arangosh_.
+
+## Inspecting query plans
+
+The `explain()` method of an `ArangoStatement` (`db._createStatement(...).explain()`)
+creates very verbose output. To get a human-readable output of the query plan,
+you can use `db._explain()`. You can use it as follows (without disabling syntax
+highlighting with `{ colors: false }`):
+
+```js
+---
+name: 01_workWithAQL_databaseExplain
+description: ''
+---
+db._explain("LET s = SLEEP(0.25) LET t = SLEEP(0.5) RETURN 1", {}, {colors: false});
+```
+
+The plan contains all execution nodes that are used during a query. These nodes represent different
+stages in a query. Each stage gets its input from the stage directly above (its dependencies).
+The plan shows you the estimated number of items (results) for each query stage (under **Est.**). Each
+query stage roughly equates to a line in your original query, which you can see under **Comment**.
+
+## Profiling queries
+
+With complex queries, it can be unclear where time is spent during execution,
+even for intermediate ArangoDB users.
+
+When you profile a query, it is executed with special instrumentation code enabled.
+This gives you all the usual information as when explaining a query, but
+additionally you get the query profile, [runtime statistics](query-statistics.md),
+and per-node statistics.
+
+To use this in an interactive fashion in the shell, you can call
+`db._profileQuery()`, or use the web interface. You can use `db._profileQuery()`
+as follows (without disabling syntax highlighting with `{ colors: false }`):
+
+```js
+---
+name: 01_workWithAQL_databaseProfileQuery
+description: ''
+---
+db._profileQuery("LET s = SLEEP(0.25) LET t = SLEEP(0.5) RETURN 1", {}, {colors: false});
+```
+
+For more information, see [Profiling Queries](query-profiling.md).
+
+## Execution plans in detail
+
+By default, the query optimizer returns what it considers to be the *optimal plan*. The
+optimal plan is returned in the `plan` attribute of the result. If `explain` is
+called with the `allPlans` option set to `true`, all plans are returned in the `plans`
+attribute.
+ +The result object also contains a `warnings` attribute, which is an array of +warnings that occurred during optimization or execution plan creation. + +Each plan in the result is an object with the following attributes: +- `nodes`: the array of execution nodes of the plan. See the list of + [execution nodes](query-optimization.md#execution-nodes) +- `estimatedCost`: the total estimated cost for the plan. If there are multiple + plans, the optimizer chooses the plan with the lowest total cost. +- `collections`: an array of collections used in the query +- `rules`: an array of rules the optimizer applied. See the list of + [optimizer rules](query-optimization.md#optimizer-rules) +- `variables`: array of variables used in the query (note: this may contain + internal variables created by the optimizer) + +Here is an example for retrieving the execution plan of a simple query: + +```js +--- +name: 07_workWithAQL_statementsExplain +description: '' +--- +var stmt = db._createStatement("FOR user IN _users RETURN user"); +stmt.explain(); +``` + +As the output of `explain()` is very detailed, it is recommended to use some +scripting to make the output less verbose: + +```js +--- +name: 08_workWithAQL_statementsPlans +description: '' +--- +var formatPlan = function (plan) { + return { estimatedCost: plan.estimatedCost, + nodes: plan.nodes.map(function(node) { + return node.type; }) }; }; +formatPlan(stmt.explain().plan); +``` + +If a query contains bind parameters, they must be added to the statement **before** +`explain()` is called: + +```js +--- +name: 09_workWithAQL_statementsPlansBind +description: '' +--- +var stmt = db._createStatement( + `FOR doc IN @@collection FILTER doc.user == @user RETURN doc` +); +stmt.bind({ "@collection" : "_users", "user" : "root" }); +stmt.explain(); +``` + +In some cases, the AQL optimizer creates multiple plans for a single query. By default +only the plan with the lowest total estimated cost is kept, and the other plans are +discarded. To retrieve all plans the optimizer has generated, `explain` can be called +with the option `allPlans` set to `true`. + +In the following example, the optimizer has created two plans: + +```js +--- +name: 10_workWithAQL_statementsPlansOptimizer0 +description: '' +--- +var stmt = db._createStatement( + "FOR user IN _users FILTER user.user == 'root' RETURN user"); +stmt.explain({ allPlans: true }).plans.length; +``` + +To see a slightly more compact version of the plan, the following +transformation can be applied: + +```js +--- +name: 10_workWithAQL_statementsPlansOptimizer1 +description: '' +--- +~var stmt = db._createStatement("FOR user IN _users FILTER user.user == 'root' RETURN user"); +stmt.explain({ allPlans: true }).plans.map( + function(plan) { return formatPlan(plan); }); +``` + +`explain()` also accepts the following additional options: +- `maxPlans`: limits the maximum number of plans that are created by the AQL query optimizer +- `optimizer`: + - `rules`: an array of to-be-included or to-be-excluded optimizer rules + can be put into this attribute, telling the optimizer to include or exclude + specific rules. To disable a rule, prefix its name with a `-`, to enable a rule, prefix it + with a `+`. There is also a pseudo-rule `all`, which matches all optimizer rules. 
+ +The following example disables all optimizer rules but `remove-redundant-calculations`: + +```js +--- +name: 10_workWithAQL_statementsPlansOptimizer2 +description: '' +--- +~var stmt = db._createStatement("FOR user IN _users FILTER user.user == 'root' RETURN user"); +stmt.explain({ optimizer: { + rules: [ "-all", "+remove-redundant-calculations" ] } }); +``` + +The contents of an execution plan are meant to be machine-readable. To get a human-readable +version of a query's execution plan, the following commands can be used +(without disabling syntax highlighting with `{ colors: false }`): + +```js +--- +name: 10_workWithAQL_statementsPlansOptimizer3 +description: '' +--- +var query = "FOR doc IN mycollection FILTER doc.value > 42 RETURN doc"; +require("@arangodb/aql/explainer").explain(query, {colors:false}); +``` + +The above command prints the query's execution plan in the ArangoShell +directly, focusing on the most important information. + +## Gathering debug information about a query + +If an explain provides no suitable insight into why a query does not perform as +expected, it may be reported to the ArangoDB support. In order to make this as easy +as possible, there is a built-in command in ArangoShell for packaging the query, its +bind parameters, and all data required to execute the query elsewhere. + +`require("@arangodb/aql/explainer").debugDump(filepath, query[, bindVars[, options]])` + +You can specify the following parameters: + +- `filepath` (string): A file path to save the debug package to +- `query` (string): An AQL query +- `bindVars` (object, _optional_): The bind parameters for the query +- `options` (object, _optional_): Options for the query, with two additionally + supported settings compared to `db._query()`: + - `examples` (number, _optional_): How many sample documents of your + collection data to include. Default: `0` + - `anonymize` (boolean, _optional_): Whether all string attribute values of + the sample documents shall be substituted with strings like `XXX`. + +The command stores all data in a file with a configurable filename: + +```js +--- +name: 10_workWithAQL_debugging1 +description: '' +--- +var query = "FOR doc IN mycollection FILTER doc.value > 42 RETURN doc"; +require("@arangodb/aql/explainer").debugDump("/tmp/query-debug-info", query); +``` + +Entitled users can send the generated file to the Arango support to facilitate +reproduction and debugging. + +{{< tip >}} +You can also create debug packages using the web interface, see +[Query debug packages](../../operations/troubleshooting/query-debug-packages.md). +{{< /tip >}} + +If a query contains bind parameters, you need to specify them along with the query +string: + +```js +--- +name: 10_workWithAQL_debugging2 +description: '' +--- +var query = "FOR doc IN @@collection FILTER doc.value > @value RETURN doc"; +var bindVars = { value: 42, "@collection": "mycollection" }; +require("@arangodb/aql/explainer").debugDump("/tmp/query-debug-info", query, bindVars); +``` + +It is also possible to include example documents from the underlying collection in +order to make reproduction even easier. Example documents can be sent as they are, or +in an anonymized form. The number of example documents can be specified in the `examples` +options attribute, and should generally be kept low. The `anonymize` option replaces +the contents of string attributes in the examples with `XXX`. However, it does not +replace any other types of data (e.g. numeric values) or attribute names. 
Attribute
+names in the examples are always preserved because they may be indexed and used in
+queries:
+
+```js
+---
+name: 10_workWithAQL_debugging3
+description: ''
+---
+var query = "FOR doc IN @@collection FILTER doc.value > @value RETURN doc";
+var bind = { value: 42, "@collection": "mycollection" };
+var options = { examples: 10, anonymize: true };
+require("@arangodb/aql/explainer").debugDump("/tmp/query-debug-info", query, bind, options);
+```
+
+To get a human-readable output from a debug package JSON file, you can use the
+`inspectDump()` method:
+
+`require("@arangodb/aql/explainer").inspectDump(inFilepath[, outFilepath])`
+
+You can specify the following parameters:
+
+- `inFilepath` (string): The path to the debug package JSON file
+- `outFilepath` (string, _optional_): A path to store the formatted output in a
+  file instead of printing to the shell
diff --git a/site/content/arangodb/oem/aql/execution-and-performance/parsing-queries.md b/site/content/arangodb/oem/aql/execution-and-performance/parsing-queries.md
new file mode 100644
index 0000000000..8c87fab393
--- /dev/null
+++ b/site/content/arangodb/oem/aql/execution-and-performance/parsing-queries.md
@@ -0,0 +1,32 @@
+---
+title: Parsing AQL queries
+menuTitle: Parsing queries
+weight: 10
+description: >-
+  Clients can check if given AQL queries are syntactically valid using an
+  HTTP API or JavaScript API
+---
+ArangoDB provides an [HTTP REST API](../../develop/http-api/queries/aql-queries.md)
+for parsing and thus statically validating queries.
+
+A query can also be parsed from the ArangoShell using `ArangoStatement`'s `parse` method. The
+`parse` method will throw an exception if the query is syntactically invalid. Otherwise, it will
+return some information about the query.
+
+The return value is an object with the collection names used in the query listed in the
+`collections` attribute, and all bind parameters listed in the `bindVars` attribute.
+Additionally, the internal representation of the query, the query's abstract syntax tree, will
+be returned in the `AST` attribute of the result. Please note that the abstract syntax tree
+will be returned without any optimizations applied to it.
+
+```js
+---
+name: 11_workWithAQL_parseQueries
+description: ''
+---
+var stmt = db._createStatement(
+  "FOR doc IN @@collection FILTER doc.foo == @bar RETURN doc");
+stmt.parse();
+~removeIgnoreCollection("mycollection")
+~db._drop("mycollection")
+```
diff --git a/site/content/arangodb/oem/aql/execution-and-performance/query-optimization.md b/site/content/arangodb/oem/aql/execution-and-performance/query-optimization.md
new file mode 100644
index 0000000000..919543e71e
--- /dev/null
+++ b/site/content/arangodb/oem/aql/execution-and-performance/query-optimization.md
@@ -0,0 +1,626 @@
+---
+title: The AQL query optimizer
+menuTitle: Query Optimization
+weight: 25
+description: >-
+  AQL queries are sent through an optimizer before execution that creates an
+  initial execution plan, looks for optimization opportunities, and applies them
+pageToc:
+  maxHeadlineLevel: 3
+---
+AQL queries are parsed and planned. The optimizer might produce multiple execution plans
+for a single query. It then calculates the costs for all plans and picks the plan with the
+lowest total cost. This resulting plan is considered to be the *optimal plan*, which is
+then executed.
+
+The optimizer is designed to only perform optimizations if they are *safe*, in the
+sense that an optimization should not modify the result of a query.
A notable exception +to this is that the optimizer is allowed to change the order of results for queries that +do not explicitly specify how results should be sorted. + +## Execution plans + +The `explain` command can be used to query the optimal executed plan or even all plans +the optimizer has generated. Additionally, `explain` can reveal some more information +about the optimizer's view of the query. + +### Inspecting plans using the explain helper + +The `explain` method of `ArangoStatement` as shown in the next chapters creates very verbose output. +You can work on the output programmatically, or use this handsome tool that we created +to generate a more human readable representation. + +You may use it like this: (we disable syntax highlighting here) + +```js +--- +name: AQLEXP_01_axplainer +description: '' +--- +~addIgnoreCollection("test") +~db._drop("test"); +var coll = db._create("test"); +for (i = 0; i < 100; ++i) { db.test.save({ value: i }); } +var idx = db.test.ensureIndex({ type: "persistent", fields: [ "value" ] }); +var explain = require("@arangodb/aql/explainer").explain; +explain("FOR i IN test FILTER i.value > 97 SORT i.value RETURN i.value", {colors:false}); +``` + +### Execution plans in detail + +Let's have a look at the raw json output of the same execution plan +using the `explain` method of `ArangoStatement`: + +```js +--- +name: AQLEXP_01_explainCreate +description: '' +--- +var stmt = db._createStatement("FOR i IN test FILTER i.value > 97 SORT i.value RETURN i.value"); +stmt.explain(); +``` + +As you can see, the result details are very verbose. They are not covered in +detail for brevity in the next sections. Instead, let's take a closer look at +the results step by step. + +#### Execution nodes of a query + +In general, an execution plan can be considered to be a pipeline of processing steps. +Each processing step is carried out by a so-called *execution node* + +The `nodes` attribute of the `explain` result contains these *execution nodes* in +the *execution plan*. The output is still very verbose, so here's a shorted form of it: + +```js +--- +name: AQLEXP_02_explainOverview +description: '' +--- +~var stmt = db._createStatement("FOR i IN test FILTER i.value > 97 SORT i.value RETURN i.value"); +stmt.explain().plan.nodes.map(function (node) { return node.type; }); +``` + +Note that the list of nodes might slightly change in future versions of ArangoDB if +new execution node types get added or the optimizer create somewhat more +optimized plans. + +When a plan is executed, the query execution engine starts with the node at +the bottom of the list (i.e. the `ReturnNode`). + +The `ReturnNode`'s purpose is to return data to the caller. It does not produce +data itself, but it asks the node above itself, which is the `CalculationNode` +in our example. +`CalculationNode`s are responsible for evaluating arbitrary expressions. In our +example query, the `CalculationNode` evaluates the value of `i.value`, which +is needed by the `ReturnNode`. The calculation is applied for all data the +`CalculationNode` gets from the node above it, in our example the `IndexNode`. + +Finally, all of this needs to be done for documents of collection `test`. This is +where the `IndexNode` enters the game. It uses an index (thus its name) +to find certain documents in the collection and ships it down the pipeline in the +order required by `SORT i.value`. The `IndexNode` itself has a `SingletonNode` +as its input. 
The sole purpose of a `SingletonNode` node is to provide a single empty +document as input for other processing steps. It is always the end of the pipeline. + +Here is a summary: +- SingletonNode: produces an empty document as input for other processing steps. +- IndexNode: iterates over the index on attribute `value` in collection `test` + in the order required by `SORT i.value`. +- CalculationNode: evaluates the result of the calculation `i.value > 97` to `true` or `false` +- CalculationNode: calculates return value `i.value` +- ReturnNode: returns data to the caller + +#### Optimizer rules used for a query + +Note that in the example, the optimizer has optimized the `SORT` statement away. +It can do it safely because there is a sorted persistent index on `i.value`, which it has +picked in the `IndexNode`. As the index values are iterated over in sorted order +anyway, the extra `SortNode` would have been redundant and was removed. + +Additionally, the optimizer has done more work to generate an execution plan that +avoids as much expensive operations as possible. Here is the list of optimizer rules +that were applied to the plan: + +```js +--- +name: AQLEXP_03_explainRules +description: '' +--- +~var stmt = db._createStatement("FOR i IN test FILTER i.value > 97 SORT i.value RETURN i.value"); +stmt.explain().plan.rules; +``` + +Here is the meaning of these rules in context of this query: +- `move-calculations-up`: Moves a `CalculationNode` and subqueries, when independent from the outer node, + as far up in the processing pipeline as possible. +- `move-filters-up`: Moves a `FilterNode` as far up in the processing pipeline as + possible. +- `remove-redundant-calculations`: Replaces references to variables with references to + other variables that contain the exact same result. In the example query, `i.value` + is calculated multiple times, but each calculation inside a loop iteration would + produce the same value. Therefore, the expression result is shared by several nodes. +- `remove-unnecessary-calculations`: Removes `CalculationNode`s whose result values are + not used in the query. In the example this happens due to the `remove-redundant-calculations` + rule having made some calculations unnecessary. +- `use-indexes`: Use an index to iterate over a collection instead of performing a + full collection scan. In the example case this makes sense, as the index can be + used for filtering and sorting. +- `remove-filter-covered-by-index`: Remove an unnecessary filter whose functionality + is already covered by an index. In this case the index only returns documents + matching the filter. +- `use-index-for-sort`: Removes a `SORT` operation if it is already satisfied by + traversing over a sorted index. + +Note that some rules may appear multiple times in the list, with number suffixes. +This is due to the same rule being applied multiple times, at different positions +in the optimizer pipeline. + +Also see the full list of [optimizer rules](#optimizer-rules) below. + +#### Collections used in a query + +The list of collections used in a plan (and query) is contained in the `collections` +attribute of a plan: + +```js +--- +name: AQLEXP_04_explainCollections +description: '' +--- +~var stmt = db._createStatement("FOR i IN test FILTER i.value > 97 SORT i.value RETURN i.value"); +stmt.explain().plan.collections +``` + +The `name` attribute contains the name of the `collection`, and `type` is the +access type, which can be either `read` or `write`. 
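+
+For the example query above, the list may look like the following sketch (the
+exact output can vary by ArangoDB version and deployment):
+
+```json
+[
+  {
+    "name": "test",
+    "type": "read"
+  }
+]
+```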
+
+#### Variables used in a query
+
+The optimizer returns a list of variables used in a plan (and query). This
+list contains auxiliary variables created by the optimizer itself. You can
+ignore this list in most cases.
+
+#### Cost of a query
+
+For each plan the optimizer generates, it calculates the total cost. The plan
+with the lowest total cost is considered to be the optimal plan. Costs are
+estimates only, as the actual execution costs are unknown to the optimizer.
+Costs are calculated based on heuristics that are hard-coded into execution nodes.
+Cost values do not have any unit.
+
+### Retrieving all execution plans
+
+To retrieve not just the optimal plan but a list of all plans the optimizer has
+generated, set the option `allPlans` to `true`.
+
+This returns a list of all plans in the `plans` attribute instead of in the
+`plan` attribute:
+
+```js
+---
+name: AQLEXP_05_explainAllPlans
+description: ''
+---
+~var stmt = db._createStatement("FOR i IN test FILTER i.value > 97 SORT i.value RETURN i.value");
+stmt.explain({ allPlans: true });
+```
+
+### Retrieving the plan as it was generated by the parser / lexer
+
+To retrieve the plan that most closely matches your query, you can turn off most
+optimizer rules by setting the option `rules` to `-all` (note that cluster rules
+cannot be disabled if you run the explain on a cluster Coordinator).
+
+This returns an unoptimized plan in the `plan` attribute:
+
+```js
+---
+name: AQLEXP_06_explainUnoptimizedPlans
+description: ''
+---
+~var stmt = db._createStatement("FOR i IN test FILTER i.value > 97 SORT i.value RETURN i.value");
+stmt.explain({ optimizer: { rules: [ "-all" ] } });
+```
+
+Note that some optimizations are already done at parse time (e.g. evaluating
+simple constant expressions such as `1 + 1`).
+
+## Turning specific optimizer rules off
+
+Optimizer rules can also be turned on or off individually, using the `rules` attribute.
+This can be used to enable or disable one or multiple rules. Rules that shall be enabled
+need to be prefixed with a `+`, rules to be disabled should be prefixed with a `-`. The
+pseudo-rule `all` matches all rules.
+
+Rules specified in `rules` are evaluated from left to right, so the following works to
+turn on just the one specific rule:
+
+```js
+---
+name: AQLEXP_07_explainSingleRulePlans
+description: ''
+---
+~var stmt = db._createStatement("FOR i IN test FILTER i.value > 97 SORT i.value RETURN i.value");
+stmt.explain({ optimizer: { rules: [ "-all", "+use-index-range" ] } });
+```
+
+By default, all rules are turned on. To turn off just a few specific rules, use something
+like this:
+
+```js
+---
+name: AQLEXP_08_explainDisableSingleRulePlans
+description: ''
+---
+~var stmt = db._createStatement("FOR i IN test FILTER i.value > 97 SORT i.value RETURN i.value");
+stmt.explain({ optimizer: { rules: [ "-use-index-range", "-use-index-for-sort" ] } });
+```
+
+The maximum number of plans created by the optimizer can also be limited using the
+`maxNumberOfPlans` attribute:
+
+```js
+---
+name: AQLEXP_09_explainMaxNumberOfPlans
+description: ''
+---
+~var stmt = db._createStatement("FOR i IN test FILTER i.value > 97 SORT i.value RETURN i.value");
+stmt.explain({ maxNumberOfPlans: 1 });
+```
+
+## Optimizer statistics
+
+The optimizer provides statistics as a part of an `explain` result.
+The following attributes are returned in the `stats` attribute:
+
+- `plansCreated`: The total number of plans created by the optimizer.
+- `rulesExecuted`: The number of rules executed. Note that an executed rule does
+  not indicate that a plan has actually been modified by it.
+- `rulesSkipped`: The number of rules skipped by the optimizer.
+- `executionTime`: The (wall-clock) time in seconds needed to explain the query.
+- `peakMemoryUsage`: The maximum memory usage of the query during explain.
+
+## Warnings
+
+For some queries, the optimizer may produce warnings. These are returned in
+the `warnings` attribute of the `explain` result:
+
+```js
+---
+name: AQLEXP_10_explainWarn
+description: ''
+---
+var stmt = db._createStatement("FOR i IN 1..10 RETURN 1 / 0")
+stmt.explain().warnings;
+~db._drop("test")
+~removeIgnoreCollection("test")
+```
+
+There is an upper bound on the number of warnings a query may produce. If that
+bound is reached, no further warnings are returned.
+
+## Optimization in a cluster
+
+When you are running AQL in the cluster, the parsing of the query is done on the
+Coordinator. The Coordinator then chops the query into snippets, which either
+remain on the Coordinator or need to be distributed to the shards on the
+DB-Servers over the network. The cutting sites are interconnected via `ScatterNode`s,
+`GatherNode`s, and `RemoteNode`s. These nodes mark the network borders of the snippets.
+
+The optimizer strives to reduce the amount of data transferred via these network
+interfaces by pushing `FILTER`s out to the shards, as reducing the amount of data
+to transfer over the network links is vital to query performance.
+
+{{< info >}}
+Some hops between Coordinators and DB-Servers are unavoidable. Examples are
+[user-defined functions](../user-defined-functions.md) (UDFs), which have to be executed on
+the Coordinator. If you cannot modify your query to reduce the amount of
+back and forth between sites, then try to lower the amount of data that has
+to be transferred between them. In the case of UDFs, use effective `FILTER`
+conditions before calling them.
+{{< /info >}}
+
+When using a cluster, the explain output contains a **Site** column.
+Snippets marked with **DBS** are executed on DB-Servers, **COOR** ones are
+executed on the respective Coordinator.
+
+```aql
+Query String (57 chars, cacheable: false):
+ FOR doc IN test UPDATE doc WITH { updated: true } IN test
+
+Execution plan:
+ Id   NodeType          Site  Est.      Comment
+  1   SingletonNode     DBS        1  * ROOT
+  3   CalculationNode   DBS        1    - LET #3 = { "updated" : true }
+ 13   IndexNode         DBS  1000000    - FOR doc IN test   /* primary index scan, index only, projections: `_key`, 5 shard(s) */
+  4   UpdateNode        DBS        0    - UPDATE doc WITH #3 IN test
+  7   RemoteNode        COOR       0    - REMOTE
+  8   GatherNode        COOR       0    - GATHER
+```
+
+## Execution nodes
+
+### List of execution nodes
+
+The following execution node types appear in the output of `explain`:
+
+- **CalculationNode**:
+  Evaluates an expression. The expression result may be used by
+  other nodes, such as `FilterNode`, `EnumerateListNode`, or `SortNode`.
+
+- **CollectNode**:
+  Aggregates its input and produces new output variables. This appears
+  once per `COLLECT` statement.
+
+- **EnumerateCollectionNode**:
+  Enumeration over documents of a collection (given in its *collection*
+  attribute) without using an index.
+
+- **EnumerateListNode**:
+  Enumeration over a list of (non-collection) values.
+
+- **EnumerateViewNode**:
+  Enumeration over documents of a View.
+
+- **FilterNode**:
+  Only lets values pass that satisfy a filter condition. Appears once
+  per `FILTER` statement.
+
+- **IndexNode**:
+  Enumeration over one or many indexes (given in its *indexes* attribute)
+  of a collection. The index ranges are specified in the *condition* attribute
+  of the node.
+
+- **InsertNode**:
+  Inserts documents into a collection (given in its *collection* attribute).
+  Appears exactly once in a query that contains an *INSERT* statement.
+
+- **KShortestPathsNode**:
+  Indicates a traversal for k Shortest Paths (`K_SHORTEST_PATHS` in AQL).
+
+- **KPathsNode**:
+  Indicates a traversal for k Paths (`K_PATHS` in AQL).
+
+- **LimitNode**:
+  Limits the number of results passed to other processing steps. Appears
+  once per `LIMIT` statement.
+
+- **MaterializeNode**:
+  The presence of this node means that the query is not fully covered by
+  indexes and therefore needs to involve the storage engine.
+
+- **RemoveNode**:
+  Removes documents from a collection (given in its *collection* attribute).
+  Appears exactly once in a query that contains a `REMOVE` statement.
+
+- **ReplaceNode**:
+  Replaces documents in a collection (given in its *collection* attribute).
+  Appears exactly once in a query that contains a `REPLACE` statement.
+
+- **ReturnNode**:
+  Returns data to the caller. Appears in each read-only query at
+  least once. Subqueries also contain `ReturnNode`s.
+
+- **SingletonNode**:
+  The purpose of a `SingletonNode` is to produce an empty document that is
+  used as input for other processing steps. Each execution plan contains
+  exactly one `SingletonNode` as its top node.
+
+- **ShortestPathNode**:
+  Indicates a traversal for a Shortest Path (`SHORTEST_PATH` in AQL).
+
+- **SortNode**:
+  Performs a sort of its input values.
+
+- **SubqueryEndNode**:
+  End of a spliced (inlined) subquery.
+
+- **SubqueryNode**:
+  Executes a subquery.
+
+- **SubqueryStartNode**:
+  Beginning of a spliced (inlined) subquery.
+
+- **TraversalNode**:
+  Indicates a regular graph traversal, as opposed to a shortest path(s)
+  traversal.
+
+- **UpdateNode**:
+  Updates documents in a collection (given in its *collection* attribute).
+  Appears exactly once in a query that contains an `UPDATE` statement.
+
+- **UpsertNode**:
+  Upserts documents in a collection (given in its *collection* attribute).
+  Appears exactly once in a query that contains an `UPSERT` statement.
+
+### List of cluster execution nodes
+
+For queries in the cluster, the following additional nodes may appear in
+execution plans:
+
+- **DistributeNode**:
+  Used on a Coordinator to fan out data to one or multiple shards,
+  taking into account a collection's shard key.
+
+- **GatherNode**:
+  Used on a Coordinator to aggregate results from one or many shards
+  into a combined stream of results. Parallelizes work for certain types
+  of queries when there are multiple DB-Servers involved
+  (shown as `GATHER /* parallel */` in query explain).
+
+- **RemoteNode**:
+  A `RemoteNode` performs communication with other ArangoDB instances
+  in the cluster. For example, the cluster Coordinator needs to communicate
+  with other servers to fetch the actual data from the shards. It does so
+  via `RemoteNode`s. The DB-Servers themselves might again pull further data
+  from the Coordinator, and thus might also employ `RemoteNode`s. So, all of
+  the above cluster-relevant nodes are accompanied by a `RemoteNode`.
+
+- **ScatterNode**:
+  Used on a Coordinator to fan out data to one or multiple shards.
+
+- **SingleRemoteOperationNode**:
+  Used on a Coordinator to directly work with a single
+  document on a DB-Server that is referenced by its `_key`.
+
+- **MultipleRemoteExecutionNode**:
+  Used to optimize bulk `INSERT` operations in cluster deployments, reducing the
+  setup and shutdown overhead and the number of internal network requests.
+
+## Optimizer rules
+
+### List of optimizer rules
+
+The following user-facing optimizer rules exist and are enabled by default
+unless noted otherwise. You can
+[enable and disable optimizer rules](#turning-specific-optimizer-rules-off)
+except for a few required rules.
+
+Some rules exist multiple times with number suffixes like `-2`
+(e.g. `remove-unnecessary-calculations-2`). This is due to the same rule being
+applied multiple times at different optimization stages.
+
+{{% comment %}} Execute code but exclude its output from rendering
+
+```js
+---
+name: 00_dumpOptimizerRules
+description: ''
+type: cluster
+---
+var url = "/_api/query/rules";
+var rules = internal.arango.GET(url);
+assert(Array.isArray(rules));
+assert(rules.some(e => e.flags && e.flags.clusterOnly));
+var outfile = "Documentation/optimizer-rules.json";
+assert(fs.write(outfile, JSON.stringify(rules, undefined, 2)));
+```
+
+{{% /comment %}}
+
+{{% optimizer-rules %}}
+
+### Additional optimizations applied
+
+#### Scan-Only Optimization
+
+If a query iterates over a collection (for filtering or counting) but does not need
+the actual document values later, the optimizer can apply a "scan-only" optimization
+for `EnumerateCollectionNode` and `IndexNode` node types. In this case, it does not build up
+a result with the document data at all, which may reduce work significantly.
+If the document data is actually not needed later on, it may be sensible to remove
+it from query strings so the optimizer can apply the optimization.
+
+If the optimization is applied, it shows up as `scan only` in an AQL
+query's execution plan for an `EnumerateCollectionNode` or an `IndexNode`.
+
+#### Index-Only Optimization
+
+The optimizer can apply an "index-only" optimization for AQL queries that
+can satisfy the retrieval of all required document attributes directly from an index.
+
+This optimization is triggered if an index is used
+that covers all required attributes of the document used later on in the query.
+If applied, it saves retrieving the actual document data (which would require
+an extra lookup by the storage engine), but instead builds the document data solely
+from the index values found. It only applies when using up to 5 (or
+[`maxProjections`](../high-level-operations/for.md#maxprojections)) attributes
+from the document, and only if the rest of the document data is not used later
+on in the query.
+
+The optimization is available for the following index types: `primary`,
+`edge`, and `persistent`.
+
+If the optimization is applied, it shows up as `index only` in an AQL
+query's execution plan for an `IndexNode`.
+
+#### Filter Projections Optimizations
+
+Introduced: v3.10.0
+
+If an index is used that does not cover all required attributes for the query,
+but it is followed by filter conditions that only access attributes that are
+part of the index, then an optimization is applied to only fetch matching
+documents. "Part of the index" here means that all attributes referred to in
+the post-filter conditions are contained in the `fields` or `storedValues`
+attributes of the index definition.
+
+For example, the optimization is applied in the following case:
+- There is a persistent index on the attributes `[ "value1", "value2" ]`
+  (in this order), or there is a persistent index on just `["value1"]`
+  with a `storedValues` definition of `["value2"]`.
+- There is a filter condition on `value1` that can use the index, and a filter
+  condition on `value2` that cannot use the index (post-filter condition).
+
+Example query:
+
+```aql
+FOR doc IN collection
+  FILTER doc.value1 == @value1   /* uses the index */
+  FILTER ABS(doc.value2) != @value2   /* does not use the index */
+  RETURN doc
+```
+
+This query's execution plan looks as follows:
+
+```aql
+Execution plan:
+ Id   NodeType        Est.   Comment
+  1   SingletonNode      1   * ROOT
+  8   IndexNode          0     - FOR doc IN collection   /* persistent index scan (filter projections: `value2`) */   FILTER (ABS(doc.`value2`) != 2)   /* early pruning */
+  7   ReturnNode         0     - RETURN doc
+
+Indexes used:
+ By   Name                      Type         Collection   Unique   Sparse   Cache   Selectivity   Fields                   Ranges
+  8   idx_1737498319258648576   persistent   collection   false    false    false       99.96 %   [ `value1`, `value2` ]   (doc.`value1` == 1)
+```
+
+The first filter condition is transformed to an index lookup, as you can tell
+from the `persistent index scan` comment and the `Indexes used` section that
+shows the range `` doc.`value1` == 1 ``. The post-filter condition
+`FILTER ABS(doc.value2) != 2` can be recognized as such by the `early pruning`
+comment that follows it.
+
+The `filter projections` mentioned in the above execution plan indicate that
+the optimization is triggered.
+
+Instead of fetching the full documents from the storage engine for all index
+entries that match the index lookup condition, only those that also satisfy the
+post-filter condition are fetched.
+If the post-filter condition filters out a lot of documents, this optimization
+can significantly speed up queries that produce large result sets from index
+lookups but filter many of the documents away with post-filter conditions.
+
+Note that the optimization can also be combined with regular projections, e.g.
+for the following query that returns a specific attribute from the documents
+only:
+
+```aql
+FOR doc IN collection
+  FILTER doc.value1 == @value1   /* uses the index */
+  FILTER ABS(doc.value2) != @value2   /* does not use the index */
+  RETURN doc.value3
+```
+
+That query's execution plan combines projections from the index for the
+post-filter condition (`filter projections`) as well as regular projections
+(`projections`) for the processing parts of the query that follow the
+post-filter condition:
+
+```aql
+Execution plan:
+ Id   NodeType          Est.   Comment
+  1   SingletonNode        1   * ROOT
+  9   IndexNode         5000     - FOR doc IN collection   /* persistent index scan (filter projections: `value2`) (projections: `value3`) */   FILTER (ABS(doc.`value2`) != 2)   /* early pruning */
+  7   CalculationNode   5000     - LET #5 = doc.`value3`   /* attribute expression */   /* collections used: doc : collection */
+  8   ReturnNode        5000     - RETURN #5
+
+Indexes used:
+ By   Name                      Type         Collection   Unique   Sparse   Cache   Selectivity   Fields                   Ranges
+  9   idx_1737498319258648576   persistent   collection   false    false    false       99.96 %   [ `value1`, `value2` ]   (doc.`value1` == 1)
+```
+
+The optimization is most effective for queries in which many documents would
+be selected by the index lookup condition, but many are filtered out by the
+post-filter condition.
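+
+To experiment with this optimization yourself, an index like one of the
+following would be needed (a sketch, using the hypothetical collection and
+attribute names from the example above):
+
+```js
+// Persistent index over both attributes:
+db.collection.ensureIndex({ type: "persistent", fields: ["value1", "value2"] });
+
+// Alternatively, index only value1 and additionally store value2 in the index
+// so that it is available for filter projections:
+db.collection.ensureIndex({ type: "persistent", fields: ["value1"], storedValues: ["value2"] });
+```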
diff --git a/site/content/arangodb/oem/aql/execution-and-performance/query-profiling.md b/site/content/arangodb/oem/aql/execution-and-performance/query-profiling.md
new file mode 100644
index 0000000000..2f28da6760
--- /dev/null
+++ b/site/content/arangodb/oem/aql/execution-and-performance/query-profiling.md
@@ -0,0 +1,229 @@
+---
+title: Profiling and Hand-Optimizing AQL queries
+menuTitle: Query Profiling
+weight: 20
+description: >-
+  For understanding the performance of specific queries, you can profile them to
+  identify slow parts of query execution plans
+---
+ArangoDB allows you to execute your query with special instrumentation code enabled.
+This provides a query plan with detailed execution statistics.
+
+To use this in an interactive fashion on the shell, you can use
+`db._profileQuery(..)` in _arangosh_. Alternatively, there is a _Profile_
+button in the Query tab of the web interface.
+
+The printed execution plan then contains four additional columns:
+
+- **Call**: The number of times this query stage was executed
+- **Items**: The number of temporary result rows (outputs) at this stage
+- **Filtered**: The number of rows filtered away by this stage
+- **Runtime**: The total time spent in this stage
+
+Below the execution plan, there are additional sections for the overall runtime
+statistics and the query profile.
+
+## Example: Simple AQL query
+
+Assuming we have a collection named `acollection` and insert 10000 documents
+via `for (let i=0; i < 10000;i++) db.acollection.insert({value:i})`,
+a simple query filtering for `value < 10` returns 10 results:
+
+```js
+---
+name: 01_workWithAQL_profileQuerySimple
+description: ''
+---
+~db._drop("acollection");
+~db._create('acollection');
+~for (let i=0; i < 10000; i++) { db.acollection.insert({value:i}); }
+db._profileQuery(`
+  FOR doc IN acollection
+    FILTER doc.value < 10
+    RETURN doc`, {}, {colors: false}
+);
+~db._drop("acollection");
+```
+
+An AQL query is essentially executed in a pipeline that chains together different
+functional execution blocks. Each block gets the input rows from the parent above
+it, does some processing, and then outputs a certain number of output rows.
+
+Without any detailed insight into the query execution, it is impossible to tell
+how many results each pipeline block had to work on and how long this took.
+By executing the query with the query profiler (`db._profileQuery()` or via
+the _Profile_ button in the web interface), you can check exactly how much work
+each stage had to do.
+
+Without any indexes, this query has to perform the following operations:
+
+1. Perform a full collection scan via an _EnumerateCollectionNode_ and output
+   a row containing the document in `doc`.
+2. Calculate the boolean expression `LET #1 = doc.value < 10` from all inputs
+   via a _CalculationNode_.
+3. Filter out all input rows where `#1` is false via the _FilterNode_.
+4. Put the `doc` variable of the remaining rows into the result set via
+   the _ReturnNode_.
+
+The _EnumerateCollectionNode_ processed and returned all 10k rows (documents),
+as did the _CalculationNode_. Because the AQL execution engine also uses an
+internal batch size of 1000, these blocks were also called about 10 times each.
+The _FilterNode_ as well as the _ReturnNode_ however only ever returned 10 rows
+and only had to be called once, because the result size fits within a single batch.
+
+Let us add a persistent index on `value` to speed up the query:
+
+```js
+db.acollection.ensureIndex({type:"persistent", fields:["value"]});
+```
+
+```js
+---
+name: 02_workWithAQL_profileQuerySimpleIndex
+description: ''
+---
+~db._create('acollection');
+~db.acollection.ensureIndex({type:"persistent", fields:["value"]});
+~for (let i=0; i < 10000; i++) { db.acollection.insert({value:i}); }
+db._profileQuery(`
+  FOR doc IN acollection
+    FILTER doc.value < 10
+    RETURN doc`, {}, {colors: false}
+);
+~db._drop("acollection");
+```
+
+This results in replacing the collection scan and filter block with an
+`IndexNode`. The execution pipeline of the AQL query has become much shorter.
+Also, the number of rows processed by each pipeline block is only 10, because
+we no longer need to look at all documents.
+
+## Example: AQL with Subquery
+
+Let us consider a query containing a subquery:
+
+```js
+---
+name: 03_workWithAQL_profileQuerySubquery
+description: ''
+---
+~db._create('acollection');
+~db.acollection.ensureIndex({type:"persistent", fields:["value"]});
+~for (let i=0; i < 10000;i++) { db.acollection.insert({value:i}); }
+db._profileQuery(`
+  LET list = (FOR doc in acollection FILTER doc.value > 90 RETURN doc)
+  FOR a IN list
+    FILTER a.value < 91
+    RETURN a`, {}, {colors: false, optimizer:{rules:["-all"]}}
+);
+~db._drop("acollection");
+```
+
+The resulting query profile contains a _SubqueryNode_ which has the runtime of
+all its children combined.
+
+Actually, we cheated a little. The optimizer would have completely removed the
+subquery if the optimizer rules had not been disabled (`rules:["-all"]`). The
+optimized version would take longer in the "optimizing plan" stage, but should
+perform better with a lot of results.
+
+## Example: AQL with Aggregation
+
+Let us try a more advanced query, using a [COLLECT](../high-level-operations/collect.md)
+statement. Assume we have a user collection with each document having a city,
+a username, and an age attribute.
+
+The following query gets us all age groups in buckets (0-9, 10-19, 20-29, ...):
+
+```js
+---
+name: 04_workWithAQL_profileQueryAggregation
+description: ''
+---
+~db._create('myusers');
+~["berlin", "paris", "cologne", "munich", "london"].forEach((c) => { ["peter", "david", "simon", "lars"].forEach( n => db.myusers.insert({ city : c, name : n, age: Math.floor(Math.random() * 75) }) ) });
+db._profileQuery(`
+  FOR u IN myusers
+    COLLECT ageGroup = FLOOR(u.age / 10) * 10
+    AGGREGATE minAge = MIN(u.age), maxAge = MAX(u.age), len = LENGTH(u)
+    RETURN {
+      ageGroup,
+      minAge,
+      maxAge,
+      len
+    }`, {}, {colors: false}
+);
+~db._drop("myusers")
+```
+
+Without any indexes, this query has to perform the following operations:
+
+1. Perform a full collection scan via an _EnumerateCollectionNode_ and output
+   a row containing the document in `u`.
+2. Compute the expression `LET #1 = FLOOR(u.age / 10) * 10` for all inputs via
+   a _CalculationNode_.
+3. Perform the aggregations via the _CollectNode_.
+4. Sort the resulting aggregated rows via a _SortNode_.
+5. Build a result value via another _CalculationNode_.
+6. Put the result variable into the result set via the _ReturnNode_.
+
+As in the example above, you can see that after the _CollectNode_ stage, only a
+handful of the original 20 rows remain.
+
+## Typical AQL Performance Mistakes
+
+With the query profiler, you should be able to spot typical performance
+mistakes that we see quite often:
+
+- Not employing indexes to speed up queries with common filter expressions
+- Not using shard keys in filter statements when they are known
+  (a cluster-only problem)
+- Using subqueries to calculate an intermediary result, but only using a
+  few of the results
+
+Bad example:
+
+```aql
+LET vertices = (
+  FOR v IN 1..2 ANY @startVertex GRAPH 'my_graph'
+    // <-- add a LIMIT 1 here
+    RETURN v
+)
+FOR doc IN collection
+  FILTER doc.value == vertices[0].value
+  RETURN doc
+```
+
+Adding a `LIMIT 1` into the subquery should result in better performance,
+because the traversal can be stopped after the first result instead of
+computing all paths.
+
+Another mistake is to start a graph traversal from the wrong side
+(if both ends are known).
+
+Assume we have two vertex collections _users_ and _products_ as well as an
+edge collection _purchased_. The graph model looks like this:
+`(users) <--[purchased]--> (products)`, i.e. every user is connected with an
+edge in _purchased_ to zero or more _products_.
+
+If we want to know all users that have purchased the product _playstation_
+as well as products of `type` _legwarmer_, we could use this query:
+
+```aql
+FOR prod IN products
+  FILTER prod.type == 'legwarmer'
+  FOR v,e,p IN 2..2 OUTBOUND prod purchased
+    FILTER v._key == 'playstation' // <-- last vertex of the path
+    RETURN p.vertices[1] // <-- the user
+```
+
+This query first finds all legwarmer products and then performs a traversal
+for each of them. But we could also invert the traversal by starting off with
+the known _playstation_ product. This way we only need a single traversal
+to achieve the same result:
+
+```aql
+FOR v,e,p IN 2..2 OUTBOUND 'products/playstation' purchased
+  FILTER v.type == 'legwarmer' // <-- last vertex of the path
+  RETURN p.vertices[1] // <-- the user
+```
diff --git a/site/content/arangodb/oem/aql/execution-and-performance/query-statistics.md b/site/content/arangodb/oem/aql/execution-and-performance/query-statistics.md
new file mode 100644
index 0000000000..907a29dc30
--- /dev/null
+++ b/site/content/arangodb/oem/aql/execution-and-performance/query-statistics.md
@@ -0,0 +1,98 @@
+---
+title: AQL query statistics
+menuTitle: Query statistics
+weight: 5
+description: >-
+  All queries that have successfully run to completion return statistics about
+  the execution
+---
+Execution statistics can be retrieved by calling `getExtra()` on the cursor.
+The statistics are returned in the return value's `stats` attribute:
+
+```js
+---
+name: 06_workWithAQL_statementsExtra
+description: ''
+---
+db._query(`
+  FOR i IN 1..@count
+    INSERT { _key: CONCAT('anothertest', TO_STRING(i)) } INTO mycollection`,
+  { count: 100 },
+  {},
+  { fullCount: true }
+).getExtra();
+
+db._query({
+  "query": `
+    FOR i IN 200..@count
+      INSERT { _key: CONCAT('anothertest', TO_STRING(i)) } INTO mycollection`,
+  "bindVars": { count: 300 },
+  "options": { fullCount: true }
+}).getExtra();
+```
+
+The meaning of the statistics attributes is as follows:
+
+- **writesExecuted**: The total number of data-modification operations successfully executed.
+  This is equivalent to the number of documents created, updated, or removed by `INSERT`,
+  `UPDATE`, `REPLACE`, `REMOVE`, or `UPSERT` operations.
+- **writesIgnored**: The total number of data-modification operations that were unsuccessful,
+  but have been ignored because of the `ignoreErrors` query option.
+- **scannedFull**: The total number of documents iterated over when scanning a collection
+  without an index. Documents scanned by subqueries are included in the result, but
+  operations triggered by built-in or user-defined AQL functions are not.
+- **scannedIndex**: The total number of documents iterated over when scanning a collection using
+  an index. Documents scanned by subqueries are included in the result, but operations
+  triggered by built-in or user-defined AQL functions are not.
+- **cursorsCreated**: The total number of cursor objects created during query execution. Cursor
+  objects are created for index lookups.
+- **cursorsRearmed**: The total number of times an existing cursor object was repurposed.
+  Repurposing an existing cursor object is normally more efficient compared to destroying an
+  existing cursor object and creating a new one from scratch.
+- **cacheHits**: The total number of index entries read from in-memory caches for indexes
+  of type edge or persistent. This value is only non-zero when reading from indexes
+  that have an in-memory cache enabled, and when the query allows using the in-memory
+  cache (i.e. using equality lookups on all index attributes).
+- **cacheMisses**: The total number of cache read attempts for index entries that could not
+  be served from in-memory caches for indexes of type edge or persistent. This value
+  is only non-zero when reading from indexes that have an in-memory cache enabled, the
+  query allows using the in-memory cache (i.e. using equality lookups on all index attributes),
+  and the looked-up values are not present in the cache.
+- **filtered**: The total number of documents removed after executing a filter condition
+  in a `FilterNode` or another node that post-filters data. Note that nodes of the
+  `IndexNode` type can also filter documents by selecting only the required index range
+  from a collection, and the `filtered` value only indicates how much filtering was done by a
+  post-filter in the `IndexNode` itself or following `FilterNode` nodes.
+  Nodes of the `EnumerateCollectionNode` and `TraversalNode` types can also apply
+  filter conditions and can report the number of filtered documents.
+- **httpRequests**: The total number of cluster-internal HTTP requests performed.
+- **fullCount** (_optional_): The total number of documents that matched the search condition if the query's
+  final top-level `LIMIT` operation were not present.
+  This attribute may only be returned if the `fullCount` option was set when starting the
+  query and only contains a sensible value if the query contains a `LIMIT` operation on
+  the top level.
+- **executionTime**: The query execution time (wall-clock time) in seconds.
+- **peakMemoryUsage**: The maximum memory usage of the query while it was running. In a cluster,
+  the memory accounting is done per shard, and the memory usage reported is the peak
+  memory usage value from the individual shards.
+  Note that to keep things lightweight, the per-query memory usage is tracked on a relatively
+  high level, not including any memory allocator overhead nor any memory used for temporary
+  result calculations (e.g. memory allocated/deallocated inside AQL expressions and function
+  calls).
+- **intermediateCommits**:
+  The total number of intermediate commits the query has performed. This number
+  can only be greater than zero for data-modification queries that perform
+  modifications beyond the `--rocksdb.intermediate-commit-count` or
+  `--rocksdb.intermediate-commit-size` thresholds. In a cluster, the
+  intermediate commits are tracked per DB-Server that participates in the query
+  and are summed up in the end.
+- **nodes** (_optional_): When the query is executed with the option `profile` set to at least `2`,
+  this value contains runtime statistics per query execution node.
+  For a human-readable output, you can execute `db._profileQuery(<query>, <bind-vars>)`
+  in the arangosh.
+  - **id**: The execution node ID to correlate the statistics with the `plan` returned in
+    the `extra` attribute.
+  - **calls**: The number of calls to this node.
+  - **items**: The number of items returned by this node. Items are the temporary results
+    returned at this stage.
+  - **runtime**: The execution time of this node in seconds.
diff --git a/site/content/arangodb/oem/aql/functions/_index.md b/site/content/arangodb/oem/aql/functions/_index.md
new file mode 100644
index 0000000000..b38aa556de
--- /dev/null
+++ b/site/content/arangodb/oem/aql/functions/_index.md
@@ -0,0 +1,37 @@
+---
+title: AQL functions
+menuTitle: Functions
+weight: 30
+description: >-
+  AQL offers an extensive set of functions to allow for complex computations
+  and it supports user-defined functions
+---
+Functions can be called at any query position where an expression is allowed.
+The general function call syntax is:
+
+```aql
+FUNCTIONNAME(arguments)
+```
+
+`FUNCTIONNAME` is the name of the function to be called, and `arguments`
+is a comma-separated list of function arguments. If a function does not need any
+arguments, the argument list can be left empty. However, even if the argument
+list is empty, the parentheses around it are still mandatory to make function
+calls distinguishable from variable names.
+
+Some example function calls:
+
+```aql
+HAS(user, "name")
+LENGTH(friends)
+COLLECTIONS()
+```
+
+In contrast to collection and variable names, function names are case-insensitive,
+i.e. `LENGTH(foo)` and `length(foo)` are equivalent.
+
+## Extending AQL
+
+It is possible to extend AQL with user-defined functions. These functions need to
+be written in JavaScript and have to be registered before they can be used in a query.
+Please refer to [Extending AQL](../user-defined-functions.md) for more details.
diff --git a/site/content/arangodb/oem/aql/functions/arangosearch.md b/site/content/arangodb/oem/aql/functions/arangosearch.md
new file mode 100644
index 0000000000..2672ed25dd
--- /dev/null
+++ b/site/content/arangodb/oem/aql/functions/arangosearch.md
@@ -0,0 +1,1361 @@
+---
+title: ArangoSearch functions in AQL
+menuTitle: ArangoSearch
+weight: 5
+description: >-
+  ArangoSearch offers various AQL functions for search queries to control the
+  search context, for filtering and scoring
+pageToc:
+  maxHeadlineLevel: 3
+---
+You can form search expressions by composing ArangoSearch function calls,
+logical operators, and comparison operators. This allows you to filter Views
+as well as to utilize inverted indexes to filter collections.
+
+The AQL [`SEARCH` operation](../high-level-operations/search.md) accepts search expressions,
+such as `PHRASE(doc.text, "foo bar", "text_en")`, for querying Views. You can
+combine ArangoSearch filter and context functions as well as operators like
+`AND` and `OR` to form complex search conditions. Similarly, the
+[`FILTER` operation](../high-level-operations/filter.md) accepts such search expressions
+when using [inverted indexes](../../index-and-search/indexing/working-with-indexes/inverted-indexes.md).
+
+Scoring functions allow you to rank matches and to sort results by relevance.
+They are limited to Views.
+
+Search highlighting functions let you retrieve the string positions of matches.
+They are limited to Views.
+
+You can also use most functions without an inverted index or a View and the
+`SEARCH` keyword, but then they are not accelerated by an index.
+
+See [Information Retrieval with ArangoSearch](../../index-and-search/arangosearch/_index.md) for an
+introduction.
+
+## Context Functions
+
+### ANALYZER()
+
+`ANALYZER(expr, analyzer) → retVal`
+
+Sets the Analyzer for the given search expression.
+
+{{< info >}}
+The `ANALYZER()` function is only applicable for queries against `arangosearch` Views.
+
+In queries against `search-alias` Views and inverted indexes, you don't need to
+specify Analyzers because every field can be indexed with a single Analyzer only
+and they are inferred from the index definition.
+{{< /info >}}
+
+The default Analyzer is `identity` for any search expression that is used for
+filtering `arangosearch` Views. This utility function can be used
+to wrap a complex expression to set a particular Analyzer. It also sets it for
+all the nested functions which require such an argument to avoid repeating the
+Analyzer parameter. If an Analyzer argument is passed to a nested function
+regardless, then it takes precedence over the Analyzer set via `ANALYZER()`.
+
+The `TOKENS()` function is an exception. It requires the Analyzer name to be
+passed in all cases, even if wrapped in an `ANALYZER()` call, because it is
+not an ArangoSearch function but a regular string function which can be used
+outside of `SEARCH` operations.
+
+- **expr** (expression): any valid search expression
+- **analyzer** (string): name of an [Analyzer](../../index-and-search/analyzers.md)
+- returns **retVal** (any): the expression result that it wraps
+
+#### Example: Using a custom Analyzer
+
+Assuming a View definition with an Analyzer whose name and type is `delimiter`:
+
+```json
+{
+  "links": {
+    "coll": {
+      "analyzers": [ "delimiter" ],
+      "includeAllFields": true,
+    }
+  },
+  ...
+}
+```
+
+… with the Analyzer properties `{ "delimiter": "|" }` and an example document
+`{ "text": "foo|bar|baz" }` in the collection `coll`, the following query would
+return the document:
+
+```aql
+FOR doc IN viewName
+  SEARCH ANALYZER(doc.text == "bar", "delimiter")
+  RETURN doc
+```
+
+The expression `doc.text == "bar"` has to be wrapped by `ANALYZER()` in order
+to set the Analyzer to `delimiter`. Otherwise, the expression would be evaluated
+with the default `identity` Analyzer. `"foo|bar|baz" == "bar"` would not match,
+but the View does not even process the indexed fields with the `identity`
+Analyzer. The following query would also return an empty result because of
+the Analyzer mismatch:
+
+```aql
+FOR doc IN viewName
+  SEARCH doc.text == "foo|bar|baz"
+  //SEARCH ANALYZER(doc.text == "foo|bar|baz", "identity")
+  RETURN doc
+```
+
+#### Example: Setting the Analyzer context with and without `ANALYZER()`
+
+In the query below, the search expression is wrapped by `ANALYZER()` to set the
+`text_en` Analyzer for both `PHRASE()` functions:
+
+```aql
+FOR doc IN viewName
+  SEARCH ANALYZER(PHRASE(doc.text, "foo") OR PHRASE(doc.text, "bar"), "text_en")
+  RETURN doc
+```
+
+Without the usage of `ANALYZER()`:
+
+```aql
+FOR doc IN viewName
+  SEARCH PHRASE(doc.text, "foo", "text_en") OR PHRASE(doc.text, "bar", "text_en")
+  RETURN doc
+```
+
+#### Example: Analyzer precedence and specifics of the `TOKENS()` function
+
+In the following example, `ANALYZER()` is used to set the Analyzer `text_en`,
+but in the second call to `PHRASE()` a different Analyzer is set (`identity`)
+which overrules `ANALYZER()`. Therefore, the `text_en` Analyzer is used to find
+the phrase *foo* and the `identity` Analyzer to find *bar*:
+
+```aql
+FOR doc IN viewName
+  SEARCH ANALYZER(PHRASE(doc.text, "foo") OR PHRASE(doc.text, "bar", "identity"), "text_en")
+  RETURN doc
+```
+
+Despite the wrapping `ANALYZER()` function, the Analyzer name cannot be
+omitted in calls to the `TOKENS()` function. Both occurrences of `text_en`
+are required: one sets the Analyzer for the expression `doc.text IN ...`, the
+other for the `TOKENS()` function itself. This is because the `TOKENS()` function
+is a regular string function that does not take the Analyzer context into
+account:
+
+```aql
+FOR doc IN viewName
+  SEARCH ANALYZER(doc.text IN TOKENS("foo", "text_en"), "text_en")
+  RETURN doc
+```
+
+### BOOST()
+
+`BOOST(expr, boost) → retVal`
+
+Overrides the boost value in the context of a search expression with the
+specified value, making it available for scorer functions. By default, the
+context has a boost value equal to `1.0`.
+
+- **expr** (expression): any valid search expression
+- **boost** (number): numeric boost value
+- returns **retVal** (any): the expression result that it wraps
+
+#### Example: Boosting a search sub-expression
+
+```aql
+FOR doc IN viewName
+  SEARCH ANALYZER(BOOST(doc.text == "foo", 2.5) OR doc.text == "bar", "text_en")
+  LET score = BM25(doc)
+  SORT score DESC
+  RETURN { text: doc.text, score }
+```
+
+Assuming a View with the following documents indexed and processed by the
+`text_en` Analyzer:
+
+```js
+{ "text": "foo bar" }
+{ "text": "foo" }
+{ "text": "bar" }
+{ "text": "foo baz" }
+{ "text": "baz" }
+```
+
+… the result of the above query would be:
+
+```json
+[
+  {
+    "text": "foo bar",
+    "score": 2.787301540374756
+  },
+  {
+    "text": "foo baz",
+    "score": 1.6895781755447388
+  },
+  {
+    "text": "foo",
+    "score": 1.525835633277893
+  },
+  {
+    "text": "bar",
+    "score": 0.9913395643234253
+  }
+]
+```
+
+## Filter Functions
+
+### EXISTS()
+
+{{< info >}}
+If you use `arangosearch` Views, the `EXISTS()` function only matches values if
+you set the **storeValues** link property to `"id"` in the View definition
+(the default is `"none"`).
+{{< /info >}}
+
+#### Testing for attribute presence
+
+`EXISTS(path)`
+
+Match documents where the attribute at `path` is present.
+
+- **path** (attribute path expression): the attribute to test in the document
+- returns nothing: the function evaluates to a boolean, but this value cannot be
+  returned. The function can only be called in a search expression. It throws
+  an error if used outside of a [`SEARCH` operation](../high-level-operations/search.md) or
+  a `FILTER` operation that uses an inverted index.
+
+```aql
+FOR doc IN viewName
+  SEARCH EXISTS(doc.text)
+  RETURN doc
+```
+
+#### Testing for attribute type
+
+`EXISTS(path, type)`
+
+Match documents where the attribute at `path` is present _and_ is of the
+specified data type.
+
+- **path** (attribute path expression): the attribute to test in the document
+- **type** (string): data type to test for, can be one of:
+  - `"null"`
+  - `"bool"` / `"boolean"`
+  - `"numeric"`
+  - `"type"` (matches `null`, `boolean`, and `numeric` values)
+  - `"string"`
+  - `"analyzer"` (see below)
+- returns nothing: the function evaluates to a boolean, but this value cannot be
+  returned. The function can only be called in a search expression. It throws
+  an error if used outside of a [`SEARCH` operation](../high-level-operations/search.md) or
+  a `FILTER` operation that uses an inverted index.
+
+```aql
+FOR doc IN viewName
+  SEARCH EXISTS(doc.text, "string")
+  RETURN doc
+```
+
+#### Testing for Analyzer index status
+
+`EXISTS(path, "analyzer", analyzer)`
+
+Match documents where the attribute at `path` is present _and_ was indexed
+by the specified `analyzer`.
+
+- **path** (attribute path expression): the attribute to test in the document
+- **type** (string): string literal `"analyzer"`
+- **analyzer** (string, _optional_): name of an [Analyzer](../../index-and-search/analyzers.md).
+  Uses the Analyzer of a wrapping `ANALYZER()` call if not specified or
+  defaults to `"identity"`
+- returns nothing: the function evaluates to a boolean, but this value cannot be
+  returned. The function can only be called in a search expression. It throws
+  an error if used outside of a [`SEARCH` operation](../high-level-operations/search.md) or
+  a `FILTER` operation that uses an inverted index.
+
+```aql
+FOR doc IN viewName
+  SEARCH EXISTS(doc.text, "analyzer", "text_en")
+  RETURN doc
+```
+
+#### Testing for nested fields
+
+`EXISTS(path, "nested")`
+
+Match documents where the attribute at `path` is present _and_ is indexed
+as a nested field for [nested search with Views](../../index-and-search/arangosearch/nested-search.md)
+or [inverted indexes](../../index-and-search/indexing/working-with-indexes/inverted-indexes.md#nested-search-enterprise-edition).
+
+- **path** (attribute path expression): the attribute to test in the document
+- **type** (string): string literal `"nested"`
+- returns nothing: the function evaluates to a boolean, but this value cannot be
+  returned. The function can only be called in a search expression. It throws
+  an error if used outside of a [`SEARCH` operation](../high-level-operations/search.md) or
+  a `FILTER` operation that uses an inverted index.
+
+**Examples**
+
+Only return documents from the View `viewName` whose `text` attribute is indexed
+as a nested field:
+
+```aql
+FOR doc IN viewName
+  SEARCH EXISTS(doc.text, "nested")
+  RETURN doc
+```
+
+Only return documents whose `attr` attribute and its nested `text` attribute are
+indexed as nested fields:
+
+```aql
+FOR doc IN viewName
+  SEARCH doc.attr[? FILTER EXISTS(CURRENT.text, "nested")]
+  RETURN doc
+```
+
+Only return documents from the collection `coll` whose `text` attribute is indexed
+as a nested field by an inverted index:
+
+```aql
+FOR doc IN coll OPTIONS { indexHint: "inv-idx", forceIndexHint: true }
+  FILTER EXISTS(doc.text, "nested")
+  RETURN doc
+```
+
+Only return documents whose `attr` attribute and its nested `text` attribute are
+indexed as nested fields:
+
+```aql
+FOR doc IN coll OPTIONS { indexHint: "inv-idx", forceIndexHint: true }
+  FILTER doc.attr[? FILTER EXISTS(CURRENT.text, "nested")]
+  RETURN doc
+```
+
+### IN_RANGE()
+
+`IN_RANGE(path, low, high, includeLow, includeHigh) → included`
+
+Match documents where the attribute at `path` is greater than (or equal to)
+`low` and less than (or equal to) `high`.
+
+You can use `IN_RANGE()` for searching more efficiently compared to an equivalent
+expression that combines two comparisons with a logical conjunction:
+
+- `IN_RANGE(path, low, high, true, true)` instead of `low <= value AND value <= high`
+- `IN_RANGE(path, low, high, true, false)` instead of `low <= value AND value < high`
+- `IN_RANGE(path, low, high, false, true)` instead of `low < value AND value <= high`
+- `IN_RANGE(path, low, high, false, false)` instead of `low < value AND value < high`
+
+`low` and `high` can be numbers or strings (technically also `null`, `true`,
+and `false`), but the data type must be the same for both.
+
+{{< warning >}}
+The alphabetical order of characters is not taken into account by ArangoSearch,
+i.e. range queries in SEARCH operations against Views will not follow the
+language rules as per the defined Analyzer locale (except for the
+[`collation` Analyzer](../../index-and-search/analyzers.md#collation)) nor the server language
+(startup option `--default-language`)!
+Also see [Known Issues](../../release-notes/version-3.11/known-issues-in-3-11.md#arangosearch).
+{{< /warning >}}
+
+There is a corresponding [`IN_RANGE()` Miscellaneous Function](miscellaneous.md#in_range)
+that is used outside of `SEARCH` operations.
+
+- **path** (attribute path expression):
+  the path of the attribute to test in the document
+- **low** (number\|string): minimum value of the desired range
+- **high** (number\|string): maximum value of the desired range
+- **includeLow** (bool): whether the minimum value shall be included in
+  the range (left-closed interval) or not (left-open interval)
+- **includeHigh** (bool): whether the maximum value shall be included in
+  the range (right-closed interval) or not (right-open interval)
+- returns **included** (bool): whether the value is in the range
+
+If `low` and `high` are the same, but `includeLow` and/or `includeHigh` is set
+to `false`, then nothing will match. If `low` is greater than `high`, nothing
+will match either.
+
+#### Example: Using numeric ranges
+
+To match documents with the attribute `value >= 3` and `value <= 5` using the
+default `"identity"` Analyzer, you would write the following query:
+
+```aql
+FOR doc IN viewName
+  SEARCH IN_RANGE(doc.value, 3, 5, true, true)
+  RETURN doc.value
+```
+
+This will also match documents which have an array of numbers as the `value`
+attribute where at least one of the numbers is in the specified boundaries.
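+
+For instance, assuming documents like the following (hypothetical data), both
+would match, because `4` lies within the bounds in each case:
+
+```js
+{ "value": 4 }
+{ "value": [ 1, 4, 9 ] }
+```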
+ +#### Example: Using string ranges + +Using string boundaries and a text Analyzer allows to match documents which +have at least one token within the specified character range: + +```aql +FOR doc IN valView + SEARCH ANALYZER(IN_RANGE(doc.value, "a","f", true, false), "text_en") + RETURN doc +``` + +This will match `{ "value": "bar" }` and `{ "value": "foo bar" }` because the +_b_ of _bar_ is in the range (`"a" <= "b" < "f"`), but not `{ "value": "foo" }` +because the _f_ of _foo_ is excluded (`high` is "f" but `includeHigh` is false). + +### MIN_MATCH() + +`MIN_MATCH(expr1, ... exprN, minMatchCount) → fulfilled` + +Match documents where at least `minMatchCount` of the specified +search expressions are satisfied. + +There is a corresponding [`MIN_MATCH()` Miscellaneous function](miscellaneous.md#min_match) +that is used outside of `SEARCH` operations. + +- **expr** (expression, _repeatable_): any valid search expression +- **minMatchCount** (number): minimum number of search expressions that should + be satisfied +- returns **fulfilled** (bool): whether at least `minMatchCount` of the + specified expressions are `true` + +#### Example: Matching a subset of search sub-expressions + +Assuming a View with a text Analyzer, you may use it to match documents where +the attribute contains at least two out of three tokens: + +```aql +LET t = TOKENS("quick brown fox", "text_en") +FOR doc IN viewName + SEARCH ANALYZER(MIN_MATCH(doc.text == t[0], doc.text == t[1], doc.text == t[2], 2), "text_en") + RETURN doc.text +``` + +This will match `{ "text": "the quick brown fox" }` and `{ "text": "some brown fox" }`, +but not `{ "text": "snow fox" }` which only fulfills one of the conditions. + +Note that you can also use the `AT LEAST` [array comparison operator](../high-level-operations/search.md#array-comparison-operators) +in the specific case of matching a subset of tokens against a single attribute: + +```aql +FOR doc IN viewName + SEARCH ANALYZER(TOKENS("quick brown fox", "text_en") AT LEAST (2) == doc.text, "text_en") + RETURN doc.text +``` + +### MINHASH_MATCH() + +`MINHASH_MATCH(path, target, threshold, analyzer) → fulfilled` + +Match documents with an approximate Jaccard similarity of at least the +`threshold`, approximated with the specified `minhash` Analyzer. + +To only compute the MinHash signatures, see the +[`MINHASH()` Miscellaneous function](miscellaneous.md#minhash). + +- **path** (attribute path expression\|string): the path of the attribute in + a document or a string +- **target** (string): the string to hash with the specified Analyzer and to + compare against the stored attribute +- **threshold** (number, _optional_): a value between `0.0` and `1.0`. +- **analyzer** (string): the name of a [`minhash` Analyzer](../../index-and-search/analyzers.md#minhash). 
+- returns **fulfilled** (bool): `true` if the approximate Jaccard similarity + is greater than or equal to the specified threshold, `false` otherwise + +#### Example: Find documents with a text similar to a target text + +Assuming a View with a `minhash` Analyzer, you can use the stored +MinHash signature to find candidates for the more expensive Jaccard similarity +calculation: + +```aql +LET target = "the quick brown fox jumps over the lazy dog" +LET targetSignature = TOKENS(target, "myMinHash") + +FOR doc IN viewName + SEARCH MINHASH_MATCH(doc.text, target, 0.5, "myMinHash") // approximation + LET jaccard = JACCARD(targetSignature, TOKENS(doc.text, "myMinHash")) + FILTER jaccard > 0.75 + SORT jaccard DESC + RETURN doc.text +``` + +### NGRAM_MATCH() + +`NGRAM_MATCH(path, target, threshold, analyzer) → fulfilled` + +Match documents whose attribute value has an +[_n_-gram similarity](https://webdocs.cs.ualberta.ca/~kondrak/papers/spire05.pdf) +higher than the specified threshold compared to the target value. + +The similarity is calculated by counting how long the longest sequence of +matching _n_-grams is, divided by the target's total _n_-gram count. +Only fully matching _n_-grams are counted. + +The _n_-grams for both attribute and target are produced by the specified +Analyzer. Increasing the _n_-gram length will increase accuracy, but reduce +error tolerance. In most cases a size of 2 or 3 will be a good choice. + +Also see the String Functions +[`NGRAM_POSITIONAL_SIMILARITY()`](string.md#ngram_positional_similarity) +and [`NGRAM_SIMILARITY()`](string.md#ngram_similarity) +for calculating _n_-gram similarity that cannot be accelerated by a View index. + +- **path** (attribute path expression\|string): the path of the attribute in + a document or a string +- **target** (string): the string to compare against the stored attribute +- **threshold** (number, _optional_): a value between `0.0` and `1.0`. Defaults + to `0.7` if none is specified. +- **analyzer** (string): the name of an [Analyzer](../../index-and-search/analyzers.md). +- returns **fulfilled** (bool): `true` if the evaluated _n_-gram similarity value + is greater than or equal to the specified threshold, `false` otherwise + +{{< info >}} +Use an Analyzer of type `ngram` with `preserveOriginal: false` and `min` equal +to `max`. Otherwise, the similarity score calculated internally will be lower +than expected. + +The Analyzer must have the `"position"` and `"frequency"` features enabled or +the `NGRAM_MATCH()` function will not find anything. 
+{{< /info >}}
+
+#### Example: Using a custom bigram Analyzer
+
+Given a View indexing an attribute `text`, a custom _n_-gram Analyzer `"bigram"`
+(`min: 2, max: 2, preserveOriginal: false, streamType: "utf8"`) and a document
+`{ "text": "quick red fox" }`, the following query would match it (with a
+threshold of `1.0`):
+
+```aql
+FOR doc IN viewName
+  SEARCH NGRAM_MATCH(doc.text, "quick fox", "bigram")
+  RETURN doc.text
+```
+
+The following will also match (note the low threshold value):
+
+```aql
+FOR doc IN viewName
+  SEARCH NGRAM_MATCH(doc.text, "quick blue fox", 0.4, "bigram")
+  RETURN doc.text
+```
+
+The following will not match (note the high threshold value):
+
+```aql
+FOR doc IN viewName
+  SEARCH NGRAM_MATCH(doc.text, "quick blue fox", 0.9, "bigram")
+  RETURN doc.text
+```
+
+#### Example: Using constant values
+
+`NGRAM_MATCH()` can be called with constant arguments, but for such calls the
+`analyzer` argument is mandatory (even for calls inside of a `SEARCH` clause):
+
+```aql
+FOR doc IN viewName
+  SEARCH NGRAM_MATCH("quick fox", "quick blue fox", 0.9, "bigram")
+  RETURN doc.text
+```
+
+```aql
+RETURN NGRAM_MATCH("quick fox", "quick blue fox", "bigram")
+```
+
+### PHRASE()
+
+`PHRASE(path, phrasePart, analyzer)`
+
+`PHRASE(path, phrasePart1, skipTokens1, ... phrasePartN, skipTokensN, analyzer)`
+
+`PHRASE(path, [ phrasePart1, skipTokens1, ... phrasePartN, skipTokensN ], analyzer)`
+
+Search for a phrase in the referenced attribute. It only matches documents in
+which the tokens appear in the specified order. To search for tokens in any
+order, use [`TOKENS()`](string.md#tokens) instead.
+
+The phrase can be expressed as an arbitrary number of `phraseParts` separated by
+*skipTokens* number of tokens (wildcards), either as separate arguments or as
+an array as the second argument.
+
+- **path** (attribute path expression): the attribute to test in the document
+- **phrasePart** (string\|array\|object): text to search for in the tokens.
+  Can also be an [array](#example-using-phrase-with-an-array-of-tokens)
+  comprised of string, array and [object tokens](#object-tokens), or tokens
+  interleaved with numbers of `skipTokens`. The specified `analyzer` is applied
+  to string and array tokens, but not for object tokens.
+- **skipTokens** (number, _optional_): amount of tokens to treat
+  as wildcards
+- **analyzer** (string, _optional_): name of an [Analyzer](../../index-and-search/analyzers.md).
+  Uses the Analyzer of a wrapping `ANALYZER()` call if not specified or
+  defaults to `"identity"`
+- returns nothing: the function evaluates to a boolean, but this value cannot be
+  returned. The function can only be called in a search expression. It throws
+  an error if used outside of a [`SEARCH` operation](../high-level-operations/search.md) or
+  a `FILTER` operation that uses an inverted index.
+
+{{< info >}}
+The selected Analyzer must have the `"position"` and `"frequency"` features
+enabled. The `PHRASE()` function will otherwise not find anything.
+{{< /info >}}
+
+#### Object tokens
+
+- `{IN_RANGE: [low, high, includeLow, includeHigh]}`:
+  see [`IN_RANGE()`](#in_range). *low* and *high* can only be strings.
+- `{LEVENSHTEIN_MATCH: [token, maxDistance, transpositions, maxTerms, prefix]}`:
+  - `token` (string): a string to search
+  - `maxDistance` (number): maximum Levenshtein / Damerau-Levenshtein distance
+  - `transpositions` (bool, _optional_): if set to `false`, a Levenshtein
+    distance is computed, otherwise a Damerau-Levenshtein distance (default)
+  - `maxTerms` (number, _optional_): consider only a specified number of the
+    most relevant terms. One can pass `0` to consider all matched terms, but it may
+    impact performance negatively. The default value is `64`.
+  - `prefix` (string, _optional_): if defined, then a search for the exact
+    prefix is carried out, using the matches as candidates. The Levenshtein /
+    Damerau-Levenshtein distance is then computed for each candidate using the
+    remainders of the strings. This option can improve performance in cases where
+    there is a known common prefix. The default value is an empty string
+    (introduced in v3.7.13, v3.8.1).
+- `{STARTS_WITH: [prefix]}`: see [`STARTS_WITH()`](#starts_with).
+  Array brackets are optional
+- `{TERM: [token]}`: equal to `token` but without Analyzer tokenization.
+  Array brackets are optional
+- `{TERMS: [token1, ..., tokenN]}`: one of `token1, ..., tokenN` can be found
+  in the specified position. Inside an array, the object syntax can be replaced
+  with the object field value, e.g., `[..., [token1, ..., tokenN], ...]`.
+- `{WILDCARD: [token]}`: see [`LIKE()`](#like).
+  Array brackets are optional
+
+An array token inside an array can be used in the `TERMS` case only.
+
+Also see [Example: Using object tokens](#example-using-object-tokens).
+
+#### Example: Using a text Analyzer for a phrase search
+
+Given a View indexing an attribute `text` with the `"text_en"` Analyzer and a
+document `{ "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit" }`,
+the following query would match it:
+
+```aql
+FOR doc IN viewName
+  SEARCH PHRASE(doc.text, "lorem ipsum", "text_en")
+  RETURN doc.text
+```
+
+However, the following search expression does not match, because the tokens
+`"ipsum"` and `"lorem"` do not appear in this order:
+
+```aql
+PHRASE(doc.text, "ipsum lorem", "text_en")
+```
+
+#### Example: Skip tokens for a proximity search
+
+To match `"ipsum"` and `"amet"` with any two tokens in between, you can use the
+following search expression:
+
+```aql
+PHRASE(doc.text, "ipsum", 2, "amet", "text_en")
+```
+
+The `skipTokens` value of `2` defines how many wildcard tokens have to appear
+between *ipsum* and *amet*. A `skipTokens` value of `0` means that the tokens
+must be adjacent. Negative values are allowed, but not very useful. These three
+search expressions are equivalent:
+
+```aql
+PHRASE(doc.text, "lorem ipsum", "text_en")
+PHRASE(doc.text, "lorem", 0, "ipsum", "text_en")
+PHRASE(doc.text, "ipsum", -1, "lorem", "text_en")
+```
+
+#### Example: Using `PHRASE()` with an array of tokens
+
+The `PHRASE()` function also accepts an array as the second argument with
+`phrasePart` and `skipTokens` parameters as elements.
+
+```aql
+FOR doc IN myView SEARCH PHRASE(doc.title, ["quick brown fox"], "text_en") RETURN doc
+FOR doc IN myView SEARCH PHRASE(doc.title, ["quick", "brown", "fox"], "text_en") RETURN doc
+```
+
+This syntax variation enables the usage of computed expressions:
+
+```aql
+LET proximityCondition = [ "foo", ROUND(RAND()*10), "bar" ]
+FOR doc IN viewName
+  SEARCH PHRASE(doc.text, proximityCondition, "text_en")
+  RETURN doc
+```
+
+```aql
+LET tokens = TOKENS("quick brown fox", "text_en") // ["quick", "brown", "fox"]
+FOR doc IN myView SEARCH PHRASE(doc.title, tokens, "text_en") RETURN doc
+```
+
+The above example is equivalent to the more cumbersome and static form:
+
+```aql
+FOR doc IN myView SEARCH PHRASE(doc.title, "quick", 0, "brown", 0, "fox", "text_en") RETURN doc
+```
+
+You can optionally specify the number of `skipTokens` in the array form before
+every string element:
+
+```aql
+FOR doc IN myView SEARCH PHRASE(doc.title, ["quick", 1, "fox", "jumps"], "text_en") RETURN doc
+```
+
+It is the same as the following:
+
+```aql
+FOR doc IN myView SEARCH PHRASE(doc.title, "quick", 1, "fox", 0, "jumps", "text_en") RETURN doc
+```
+
+#### Example: Handling of arrays with no members
+
+Empty arrays are skipped:
+
+```aql
+FOR doc IN myView SEARCH PHRASE(doc.title, "quick", 1, [], 1, "jumps", "text_en") RETURN doc
+```
+
+The query is equivalent to:
+
+```aql
+FOR doc IN myView SEARCH PHRASE(doc.title, "quick", 2, "jumps", "text_en") RETURN doc
+```
+
+Providing only empty arrays is valid, but will yield no results.
+
+#### Example: Using object tokens
+
+Using object tokens `STARTS_WITH`, `WILDCARD`, `LEVENSHTEIN_MATCH`, `TERMS` and
+`IN_RANGE`:
+
+```aql
+FOR doc IN myView SEARCH PHRASE(doc.title,
+  {STARTS_WITH: ["qui"]}, 0,
+  {WILDCARD: ["b%o_n"]}, 0,
+  {LEVENSHTEIN_MATCH: ["foks", 2]}, 0,
+  {TERMS: ["jump", "run"]}, 0, // Analyzer not applied!
+  {IN_RANGE: ["over", "through", true, false]},
+  "text_en") RETURN doc
+```
+
+Note that the `text_en` Analyzer has stemming enabled, but for object tokens
+the Analyzer isn't applied. `{TERMS: ["jumps", "runs"]}` would not match the
+indexed (and stemmed!) attribute value. Therefore, the trailing `s` which would
+be stemmed away is removed from both words manually in the example.
+
+The above example is equivalent to:
+
+```aql
+FOR doc IN myView SEARCH PHRASE(doc.title,
+[
+  {STARTS_WITH: "qui"}, 0,
+  {WILDCARD: "b%o_n"}, 0,
+  {LEVENSHTEIN_MATCH: ["foks", 2]}, 0,
+  ["jumps", "runs"], 0, // Analyzer is applied using this syntax
+  {IN_RANGE: ["over", "through", true, false]}
+], "text_en") RETURN doc
+```
+
+### STARTS_WITH()
+
+`STARTS_WITH(path, prefix) → startsWith`
+
+Match the value of the attribute that starts with `prefix`. If the attribute
+is processed by a tokenizing Analyzer (type `"text"` or `"delimiter"`) or if it
+is an array, then a single token/element starting with the prefix is sufficient
+to match the document.
+
+{{< warning >}}
+The alphabetical order of characters is not taken into account by ArangoSearch,
+i.e. range queries in SEARCH operations against Views will not follow the
+language rules as per the defined Analyzer locale (except for the
+[`collation` Analyzer](../../index-and-search/analyzers.md#collation)) nor the server language
+(startup option `--default-language`)!
+Also see [Known Issues](../../release-notes/version-3.11/known-issues-in-3-11.md#arangosearch).
+{{< /warning >}}
+
+There is a corresponding [`STARTS_WITH()` String function](string.md#starts_with)
+that is used outside of `SEARCH` operations.
+
+- **path** (attribute path expression): the path of the attribute to compare
+  against in the document
+- **prefix** (string): a string to search at the start of the text
+- returns **startsWith** (bool): whether the specified attribute starts with
+  the given prefix
+
+---
+
+`STARTS_WITH(path, prefixes, minMatchCount) → startsWith`
+
+Match the value of the attribute that starts with one of the `prefixes`, or
+optionally with at least `minMatchCount` of the prefixes.
+
+- **path** (attribute path expression): the path of the attribute to compare
+  against in the document
+- **prefixes** (array): an array of strings to search at the start of the text
+- **minMatchCount** (number, _optional_): minimum number of search prefixes
+  that should be satisfied (see
+  [example](#example-searching-for-one-or-multiple-prefixes)). The default is `1`
+- returns **startsWith** (bool): whether the specified attribute starts with at
+  least `minMatchCount` of the given prefixes
+
+#### Example: Searching for an exact value prefix
+
+To match a document `{ "text": "lorem ipsum..." }` using a prefix and the
+`"identity"` Analyzer, you can use it like this:
+
+```aql
+FOR doc IN viewName
+  SEARCH STARTS_WITH(doc.text, "lorem ip")
+  RETURN doc
+```
+
+#### Example: Searching for a prefix in text
+
+This query will match `{ "text": "lorem ipsum" }` as well as
+`{ "text": [ "lorem", "ipsum" ] }` given a View which indexes the `text`
+attribute and processes it with the `"text_en"` Analyzer:
+
+```aql
+FOR doc IN viewName
+  SEARCH ANALYZER(STARTS_WITH(doc.text, "ips"), "text_en")
+  RETURN doc.text
+```
+
+Note that it will not match `{ "text": "IPS (in-plane switching)" }` without
+modification to the query. The prefix was passed to `STARTS_WITH()` as-is,
+but the built-in `text_en` Analyzer used for indexing has stemming enabled.
+So the indexed values are the following:
+
+```aql
+RETURN TOKENS("IPS (in-plane switching)", "text_en")
+```
+
+```json
+[
+  [
+    "ip",
+    "in",
+    "plane",
+    "switch"
+  ]
+]
+```
+
+The *s* is removed from *ips*, which leads to the prefix *ips* not matching
+the indexed token *ip*. You may either create a custom text Analyzer with
+stemming disabled to avoid this issue, or apply stemming to the prefixes:
+
+```aql
+FOR doc IN viewName
+  SEARCH ANALYZER(STARTS_WITH(doc.text, TOKENS("ips", "text_en")), "text_en")
+  RETURN doc.text
+```
+
+#### Example: Searching for one or multiple prefixes
+
+The `STARTS_WITH()` function accepts an array of prefix alternatives of which
+only one has to match:
+
+```aql
+FOR doc IN viewName
+  SEARCH ANALYZER(STARTS_WITH(doc.text, ["something", "ips"]), "text_en")
+  RETURN doc.text
+```
+
+It will match a document `{ "text": "lorem ipsum" }` but also
+`{ "text": "that is something" }`, as at least one of the words starts with a
+given prefix.
+
+The same query again, but with an explicit `minMatchCount`:
+
+```aql
+FOR doc IN viewName
+  SEARCH ANALYZER(STARTS_WITH(doc.text, ["wrong", "ips"], 1), "text_en")
+  RETURN doc.text
+```
+
+The number can be increased to require that at least this many prefixes must
+be present:
+
+```aql
+FOR doc IN viewName
+  SEARCH ANALYZER(STARTS_WITH(doc.text, ["lo", "ips", "something"], 2), "text_en")
+  RETURN doc.text
+```
+
+This will still match `{ "text": "lorem ipsum" }` because at least two prefixes
+(`lo` and `ips`) are found, but not `{ "text": "that is something" }`, which only
+contains one of the prefixes (`something`).
+
+### LEVENSHTEIN_MATCH()
+
+`LEVENSHTEIN_MATCH(path, target, distance, transpositions, maxTerms, prefix) → fulfilled`
+
+Match documents with a [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
+lower than or equal to `distance` between the stored attribute value and
+`target`. It can optionally match documents using a pure Levenshtein distance.
+
+See [`LEVENSHTEIN_DISTANCE()`](string.md#levenshtein_distance)
+if you want to calculate the edit distance of two strings.
+
+- **path** (attribute path expression\|string): the path of the attribute to
+  compare against in the document or a string
+- **target** (string): the string to compare against the stored attribute
+- **distance** (number): the maximum edit distance, which can be between
+  `0` and `4` if `transpositions` is `false`, and between `0` and `3` if
+  it is `true`
+- **transpositions** (bool, _optional_): if set to `false`, a Levenshtein
+  distance is computed, otherwise a Damerau-Levenshtein distance (default)
+- **maxTerms** (number, _optional_): consider only a specified number of the
+  most relevant terms. One can pass `0` to consider all matched terms, but it may
+  impact performance negatively. The default value is `64`.
+- **prefix** (string, _optional_): if defined, then a search for the exact
+  prefix is carried out, using the matches as candidates. The Levenshtein /
+  Damerau-Levenshtein distance is then computed for each candidate using
+  the `target` value and the remainders of the strings, which means that the
+  **prefix needs to be removed from `target`** (see
+  [example](#example-matching-with-prefix-search)). This option can improve
+  performance in cases where there is a known common prefix. The default value
+  is an empty string (introduced in v3.7.13, v3.8.1).
+- returns **fulfilled** (bool): `true` if the calculated distance is less than
+  or equal to *distance*, `false` otherwise
+
+#### Example: Matching with and without transpositions
+
+The Levenshtein distance between _quick_ and _quikc_ is `2` because it requires
+two operations to go from one to the other (remove _k_, insert _k_ at a
+different position).
+
+```aql
+FOR doc IN viewName
+  SEARCH LEVENSHTEIN_MATCH(doc.text, "quikc", 2, false) // matches "quick"
+  RETURN doc.text
+```
+
+The Damerau-Levenshtein distance is `1` (move _k_ to the end).
+
+```aql
+FOR doc IN viewName
+  SEARCH LEVENSHTEIN_MATCH(doc.text, "quikc", 1) // matches "quick"
+  RETURN doc.text
+```
+
+#### Example: Matching with prefix search
+
+Match documents with a Levenshtein distance of 1 with the prefix `qui`. The edit
+distance is calculated using the search term `kc` (`quikc` with the prefix `qui`
+removed) and the stored value without the prefix (e.g. `ck`). The prefix `qui`
+is constant.
+
+```aql
+FOR doc IN viewName
+  SEARCH LEVENSHTEIN_MATCH(doc.text, "kc", 1, false, 64, "qui") // matches "quick"
+  RETURN doc.text
+```
+
+You may compute the prefix and suffix from the input string as follows:
+
+```aql
+LET input = "quikc"
+LET prefixSize = 3
+LET prefix = LEFT(input, prefixSize)
+LET suffix = SUBSTRING(input, prefixSize)
+FOR doc IN viewName
+  SEARCH LEVENSHTEIN_MATCH(doc.text, suffix, 1, false, 64, prefix) // matches "quick"
+  RETURN doc.text
+```
+
+#### Example: Basing the edit distance on string length
+
+You may want to pick the maximum edit distance based on string length.
+If the stored attribute is the string _quick_ and the target string is
+_quicksands_, then the Levenshtein distance is 5, with 50% of the
+characters mismatching. If the inputs are _q_ and _qu_, then the distance
+is only 1, although it is also a 50% mismatch.
+
+```aql
+LET target = "input"
+LET targetLength = LENGTH(target)
+LET maxDistance = (targetLength > 5 ? 2 : (targetLength >= 3 ? 1 : 0))
+FOR doc IN viewName
+  SEARCH LEVENSHTEIN_MATCH(doc.text, target, maxDistance, true)
+  RETURN doc.text
+```
+
+### LIKE()
+
+`LIKE(path, search) → bool`
+
+Check whether the pattern `search` is contained in the attribute denoted by `path`,
+using wildcard matching.
+
+- `_`: A single arbitrary character
+- `%`: Zero, one or many arbitrary characters
+- `\\_`: A literal underscore
+- `\\%`: A literal percent sign
+
+{{< info >}}
+Literal backslashes require different amounts of escaping depending on the
+context:
+- `\` in bind variables (_Table_ view mode) in the web interface (automatically
+  escaped to `\\` unless the value is wrapped in double quotes and already
+  escaped properly)
+- `\\` in bind variables (_JSON_ view mode) and queries in the web interface
+- `\\` in bind variables in arangosh
+- `\\\\` in queries in arangosh
+- Double the amount compared to arangosh in shells that use backslashes for
+  escaping (`\\\\` in bind variables and `\\\\\\\\` in queries)
+{{< /info >}}
+
+Searching with the `LIKE()` function in the context of a `SEARCH` operation
+is backed by View indexes. The [String `LIKE()` function](string.md#like), on
+the other hand, is used in other contexts such as in `FILTER` operations and
+cannot be accelerated by any sort of index. Another difference is that
+the ArangoSearch variant does not accept a third argument to enable
+case-insensitive matching. This can be controlled with Analyzers instead.
+
+- **path** (attribute path expression): the path of the attribute to compare
+  against in the document
+- **search** (string): a search pattern that can contain the wildcard characters
+  `%` (meaning any sequence of characters, including none) and `_` (any single
+  character). Literal `%` and `_` must be escaped with backslashes.
+- returns **bool** (bool): `true` if the pattern is contained in the attribute
+  value, and `false` otherwise
+
+#### Example: Searching with wildcards
+
+```aql
+FOR doc IN viewName
+  SEARCH ANALYZER(LIKE(doc.text, "foo%b_r"), "text_en")
+  RETURN doc.text
+```
+
+`LIKE` can also be used in operator form:
+
+```aql
+FOR doc IN viewName
+  SEARCH ANALYZER(doc.text LIKE "foo%b_r", "text_en")
+  RETURN doc.text
+```
+
+## Geo Functions
+
+The following functions can be accelerated by View indexes. There are
+corresponding [Geo Functions](geo.md) for the regular geo index
+type, but also general purpose functions such as GeoJSON constructors that can
+be used in conjunction with ArangoSearch.
+
+### GEO_CONTAINS()
+
+Introduced in: v3.8.0
+
+`GEO_CONTAINS(geoJsonA, geoJsonB) → bool`
+
+Checks whether the [GeoJSON object](geo.md#geojson) `geoJsonA`
+fully contains `geoJsonB` (every point in B is also in A).
+
+- **geoJsonA** (object\|array): first GeoJSON object or coordinate array
+  (in longitude, latitude order)
+- **geoJsonB** (object\|array): second GeoJSON object or coordinate array
+  (in longitude, latitude order)
+- returns **bool** (bool): `true` when every point in B is also contained in A,
+  `false` otherwise
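+
+As a sketch, the following assumes a View `restaurantsView` over documents
+with a GeoJSON `location` attribute that is indexed with a custom Analyzer of
+type `geojson` named `"geojson"` (both names are placeholders chosen for this
+illustration):
+
+```aql
+// Define a rectangular GeoJSON polygon and find documents whose
+// location lies fully inside of it
+LET rect = GEO_POLYGON([
+  [ 6.345, 50.775 ],
+  [ 6.345, 50.800 ],
+  [ 6.425, 50.800 ],
+  [ 6.425, 50.775 ],
+  [ 6.345, 50.775 ]
+])
+FOR doc IN restaurantsView
+  SEARCH ANALYZER(GEO_CONTAINS(rect, doc.location), "geojson")
+  RETURN doc
+```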
+
+### GEO_DISTANCE()
+
+Introduced in: v3.8.0
+
+`GEO_DISTANCE(geoJsonA, geoJsonB) → distance`
+
+Return the distance between two [GeoJSON objects](geo.md#geojson),
+measured from the `centroid` of each shape.
+
+- **geoJsonA** (object\|array): first GeoJSON object or coordinate array
+  (in longitude, latitude order)
+- **geoJsonB** (object\|array): second GeoJSON object or coordinate array
+  (in longitude, latitude order)
+- returns **distance** (number): the distance between the centroid points of
+  the two objects on the reference ellipsoid
+
+### GEO_IN_RANGE()
+
+Introduced in: v3.8.0
+
+`GEO_IN_RANGE(geoJsonA, geoJsonB, low, high, includeLow, includeHigh) → bool`
+
+Checks whether the distance between two [GeoJSON objects](geo.md#geojson)
+lies within a given interval. The distance is measured from the `centroid` of
+each shape.
+
+- **geoJsonA** (object\|array): first GeoJSON object or coordinate array
+  (in longitude, latitude order)
+- **geoJsonB** (object\|array): second GeoJSON object or coordinate array
+  (in longitude, latitude order)
+- **low** (number): minimum value of the desired range
+- **high** (number): maximum value of the desired range
+- **includeLow** (bool, optional): whether the minimum value shall be included
+  in the range (left-closed interval) or not (left-open interval). The default
+  value is `true`
+- **includeHigh** (bool, optional): whether the maximum value shall be included
+  in the range (right-closed interval) or not (right-open interval). The default
+  value is `true`
+- returns **bool** (bool): whether the evaluated distance lies within the range
+
+### GEO_INTERSECTS()
+
+Introduced in: v3.8.0
+
+`GEO_INTERSECTS(geoJsonA, geoJsonB) → bool`
+
+Checks whether the [GeoJSON object](geo.md#geojson) `geoJsonA`
+intersects with `geoJsonB` (i.e. at least one point of B is in A or vice versa).
+
+- **geoJsonA** (object\|array): first GeoJSON object or coordinate array
+  (in longitude, latitude order)
+- **geoJsonB** (object\|array): second GeoJSON object or coordinate array
+  (in longitude, latitude order)
+- returns **bool** (bool): `true` if A and B intersect, `false` otherwise
+
+## Scoring Functions
+
+Scoring functions return a ranking value for the documents found by a
+[SEARCH operation](../high-level-operations/search.md). The better the documents match
+the search expression, the higher the returned number.
+
+The first argument to any scoring function is always the document emitted by
+a `FOR` operation over an `arangosearch` View.
+
+To sort the result set by relevance, with the more relevant documents coming
+first, sort in **descending order** by the score (e.g. `SORT BM25(...) DESC`).
+
+You may calculate custom scores based on a scoring function using document
+attributes and numeric functions (e.g. `TFIDF(doc) * LOG(doc.value)`):
+
+```aql
+FOR movie IN imdbView
+  SEARCH PHRASE(movie.title, "Star Wars", "text_en")
+  SORT BM25(movie) * LOG(movie.runtime + 1) DESC
+  RETURN movie
+```
+
+Sorting by more than one score is allowed. You may also sort by a mix of
+scores and attributes from multiple Views as well as collections:
+
+```aql
+FOR a IN viewA
+  FOR c IN coll
+    FOR b IN viewB
+      SORT TFIDF(b), c.name, BM25(a)
+      ...
+``` + +### BM25() + +`BM25(doc, k, b) → score` + +Sorts documents using the +[**Best Matching 25** algorithm](https://en.wikipedia.org/wiki/Okapi_BM25) +(Okapi BM25). + +- **doc** (document): must be emitted by `FOR ... IN viewName` +- **k** (number, _optional_): calibrates the text term frequency scaling. + The value needs to be non-negative (`0.0` or higher), or the returned + score is an undefined value that may cause unpredictable results. + The default is `1.2`. A `k` value of `0` corresponds to a binary model + (no term frequency), and a large value corresponds to using raw term frequency +- **b** (number, _optional_): determines the scaling by the total text length. + The value needs to be between `0.0` and `1.0` (inclusive), or the returned + score is an undefined value that may cause unpredictable results. + The default is `0.75`. At the extreme values of the coefficient `b`, BM25 + turns into the ranking functions known as: + - BM11 for `b` = `1` (corresponds to fully scaling the term weight by the + total text length) + - BM15 for `b` = `0` (corresponds to no length normalization) +- returns **score** (number): computed ranking value + +{{< info >}} +The Analyzers used for indexing document attributes must have the `"frequency"` +feature enabled. The `BM25()` function will otherwise return a score of 0. +The Analyzers should have the `"norm"` feature enabled, too, or normalization +will be disabled, which is not meaningful for BM25 and BM11. BM15 does not need +the `"norm"` feature as it has no length normalization. +{{< /info >}} + +#### Example: Sorting by default `BM25()` score + +Sorting by relevance with BM25 at default settings: + +```aql +FOR doc IN viewName + SEARCH ... + SORT BM25(doc) DESC + RETURN doc +``` + +#### Example: Sorting with tuned `BM25()` ranking + +Sorting by relevance, with double-weighted term frequency and with full text +length normalization: + +```aql +FOR doc IN viewName + SEARCH ... + SORT BM25(doc, 2.4, 1) DESC + RETURN doc +``` + +### TFIDF() + +`TFIDF(doc, normalize) → score` + +Sorts documents using the +[**term frequency–inverse document frequency** algorithm](https://en.wikipedia.org/wiki/TF-IDF) +(TF-IDF). + +- **doc** (document): must be emitted by `FOR ... IN viewName` +- **normalize** (bool, _optional_): specifies whether scores should be + normalized. The default is `false`. +- returns **score** (number): computed ranking value + +{{< info >}} +The Analyzers used for indexing document attributes must have the `"frequency"` +feature enabled. The `TFIDF()` function will otherwise return a score of 0. +The Analyzers need to have the `"norm"` feature enabled, too, if you want to use +`TFIDF()` with the `normalize` parameter set to `true`. +{{< /info >}} + +#### Example: Sorting by default `TFIDF()` score + +Sort by relevance using the TF-IDF score: + +```aql +FOR doc IN viewName + SEARCH ... + SORT TFIDF(doc) DESC + RETURN doc +``` + +#### Example: Sorting by `TFIDF()` score with normalization + +Sort by relevance using a normalized TF-IDF score: + +```aql +FOR doc IN viewName + SEARCH ... + SORT TFIDF(doc, true) DESC + RETURN doc +``` + +#### Example: Sort by value and `TFIDF()` + +Sort by the value of the `text` attribute in ascending order, then by the TFIDF +score in descending order where the attribute values are equivalent: + +```aql +FOR doc IN viewName + SEARCH ... 
+  SORT doc.text, TFIDF(doc) DESC
+  RETURN doc
+```
+
+## Search Highlighting Functions
+
+{{< tag "ArangoDB Enterprise Edition" "AMP" >}}
+
+### OFFSET_INFO()
+
+`OFFSET_INFO(doc, paths) → offsetInfo`
+
+Returns the attribute paths and substring offsets of matched terms, phrases, or
+_n_-grams for search highlighting purposes.
+
+- **doc** (document): must be emitted by `FOR ... IN viewName`
+- **paths** (string\|array): a string or an array of strings, each describing an
+  attribute and array element path you want to get the offsets for. Use `.` to
+  access nested objects, and `[n]` with `n` being an array index to specify array
+  elements. The attributes need to be indexed by Analyzers with the `offset`
+  feature enabled.
+- returns **offsetInfo** (array): an array of objects, limited to a default of
+  10 offsets per path. Each object has the following attributes:
+  - **name** (array): the attribute and array element path as an array of
+    strings and numbers. You can pass this name to the
+    [`VALUE()` function](document-object.md) to dynamically look up the value.
+  - **offsets** (array): an array of arrays with the matched positions. Each
+    inner array has two elements with the start offset and the length of a match.
+
+  {{< warning >}}
+  The offsets describe the positions in bytes, not characters. You may need
+  to account for characters encoded using multiple bytes.
+  {{< /warning >}}
+
+---
+
+`OFFSET_INFO(doc, rules) → offsetInfo`
+
+- **doc** (document): must be emitted by `FOR ... IN viewName`
+- **rules** (array): an array of objects with the following attributes:
+  - **name** (string): an attribute and array element path
+    you want to get the offsets for. Use `.` to access nested objects,
+    and `[n]` with `n` being an array index to specify array elements. The
+    attributes need to be indexed by Analyzers with the `offset` feature enabled.
+  - **options** (object): an object with the following attributes:
+    - **maxOffsets** (number, _optional_): the total number of offsets to
+      collect per path. Default: `10`.
+    - **limits** (object, _optional_): an object with the following attributes:
+      - **term** (number, _optional_): the total number of term offsets to
+        collect per path. Default: 2<sup>32</sup>.
+      - **phrase** (number, _optional_): the total number of phrase offsets to
+        collect per path. Default: 2<sup>32</sup>.
+      - **ngram** (number, _optional_): the total number of _n_-gram offsets to
+        collect per path. Default: 2<sup>32</sup>.
+- returns **offsetInfo** (array): an array of objects, each with the following
+  attributes:
+  - **name** (array): the attribute and array element path as an array of
+    strings and numbers. You can pass this name to the
+    [`VALUE()` function](document-object.md) to dynamically look up the value.
+  - **offsets** (array): an array of arrays with the matched positions, capped
+    to the specified limits. Each inner array has two elements with the start
+    offset and the length of a match.
+
+  {{< warning >}}
+  The start offsets and lengths describe the positions in bytes, not characters.
+  You may need to account for characters encoded using multiple bytes.
+  {{< /warning >}}
+
+**Examples**
+
+Search a View and get the offset information for the matches:
+
+```js
+---
+name: aqlOffsetInfo
+description: ''
+---
+~db._create("food");
+~db.food.save({ name: "avocado", description: { en: "The avocado is a medium-sized, evergreen tree, native to the Americas." } });
+~db.food.save({ name: "tomato", description: { en: "The tomato is the edible berry of the tomato plant." } });
+~var analyzers = require("@arangodb/analyzers");
+~var analyzer = analyzers.save("text_en_offset", "text", { locale: "en", stopwords: [] }, ["frequency", "norm", "position", "offset"]);
+~db._createView("food_view", "arangosearch", { links: { food: { fields: { description: { fields: { en: { analyzers: ["text_en_offset"] } } } } } } });
+~assert(db._query(`FOR d IN food_view COLLECT WITH COUNT INTO c RETURN c`).toArray()[0] === 2);
+db._query(`
+  FOR doc IN food_view
+    SEARCH ANALYZER(TOKENS("avocado tomato", "text_en_offset") ANY == doc.description.en, "text_en_offset")
+    RETURN OFFSET_INFO(doc, ["description.en"])`);
+~db._dropView("food_view");
+~db._drop("food");
+~analyzers.remove(analyzer.name);
+```
+
+For full examples, see [Search Highlighting](../../index-and-search/arangosearch/search-highlighting.md).
diff --git a/site/content/arangodb/oem/aql/functions/array.md b/site/content/arangodb/oem/aql/functions/array.md
new file mode 100644
index 0000000000..74cfa4e788
--- /dev/null
+++ b/site/content/arangodb/oem/aql/functions/array.md
@@ -0,0 +1,1041 @@
+---
+title: Array functions in AQL
+menuTitle: Array
+weight: 10
+description: >-
+  AQL provides functions for higher-level array manipulation in addition to
+  language constructs that can also be used for arrays
+---
+You can use the AQL functions listed below to work with lists of items. Also
+see the [numeric functions](numeric.md) for functions that work on number arrays.
+
+If you want to concatenate the elements of an array equivalent to `join()`
+in JavaScript, see [`CONCAT()`](string.md#concat) and
+[`CONCAT_SEPARATOR()`](string.md#concat_separator) in the string functions chapter.
+
+Apart from that, AQL also offers several language constructs (see the sketch
+after this list):
+
+- simple [array access](../fundamentals/data-types.md#arrays--lists) of individual elements,
+- [array operators](../operators.md#array-operators) for array expansion and contraction,
+  optionally with inline filter, limit and projection,
+- [array comparison operators](../operators.md#array-comparison-operators) to compare
+  each element in an array to a value or the elements of another array,
+- loop-based operations on arrays using [FOR](../high-level-operations/for.md),
+  [SORT](../high-level-operations/sort.md),
+  [LIMIT](../high-level-operations/limit.md),
+  as well as [COLLECT](../high-level-operations/collect.md) for grouping,
+  which also offers efficient aggregation.
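+
+A minimal sketch of the first two constructs (the variable and attribute
+names are made up for illustration):
+
+```aql
+LET people = [ { name: "Ann", age: 30 }, { name: "Bob", age: 25 } ]
+RETURN {
+  first: people[0].name,                           // array access of an individual element
+  names: people[*].name,                           // array expansion: ["Ann", "Bob"]
+  adults: people[* FILTER CURRENT.age >= 18].name  // expansion with inline filter
+}
+```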
+
+## APPEND()
+
+`APPEND(anyArray, values, unique) → newArray`
+
+Add all elements of an array to another array. All values are added at the end of the
+array (right side).
+
+It can also be used to append a single element to an array. It is not necessary to wrap
+it in an array (unless it is an array itself). You may also use [`PUSH()`](#push) instead.
+
+If `values` is an empty array or if either `anyArray` or `values` is `null`,
+then the other input array is returned unmodified.
+
+- **anyArray** (array): array with elements of arbitrary type
+- **values** (array\|any): array, whose elements shall be added to `anyArray`
+- **unique** (bool, *optional*): if set to `true`, only those `values` are added
+  that are not already contained in `anyArray`. The default is `false`.
+- returns **newArray** (array): the modified array + +**Examples** + +```aql +--- +name: aqlArrayAppend_1 +description: '' +--- +RETURN APPEND([ 1, 2, 3 ], [ 5, 6, 9 ]) +``` + +```aql +--- +name: aqlArrayAppend_2 +description: '' +--- +RETURN APPEND([ 1, 2, 3 ], [ 3, 4, 5, 2, 9 ], true) +``` + +## CONTAINS_ARRAY() + +This is an alias for [`POSITION()`](#position). + +## COUNT() + +This is an alias for [`LENGTH()`](#length). + +## COUNT_DISTINCT() + +`COUNT_DISTINCT(anyArray) → number` + +Get the number of distinct elements in an array. + +- **anyArray** (array): array with elements of arbitrary type +- returns **number**: the number of distinct elements in *anyArray*. + +**Examples** + +```aql +--- +name: aqlArrayCountDistinct_1 +description: '' +--- +RETURN COUNT_DISTINCT([ 1, 2, 3 ]) +``` + +```aql +--- +name: aqlArrayCountDistinct_2 +description: '' +--- +RETURN COUNT_DISTINCT([ "yes", "no", "yes", "sauron", "no", "yes" ]) +``` + +## COUNT_UNIQUE() + +This is an alias for [`COUNT_DISTINCT()`](#count_distinct). + +## FIRST() + +`FIRST(anyArray) → firstElement` + +Get the first element of an array. It is the same as `anyArray[0]`. + +- **anyArray** (array): array with elements of arbitrary type +- returns **firstElement** (any\|null): the first element of *anyArray*, or *null* if + the array is empty. + +**Examples** + +```aql +--- +name: aqlArrayFirst_1 +description: '' +--- +RETURN FIRST([ 1, 2, 3 ]) +``` + +```aql +--- +name: aqlArrayFirst_2 +description: '' +--- +RETURN FIRST([]) +``` + +## FLATTEN() + +`FLATTEN(anyArray, depth) → flatArray` + +Turn an array of arrays into a flat array. All array elements in *array* will be +expanded in the result array. Non-array elements are added as they are. The function +will recurse into sub-arrays up to the specified depth. Duplicates will not be removed. + +Also see [array contraction](../operators.md#array-contraction). + +- **array** (array): array with elements of arbitrary type, including nested arrays +- **depth** (number, *optional*): flatten up to this many levels, the default is 1 +- returns **flatArray** (array): a flattened array + +**Examples** + +```aql +--- +name: aqlArrayFlatten_1 +description: '' +--- +RETURN FLATTEN( [ 1, 2, [ 3, 4 ], 5, [ 6, 7 ], [ 8, [ 9, 10 ] ] ] ) +``` + +To fully flatten the example array, use a *depth* of 2: + +```aql +--- +name: aqlArrayFlatten_2 +description: '' +--- +RETURN FLATTEN( [ 1, 2, [ 3, 4 ], 5, [ 6, 7 ], [ 8, [ 9, 10 ] ] ], 2 ) +``` + +## INTERLEAVE() + +`INTERLEAVE(array1, array2, ... arrayN) → newArray` + +Accepts an arbitrary number of arrays and produces a new array with the elements +interleaved. It iterates over the input arrays in a round robin fashion, picks one element +from each array per iteration, and combines them in that sequence into a result array. +The input arrays can have different amounts of elements. 
+
+- **arrays** (array, *repeatable*): an arbitrary number of arrays as multiple
+  arguments (at least 2)
+- returns **newArray** (array): the interleaved array
+
+**Examples**
+
+```aql
+---
+name: aqlArrayInterleave_1
+description: ''
+---
+RETURN INTERLEAVE( [1, 1, 1], [2, 2, 2], [3, 3, 3] )
+```
+
+```aql
+---
+name: aqlArrayInterleave_2
+description: ''
+---
+RETURN INTERLEAVE( [ 1 ], [2, 2], [3, 3, 3] )
+```
+
+```aql
+---
+name: aqlArrayInterleave_3
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR v, e, p IN 1..3 OUTBOUND 'places/Toronto' GRAPH 'kShortestPathsGraph'
+  RETURN INTERLEAVE(p.vertices[*]._id, p.edges[*]._id)
+```
+
+## INTERSECTION()
+
+`INTERSECTION(array1, array2, ... arrayN) → newArray`
+
+Return the intersection of all arrays specified. The result is an array of values that
+occur in all arguments.
+
+Other set operations are [`UNION()`](#union), [`MINUS()`](#minus) and
+[`OUTERSECTION()`](#outersection).
+
+- **arrays** (array, *repeatable*): an arbitrary number of arrays as multiple arguments
+  (at least 2)
+- returns **newArray** (array): a single array with only the elements that exist in all
+  provided arrays. The element order is random. Duplicates are removed.
+
+**Examples**
+
+```aql
+---
+name: aqlArrayIntersection_1
+description: ''
+---
+RETURN INTERSECTION( [1,2,3,4,5], [2,3,4,5,6], [3,4,5,6,7] )
+```
+
+```aql
+---
+name: aqlArrayIntersection_2
+description: ''
+---
+RETURN INTERSECTION( [2,4,6], [8,10,12], [14,16,18] )
+```
+
+## JACCARD()
+
+`JACCARD(array1, array2) → jaccardIndex`
+
+Calculate the [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index)
+of two arrays.
+
+This similarity measure is also known as _Intersection over Union_ and could
+be computed (less efficiently and more verbosely) as follows:
+
+```aql
+COUNT(array1) == 0 && COUNT(array2) == 0
+? 1 // two empty sets have a similarity of 1 by definition
+: COUNT(INTERSECTION(array1, array2)) / COUNT(UNION_DISTINCT(array1, array2))
+```
+
+- **array1** (array): array with elements of arbitrary type
+- **array2** (array): array with elements of arbitrary type
+- returns **jaccardIndex** (number): calculated Jaccard index of the input
+  arrays *array1* and *array2*
+
+```aql
+---
+name: aqlArrayJaccard_1
+description: ''
+---
+RETURN JACCARD( [1,2,3,4], [3,4,5,6] )
+```
+
+```aql
+---
+name: aqlArrayJaccard_2
+description: ''
+---
+RETURN JACCARD( [1,1,2,2,2,3], [2,2,3,4] )
+```
+
+```aql
+---
+name: aqlArrayJaccard_3
+description: ''
+---
+RETURN JACCARD( [1,2,3], [] )
+```
+
+```aql
+---
+name: aqlArrayJaccard_4
+description: ''
+---
+RETURN JACCARD( [], [] )
+```
+
+## LAST()
+
+`LAST(anyArray) → lastElement`
+
+Get the last element of an array. It is the same as `anyArray[-1]`.
+
+- **anyArray** (array): array with elements of arbitrary type
+- returns **lastElement** (any\|null): the last element of *anyArray* or *null* if the
+  array is empty.
+
+**Example**
+
+```aql
+---
+name: aqlArrayLast_1
+description: ''
+---
+RETURN LAST( [1,2,3,4,5] )
+```
+
+## LENGTH()
+
+`LENGTH(anyArray) → length`
+
+Determine the number of elements in an array.
+
+- **anyArray** (array): array with elements of arbitrary type
+- returns **length** (number): the number of array elements in *anyArray*.
+
+`LENGTH()` can also determine the [number of attribute keys](document-object.md#length)
+of an object / document, the [amount of documents](miscellaneous.md#length) in a
+collection and the [character length](string.md#length) of a string.
+ +| Input | Length | +|--------|--------| +| String | Number of Unicode characters | +| Number | Number of Unicode characters that represent the number | +| Array | Number of elements | +| Object | Number of first level elements | +| true | 1 | +| false | 0 | +| null | 0 | + +**Examples** + +```aql +--- +name: aqlArrayLength_1 +description: '' +--- +RETURN LENGTH( "🥑" ) +``` + +```aql +--- +name: aqlArrayLength_2 +description: '' +--- +RETURN LENGTH( 1234 ) +``` + +```aql +--- +name: aqlArrayLength_3 +description: '' +--- +RETURN LENGTH( [1,2,3,4,5,6,7] ) +``` + +```aql +--- +name: aqlArrayLength_4 +description: '' +--- +RETURN LENGTH( false ) +``` + +```aql +--- +name: aqlArrayLength_5 +description: '' +--- +RETURN LENGTH( {a:1, b:2, c:3, d:4, e:{f:5,g:6}} ) +``` + +## MINUS() + +`MINUS(array1, array2, ... arrayN) → newArray` + +Return the difference of all arrays specified. + +Other set operations are [`UNION()`](#union), [`INTERSECTION()`](#intersection) +and [`OUTERSECTION()`](#outersection). + +- **arrays** (array, *repeatable*): an arbitrary number of arrays as multiple + arguments (at least 2) +- returns **newArray** (array): an array of values that occur in the first array, + but not in any of the subsequent arrays. The order of the result array is undefined + and should not be relied on. Duplicates will be removed. + +**Example** + +```aql +--- +name: aqlArrayMinus_1 +description: '' +--- +RETURN MINUS( [1,2,3,4], [3,4,5,6], [5,6,7,8] ) +``` + +## NTH() + +`NTH(anyArray, position) → nthElement` + +Get the element of an array at a given position. It is the same as `anyArray[position]` +for positive positions, but does not support negative positions. + +- **anyArray** (array): array with elements of arbitrary type +- **position** (number): position of desired element in array, positions start at 0 +- returns **nthElement** (any\|null): the array element at the given *position*. + If *position* is negative or beyond the upper bound of the array, + then *null* will be returned. + +**Examples** + +```aql +--- +name: aqlArrayNth_1 +description: '' +--- +RETURN NTH( [ "foo", "bar", "baz" ], 2 ) +``` + +```aql +--- +name: aqlArrayNth_2 +description: '' +--- +RETURN NTH( [ "foo", "bar", "baz" ], 3 ) +``` + +```aql +--- +name: aqlArrayNth_3 +description: '' +--- +RETURN NTH( [ "foo", "bar", "baz" ], -1 ) +``` + +## OUTERSECTION() + +`OUTERSECTION(array1, array2, ... arrayN) → newArray` + +Return the values that occur only once across all arrays specified. + +Other set operations are [`UNION()`](#union), [`MINUS()`](#minus) and +[`INTERSECTION()`](#intersection). + +- **arrays** (array, *repeatable*): an arbitrary number of arrays as multiple arguments + (at least 2) +- returns **newArray** (array): a single array with only the elements that exist only once + across all provided arrays. The element order is random. + +**Example** + +```aql +--- +name: aqlArrayOutersection_1 +description: '' +--- +RETURN OUTERSECTION( [ 1, 2, 3 ], [ 2, 3, 4 ], [ 3, 4, 5 ] ) +``` + +## POP() + +`POP(anyArray) → newArray` + +Remove the last element of *array*. + +To append an element (right side), see [`PUSH()`](#push).\ +To remove the first element, see [`SHIFT()`](#shift).\ +To remove an element at an arbitrary position, see [`REMOVE_NTH()`](#remove_nth). + +- **anyArray** (array): an array with elements of arbitrary type +- returns **newArray** (array): *anyArray* without the last element. If it's already + empty or has only a single element left, an empty array is returned. 
+ +**Examples** + +```aql +--- +name: aqlArrayPop_1 +description: '' +--- +RETURN POP( [ 1, 2, 3, 4 ] ) +``` + +```aql +--- +name: aqlArrayPop_2 +description: '' +--- +RETURN POP( [ 1 ] ) +``` + +## POSITION() + +`POSITION(anyArray, search, returnIndex) → position` + +Return whether *search* is contained in *array*. Optionally return the position. + +- **anyArray** (array): the haystack, an array with elements of arbitrary type +- **search** (any): the needle, an element of arbitrary type +- **returnIndex** (bool, *optional*): if set to *true*, the position of the match + is returned instead of a boolean. The default is *false*. +- returns **position** (bool\|number): *true* if *search* is contained in *anyArray*, + *false* otherwise. If *returnIndex* is enabled, the position of the match is + returned (positions start at 0), or *-1* if it's not found. + +If you want to check if a value is in an array, you can alternatively use +the [`IN` operator](../operators.md#comparison-operators), for example, +`3 IN [1, 2, 3]` instead of `POSITION([1, 2, 3], 3)`. + +To determine if or at which position a string occurs in another string, see the +[`CONTAINS()` string function](string.md#contains). + +**Examples** + +Test whether a value is contained in an array: + +```aql +--- +name: aqlArrayPosition_1 +description: '' +--- +RETURN POSITION( [2,4,6,8], 4 ) +``` + +Return the position of the match, i.e. the array index, or `-1` if the value is +not contained in the array: + +```aql +--- +name: aqlArrayPosition_2 +description: '' +--- +RETURN POSITION( [2,4,6,8], 4, true ) +``` + +If you want to search a list of objects, you can use the +[array expansion operator `[*]`](../operators.md#array-expansion). +For example, you can get an attribute from each object using the operator, and +then determine the array index of the first match using the `POSITION()` function: + +```aql +--- +name: aqlArrayPosition_3 +description: '' +--- +LET arr = [ { value: "foo" }, { value: "bar" }, { value: "baz" }, { value: "bay"} ] +RETURN POSITION(arr[*].value, "baz", true) +``` + +If you are not interested in the actual position but only want to check for +existence, you may use the `IN` operator instead of calling `POSITION()`, like +`"baz" IN arr[*].value`. + +## PUSH() + +`PUSH(anyArray, value, unique) → newArray` + +Append *value* to *anyArray* (right side). + +To remove the last element, see [`POP()`](#pop).\ +To prepend a value (left side), see [`UNSHIFT()`](#unshift).\ +To append multiple elements, see [`APPEND()`](#append). + +- **anyArray** (array): array with elements of arbitrary type +- **value** (any): an element of arbitrary type +- **unique** (bool): if set to *true*, then *value* is not added if already + present in the array. The default is *false*. +- returns **newArray** (array): *anyArray* with *value* added at the end + (right side) + +Note: The *unique* flag only controls if *value* is added if it's already present +in *anyArray*. Duplicate elements that already exist in *anyArray* will not be +removed. To make an array unique, use the [`UNIQUE()`](#unique) function. + +**Examples** + +```aql +--- +name: aqlArrayPush_1 +description: '' +--- +RETURN PUSH([ 1, 2, 3 ], 4) +``` + +```aql +--- +name: aqlArrayPush_2 +description: '' +--- +RETURN PUSH([ 1, 2, 2, 3 ], 2, true) +``` + +## REMOVE_NTH() + +`REMOVE_NTH(anyArray, position) → newArray` + +Remove the element at *position* from the *anyArray*. + +To remove the first element, see [`SHIFT()`](#shift).\ +To remove the last element, see [`POP()`](#pop). 
+
+- **anyArray** (array): array with elements of arbitrary type
+- **position** (number): the position of the element to remove. Positions start
+  at 0. Negative positions are supported, with -1 being the last array element.
+  If *position* is out of bounds, the array is returned unmodified.
+- returns **newArray** (array): *anyArray* without the element at *position*
+
+**Examples**
+
+```aql
+---
+name: aqlArrayRemoveNth_1
+description: ''
+---
+RETURN REMOVE_NTH( [ "a", "b", "c", "d", "e" ], 1 )
+```
+
+```aql
+---
+name: aqlArrayRemoveNth_2
+description: ''
+---
+RETURN REMOVE_NTH( [ "a", "b", "c", "d", "e" ], -2 )
+```
+
+## REPLACE_NTH()
+
+`REPLACE_NTH(anyArray, position, replaceValue, defaultPaddingValue) → newArray`
+
+Replace the element at *position* in *anyArray* with *replaceValue*.
+
+- **anyArray** (array): array with elements of arbitrary type
+- **position** (number): the position of the element to replace. Positions start
+  at 0. Negative positions are supported, with -1 being the last array element.
+  If a negative *position* is out of bounds, then it is set to the first element (0)
+- **replaceValue** (any): the value to be inserted at *position*
+- **defaultPaddingValue** (any, *optional*): a value to be used for padding if
+  *position* is two or more elements beyond the last element in *anyArray*
+- returns **newArray** (array): *anyArray* with the element at *position*
+  replaced by *replaceValue*, or appended to *anyArray* and possibly padded by
+  *defaultPaddingValue*
+
+It is allowed to specify a position beyond the upper array boundary:
+- *replaceValue* is appended if *position* is equal to the array length
+- if it is higher, *defaultPaddingValue* is appended to *anyArray* as many
+  times as needed to place *replaceValue* at *position*
+- if no *defaultPaddingValue* is supplied in the above case, then a query error
+  is raised
+
+**Examples**
+
+```aql
+---
+name: aqlArrayReplaceNth_1
+description: ''
+---
+RETURN REPLACE_NTH( [ "a", "b", "c" ], 1, "z" )
+```
+
+```aql
+---
+name: aqlArrayReplaceNth_2
+description: ''
+---
+RETURN REPLACE_NTH( [ "a", "b", "c" ], 3, "z" )
+```
+
+```aql
+---
+name: aqlArrayReplaceNth_4
+description: ''
+---
+RETURN REPLACE_NTH( [ "a", "b", "c" ], 6, "z", "y" )
+```
+
+```aql
+---
+name: aqlArrayReplaceNth_5
+description: ''
+---
+RETURN REPLACE_NTH( [ "a", "b", "c" ], -1, "z" )
+```
+
+```aql
+---
+name: aqlArrayReplaceNth_6
+description: ''
+---
+RETURN REPLACE_NTH( [ "a", "b", "c" ], -9, "z" )
+```
+
+Trying to access a position beyond the upper bound without providing a padding
+value results in an error:
+
+```js
+---
+name: aqlArrayReplaceNth_3
+description: ''
+---
+db._query('RETURN REPLACE_NTH( [ "a", "b", "c" ], 6 , "z")'); // xpError(ERROR_QUERY_FUNCTION_ARGUMENT_TYPE_MISMATCH)
+```
+
+## REMOVE_VALUE()
+
+`REMOVE_VALUE(anyArray, value, limit) → newArray`
+
+Remove all occurrences of *value* in *anyArray*. Optionally with a *limit*
+to the number of removals.
+
+- **anyArray** (array): array with elements of arbitrary type
+- **value** (any): an element of arbitrary type
+- **limit** (number, *optional*): cap the number of removals to this value
+- returns **newArray** (array): *anyArray* with *value* removed
+
+**Examples**
+
+```aql
+---
+name: aqlArrayRemoveValue_1
+description: ''
+---
+RETURN REMOVE_VALUE( [ "a", "b", "b", "a", "c" ], "a" )
+```
+
+```aql
+---
+name: aqlArrayRemoveValue_2
+description: ''
+---
+RETURN REMOVE_VALUE( [ "a", "b", "b", "a", "c" ], "a", 1 )
+```
+
+## REMOVE_VALUES()
+
+`REMOVE_VALUES(anyArray, values) → newArray`
+
+Remove all occurrences of any of the *values* from *anyArray*.
+
+- **anyArray** (array): array with elements of arbitrary type
+- **values** (array): an array with elements of arbitrary type, that shall
+  be removed from *anyArray*
+- returns **newArray** (array): *anyArray* with all individual *values* removed
+
+**Example**
+
+```aql
+---
+name: aqlArrayRemoveValues_1
+description: ''
+---
+RETURN REMOVE_VALUES( [ "a", "a", "b", "c", "d", "e", "f" ], [ "a", "f", "d" ] )
+```
+
+## REVERSE()
+
+`REVERSE(anyArray) → reversedArray`
+
+Return an array with its elements reversed.
+
+- **anyArray** (array): array with elements of arbitrary type
+- returns **reversedArray** (array): a new array with all elements of *anyArray* in
+  reversed order
+
+**Example**
+
+```aql
+---
+name: aqlArrayReverse_1
+description: ''
+---
+RETURN REVERSE( [2,4,6,8,10] )
+```
+
+## SHIFT()
+
+`SHIFT(anyArray) → newArray`
+
+Remove the first element of *anyArray*.
+
+To prepend an element (left side), see [`UNSHIFT()`](#unshift).\
+To remove the last element, see [`POP()`](#pop).\
+To remove an element at an arbitrary position, see [`REMOVE_NTH()`](#remove_nth).
+
+- **anyArray** (array): array with elements of arbitrary type
+- returns **newArray** (array): *anyArray* without the left-most element. If *anyArray*
+  is already empty or has only one element left, an empty array is returned.
+
+**Examples**
+
+```aql
+---
+name: aqlArrayShift_1
+description: ''
+---
+RETURN SHIFT( [ 1, 2, 3, 4 ] )
+```
+
+```aql
+---
+name: aqlArrayShift_2
+description: ''
+---
+RETURN SHIFT( [ 1 ] )
+```
+
+## SLICE()
+
+`SLICE(anyArray, start, length) → newArray`
+
+Extract a slice of *anyArray*.
+
+- **anyArray** (array): array with elements of arbitrary type
+- **start** (number): start extraction at this element. Positions start at 0.
+  Negative values indicate positions from the end of the array.
+- **length** (number, *optional*): extract up to *length* elements, or all
+  elements from *start* up to *length* if negative (exclusive)
+- returns **newArray** (array): the specified slice of *anyArray*. If *length*
+  is not specified, all array elements starting at *start* will be returned.
+
+**Examples**
+
+```aql
+---
+name: aqlArraySlice_1
+description: ''
+---
+RETURN SLICE( [ 1, 2, 3, 4, 5 ], 0, 1 )
+```
+
+```aql
+---
+name: aqlArraySlice_2
+description: ''
+---
+RETURN SLICE( [ 1, 2, 3, 4, 5 ], 1, 2 )
+```
+
+```aql
+---
+name: aqlArraySlice_3
+description: ''
+---
+RETURN SLICE( [ 1, 2, 3, 4, 5 ], 3 )
+```
+
+```aql
+---
+name: aqlArraySlice_4
+description: ''
+---
+RETURN SLICE( [ 1, 2, 3, 4, 5 ], 1, -1 )
+```
+
+```aql
+---
+name: aqlArraySlice_5
+description: ''
+---
+RETURN SLICE( [ 1, 2, 3, 4, 5 ], 0, -2 )
+```
+
+```aql
+---
+name: aqlArraySlice_6
+description: ''
+---
+RETURN SLICE( [ 1, 2, 3, 4, 5 ], -3, 2 )
+```
+
+## SORTED()
+
+`SORTED(anyArray) → newArray`
+
+Sort all elements in *anyArray*.
The function will use the default comparison +order for AQL value types. + +- **anyArray** (array): array with elements of arbitrary type +- returns **newArray** (array): *anyArray*, with elements sorted + +**Example** + +```aql +--- +name: aqlArraySorted_1 +description: '' +--- +RETURN SORTED( [ 8,4,2,10,6 ] ) +``` + +## SORTED_UNIQUE() + +`SORTED_UNIQUE(anyArray) → newArray` + +Sort all elements in *anyArray*. The function will use the default comparison +order for AQL value types. Additionally, the values in the result array will +be made unique. + +- **anyArray** (array): array with elements of arbitrary type +- returns **newArray** (array): *anyArray*, with elements sorted and duplicates + removed + +**Example** + +```aql +--- +name: aqlArraySortedUnique_1 +description: '' +--- +RETURN SORTED_UNIQUE( [ 8,4,2,10,6,2,8,6,4 ] ) +``` + +## UNION() + +`UNION(array1, array2, ... arrayN) → newArray` + +Return the union of all arrays specified. + +Other set operations are [`MINUS()`](#minus), [`INTERSECTION()`](#intersection) +and [`OUTERSECTION()`](#outersection). + +- **arrays** (array, *repeatable*): an arbitrary number of arrays as multiple + arguments (at least 2) +- returns **newArray** (array): all array elements combined in a single array, + in any order + +**Examples** + +```aql +--- +name: aqlArrayUnion_1 +description: '' +--- +RETURN UNION( + [ 1, 2, 3 ], + [ 1, 2 ] +) +``` + +Note: No duplicates will be removed. In order to remove duplicates, please use +either [`UNION_DISTINCT()`](#union_distinct) or apply [`UNIQUE()`](#unique) on the +result of `UNION()`: + +```aql +--- +name: aqlArrayUnion_2 +description: '' +--- +RETURN UNIQUE( + UNION( + [ 1, 2, 3 ], + [ 1, 2 ] + ) +) +``` + +## UNION_DISTINCT() + +`UNION_DISTINCT(array1, array2, ... arrayN) → newArray` + +Return the union of distinct values of all arrays specified. + +- **arrays** (array, *repeatable*): an arbitrary number of arrays as multiple + arguments (at least 2) +- returns **newArray** (array): the elements of all given arrays in a single + array, without duplicates, in any order + +**Example** + +```aql +--- +name: aqlArrayUnionDistinct_1 +description: '' +--- +RETURN UNION_DISTINCT( + [ 1, 2, 3 ], + [ 1, 2 ] +) +``` + +## UNIQUE() + +`UNIQUE(anyArray) → newArray` + +Return all unique elements in *anyArray*. To determine uniqueness, the +function will use the comparison order. + +- **anyArray** (array): array with elements of arbitrary type +- returns **newArray** (array): *anyArray* without duplicates, in any order + +**Example** + +```aql +--- +name: aqlArrayUnique_1 +description: '' +--- +RETURN UNIQUE( [ 1,2,2,3,3,3,4,4,4,4,5,5,5,5,5 ] ) +``` + +## UNSHIFT() + +`UNSHIFT(anyArray, value, unique) → newArray` + +Prepend *value* to *anyArray* (left side). + +To remove the first element, see [`SHIFT()`](#shift).\ +To append a value (right side), see [`PUSH()`](#push). + +- **anyArray** (array): array with elements of arbitrary type +- **value** (any): an element of arbitrary type +- **unique** (bool): if set to *true*, then *value* is not added if already + present in the array. The default is *false*. +- returns **newArray** (array): *anyArray* with *value* added at the start + (left side) + +Note: The *unique* flag only controls if *value* is added if it's already present +in *anyArray*. Duplicate elements that already exist in *anyArray* will not be +removed. To make an array unique, use the [`UNIQUE()`](#unique) function. 
diff --git a/site/content/arangodb/oem/aql/functions/bit.md b/site/content/arangodb/oem/aql/functions/bit.md
new file mode 100644
index 0000000000..bca40a82f6
--- /dev/null
+++ b/site/content/arangodb/oem/aql/functions/bit.md
@@ -0,0 +1,321 @@
+---
+title: Bit functions in AQL
+menuTitle: Bit
+weight: 15
+description: >-
+  AQL offers a set of bit manipulation and interpretation functions for bitwise
+  arithmetic
+---
+Bit functions can operate on numeric integer values in the range between 0
+and 4294967295 (2^32 - 1), both inclusive. This allows treating numbers as
+bitsets of up to 32 members. Using any of the bit functions on numbers outside
+the supported range will make the function return `null` and register a warning.
+
+The value range for the bit functions is conservatively small, so that no
+precision loss or rounding errors should occur when the input/output values of
+bit functions are passed around or sent over the wire to client applications
+with unknown precision number types.
+
+## BIT_AND()
+
+`BIT_AND(numbersArray) → result`
+
+And-combines the numeric values in *numbersArray* into a single numeric result
+value.
+
+- **numbersArray** (array): array with numeric input values
+- returns **result** (number\|null): and-combined result
+
+The function expects an array with numeric values as its input. The values in
+the array must be numbers, which must not be negative. The maximum supported
+input number value is 2^32 - 1. Input number values outside the allowed
+range will make the function return `null` and produce a warning. Any `null`
+values in the input array are ignored.
+
+---
+
+`BIT_AND(value1, value2) → result`
+
+If two numbers are passed as individual function parameters to `BIT_AND()`, it
+will return the bitwise and value of its two operands. Only numbers in the
+range 0 to 2^32 - 1 are allowed as input values.
+
+- **value1** (number): first operand
+- **value2** (number): second operand
+- returns **result** (number\|null): and-combined result
+
+```aql
+BIT_AND([1, 4, 8, 16]) // 0
+BIT_AND([3, 7, 63]) // 3
+BIT_AND([255, 127, null, 63]) // 63
+BIT_AND(127, 255) // 127
+BIT_AND("foo") // null
+```
+
+## BIT_CONSTRUCT()
+
+`BIT_CONSTRUCT(positionsArray) → result`
+
+Construct a number value with its bits set at the positions given in the array.
+
+- **positionsArray** (array): array with bit positions to set (zero-based)
+- returns **result** (number\|null): the generated number
+
+The function expects an array with numeric values as its input. The values in
+the array must be numbers, which must not be negative. The maximum supported
+input number value is 31. Input number values outside the allowed range will
+make the function return `null` and produce a warning.
+
+```aql
+BIT_CONSTRUCT([1, 2, 3]) // 14
+BIT_CONSTRUCT([0, 4, 8]) // 273
+BIT_CONSTRUCT([0, 1, 10, 31]) // 2147484675
+```
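+
+`BIT_CONSTRUCT()` is the counterpart of [`BIT_DECONSTRUCT()`](#bit_deconstruct)
+below, so applying both in sequence returns the original input (a small
+illustration based on the values above):
+
+```aql
+RETURN BIT_DECONSTRUCT(BIT_CONSTRUCT([0, 4, 8])) // [0, 4, 8]
+```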
+
+## BIT_DECONSTRUCT()
+
+`BIT_DECONSTRUCT(number) → positionsArray`
+
+Deconstruct a number value into an array with the positions of its set bits.
+
+- **number** (number): the input value to deconstruct
+- returns **positionsArray** (array\|null): array with bit positions set (zero-based)
+
+The function turns a numeric value into an array with the positions of all its
+set bits. The positions in the output array are zero-based.
+The input value must be a number between 0 and 2^32 - 1 (inclusive).
+The function will return `null` for any other inputs and produce a warning.
+
+```aql
+BIT_DECONSTRUCT(14) // [1, 2, 3]
+BIT_DECONSTRUCT(273) // [0, 4, 8]
+BIT_DECONSTRUCT(2147484675) // [0, 1, 10, 31]
+```
+
+## BIT_FROM_STRING()
+
+`BIT_FROM_STRING(bitstring) → number`
+
+Converts a bitstring (consisting of digits `0` and `1`) into a number.
+
+To convert a number into a bitstring, see [`BIT_TO_STRING()`](#bit_to_string).
+
+- **bitstring** (string): string sequence consisting of `0` and `1` characters
+- returns **number** (number\|null): the parsed number
+
+The input value must be a bitstring, consisting only of `0` and `1` characters.
+The bitstring can contain up to 32 significant bits, including any leading zeros.
+Note that the bitstring must not start with `0b`.
+If the bitstring has an invalid format, this function returns `null` and produces
+a warning.
+
+```aql
+BIT_FROM_STRING("0111") // 7
+BIT_FROM_STRING("000000000000010") // 2
+BIT_FROM_STRING("11010111011101") // 13789
+BIT_FROM_STRING("100000000000000000000") // 1048576
+```
+
+## BIT_NEGATE()
+
+`BIT_NEGATE(number, bits) → result`
+
+Bitwise-negates the bits in **number**, and keeps up to **bits** bits in the
+result.
+
+- **number** (number): the number to negate
+- **bits** (number): number of bits to keep in the result (0 to 32)
+- returns **result** (number\|null): the resulting number, with up to **bits**
+  significant bits
+
+The input value must be a number between 0 and 2^32 - 1 (inclusive).
+The number of bits must be between 0 and 32. The function will return `null` for
+any other inputs and produce a warning.
+
+```aql
+BIT_NEGATE(0, 8) // 255
+BIT_NEGATE(0, 10) // 1023
+BIT_NEGATE(3, 4) // 12
+BIT_NEGATE(446359921, 32) // 3848607374
+```
+
+## BIT_OR()
+
+`BIT_OR(numbersArray) → result`
+
+Or-combines the numeric values in *numbersArray* into a single numeric result
+value.
+
+- **numbersArray** (array): array with numeric input values
+- returns **result** (number\|null): or-combined result
+
+The function expects an array with numeric values as its input. The values in
+the array must be numbers, which must not be negative. The maximum supported
+input number value is 2^32 - 1. Input number values outside the
+allowed range will make the function return `null` and produce a warning.
+Any `null` values in the input array are ignored.
+
+---
+
+`BIT_OR(value1, value2) → result`
+
+If two numbers are passed as individual function parameters to `BIT_OR()`, it
+will return the bitwise or value of its two operands. Only numbers in the range
+0 to 2^32 - 1 are allowed as input values.
+
+- **value1** (number): first operand
+- **value2** (number): second operand
+- returns **result** (number\|null): or-combined result
+
+```aql
+BIT_OR([1, 4, 8, 16]) // 29
+BIT_OR([3, 7, 63]) // 63
+BIT_OR([255, 127, null, 63]) // 255
+BIT_OR(255, 127) // 255
+BIT_OR("foo") // null
+```
+
+## BIT_POPCOUNT()
+
+`BIT_POPCOUNT(number) → result`
+
+Counts the number of bits set in the input value.
+
+- **number** (number): the number to count the set bits of
+- returns **result** (number\|null): number of bits set in the input value
+
+The input value must be a number between 0 and 2^32 - 1 (inclusive).
+The function will return `null` for any other inputs and produce a warning.
+
+```aql
+BIT_POPCOUNT(0) // 0
+BIT_POPCOUNT(255) // 8
+BIT_POPCOUNT(69399252) // 12
+BIT_POPCOUNT("foo") // null
+```
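+
+Combined with the other bit functions, this can for example determine how many
+flags two bitmasks have in common (a minimal sketch with made-up flag values):
+
+```aql
+// 5 = 0b101 and 7 = 0b111 share two set bits
+RETURN BIT_POPCOUNT(BIT_AND([5, 7])) // 2
+```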
+
+## BIT_SHIFT_LEFT()
+
+`BIT_SHIFT_LEFT(number, shift, bits) → result`
+
+Bitwise-shifts the bits in **number** to the left, and keeps up to **bits**
+bits in the result. When bits overflow due to the shift, they are discarded.
+
+- **number** (number): the number to shift
+- **shift** (number): number of bits to shift (0 to 32)
+- **bits** (number): number of bits to keep in the result (0 to 32)
+- returns **result** (number\|null): the resulting number, with up to **bits**
+  significant bits
+
+The input value must be a number between 0 and 2^32 - 1 (inclusive).
+The number of bits must be between 0 and 32. The function will return `null` for
+any other inputs and produce a warning.
+
+```aql
+BIT_SHIFT_LEFT(0, 1, 8) // 0
+BIT_SHIFT_LEFT(7, 1, 16) // 14
+BIT_SHIFT_LEFT(2, 10, 16) // 2048
+BIT_SHIFT_LEFT(878836, 16, 32) // 1760821248
+```
+
+## BIT_SHIFT_RIGHT()
+
+`BIT_SHIFT_RIGHT(number, shift, bits) → result`
+
+Bitwise-shifts the bits in **number** to the right, and keeps up to **bits**
+bits in the result. When bits overflow due to the shift, they are discarded.
+
+- **number** (number): the number to shift
+- **shift** (number): number of bits to shift (0 to 32)
+- **bits** (number): number of bits to keep in the result (0 to 32)
+- returns **result** (number\|null): the resulting number, with up to **bits**
+  significant bits
+
+The input value must be a number between 0 and 2^32 - 1 (inclusive).
+The number of bits must be between 0 and 32. The function will return `null` for
+any other inputs and produce a warning.
+
+```aql
+BIT_SHIFT_RIGHT(0, 1, 8) // 0
+BIT_SHIFT_RIGHT(33, 1, 16) // 16
+BIT_SHIFT_RIGHT(65536, 13, 16) // 8
+BIT_SHIFT_RIGHT(878836, 4, 32) // 54927
+```
+
+## BIT_TEST()
+
+`BIT_TEST(number, index) → result`
+
+Tests if the bit at position *index* is set in **number**.
+
+- **number** (number): the number to test
+- **index** (number): index of the bit to test (0 to 31)
+- returns **result** (boolean\|null): whether the bit is set
+
+The input value must be a number between 0 and 2^32 - 1 (inclusive).
+The **index** must be between 0 and 31. The function will return `null` for any
+other inputs and produce a warning.
+
+```aql
+BIT_TEST(0, 3) // false
+BIT_TEST(255, 0) // true
+BIT_TEST(7, 2) // true
+BIT_TEST(255, 8) // false
+```
+
+## BIT_TO_STRING()
+
+`BIT_TO_STRING(number, bits) → bitstring`
+
+Converts a numeric input value into a bitstring, consisting of `0` and `1`.
+
+To convert a bitstring into a number, see [`BIT_FROM_STRING()`](#bit_from_string).
+
+- **number** (number): the number to stringify
+- **bits** (number): number of bits to keep in the output (0 to 32)
+- returns **bitstring** (string\|null): bitstring generated from the input value
+
+The input value must be a number between 0 and 2^32 - 1 (inclusive).
+The number of bits must be between 0 and 32. The function will return `null` for
+any other inputs and produce a warning.
+
+```aql
+BIT_TO_STRING(7, 4) // "0111"
+BIT_TO_STRING(255, 8) // "11111111"
+BIT_TO_STRING(60, 8) // "00111100"
+BIT_TO_STRING(1048576, 32) // "00000000000100000000000000000000"
+```
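+
+`BIT_TO_STRING()` and [`BIT_FROM_STRING()`](#bit_from_string) are inverse
+operations (a small illustration using a value from the examples above):
+
+```aql
+RETURN BIT_FROM_STRING(BIT_TO_STRING(13789, 14)) // 13789
+```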
+
+## BIT_XOR()
+
+`BIT_XOR(numbersArray) → result`
+
+Exclusive-or-combines the numeric values in *numbersArray* into a single
+numeric result value.
+
+- **numbersArray** (array): array with numeric input values
+- returns **result** (number\|null): xor-combined result
+
+The function expects an array with numeric values as its input. The values in
+the array must be numbers, which must not be negative. The maximum supported
+input number value is 2^32 - 1. Input number values outside the
+allowed range will make the function return `null` and produce a warning.
+Any `null` values in the input array are ignored.
+
+---
+
+`BIT_XOR(value1, value2) → result`
+
+If two numbers are passed as individual function parameters to `BIT_XOR()`, it
+will return the bitwise exclusive or value of its two operands. Only numbers in
+the range 0 to 2^32 - 1 are allowed as input values.
+
+- **value1** (number): first operand
+- **value2** (number): second operand
+- returns **result** (number\|null): xor-combined result
+
+```aql
+BIT_XOR([1, 4, 8, 16]) // 29
+BIT_XOR([3, 7, 63]) // 59
+BIT_XOR([255, 127, null, 63]) // 191
+BIT_XOR(255, 257) // 510
+BIT_XOR("foo") // null
+```
diff --git a/site/content/arangodb/oem/aql/functions/date.md b/site/content/arangodb/oem/aql/functions/date.md
new file mode 100644
index 0000000000..272e384e0b
--- /dev/null
+++ b/site/content/arangodb/oem/aql/functions/date.md
@@ -0,0 +1,1335 @@
+---
+title: Date functions in AQL
+menuTitle: Date
+weight: 20
+description: >-
+  AQL includes functions to work with dates as numeric timestamps and as
+  ISO 8601 date time strings
+---
+## Date and time representations
+
+AQL offers functionality to work with dates, but it does not have a special data type
+for dates (neither does JSON, which is usually used as the format to ship data into
+and out of ArangoDB). Instead, dates in AQL are represented by either numbers or strings.
+
+All date function operations are done in the *Unix time* system. Unix time counts
+all non-leap seconds beginning with January 1st 1970 00:00:00.000 UTC, also known as
+the Unix epoch. A point in time is called a timestamp. A timestamp has the same value
+at every point on earth. The date functions use millisecond precision for timestamps.
+
+Time unit definitions:
+
+- **millisecond**: 1/1000 of a second
+- **second**: one [SI second](https://www.bipm.org/en/si-base-units/second)
+- **minute**: one minute is defined as 60 seconds
+- **hour**: one hour is defined as 60 minutes
+- **day**: one day is defined as 24 hours
+- **week**: one week is defined as 7 days
+- **month**: one month is defined as 1/12 of a year
+- **year**: one year is defined as 365.2425 days
+
+All functions that require dates as arguments accept the following input values:
+
+- **numeric timestamps**, millisecond precision.
+
+  An example timestamp value is `1399472349522`, which translates to
+  `2014-05-07T14:19:09.522Z`.
+
+  Valid range: `-62167219200000` .. `253402300799999` (inclusive)
+
+- **date time strings** in [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) format:
+  - `YYYY-MM-DDTHH:MM:SS.MMM`
+  - `YYYY-MM-DD HH:MM:SS.MMM`
+  - `YYYY-MM-DD`
+
+  Milliseconds (`.MMM`) are always optional. Two digits for the hours (`HH`),
+  minutes (`MM`) and seconds (`SS`) are mandatory, i.e. zero-padding is required
+  for the values 0 through 9 (e.g. `05` instead of `5`). Leading zeroes for the
+  year (`YYYY`), month (`MM`) and day (`DD`) can be left out, but this is discouraged.
+
+  A time offset may optionally be added at the end of the string, with the
+  hours and minutes that need to be added to or subtracted from the date time value.
+  For example, `2014-05-07T14:19:09+01:00` can be used to specify a one hour offset,
+  and `2014-05-07T14:19:09+07:30` for an offset of seven and a half hours.
+  Negative offsets are also possible. Instead of an offset, a `Z` can be used
+  to indicate UTC / Zulu time. An example value is `2014-05-07T14:19:09.522Z`
+  meaning May 7th 2014, 14:19:09 and 522 milliseconds, UTC / Zulu time.
+  Another example value without time component is `2014-05-07Z`.
+
+  Valid range: `"0000-01-01T00:00:00.000Z"` .. `"9999-12-31T23:59:59.999Z"` (inclusive)
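+
+A date time string with a time offset refers to the same instant as its UTC
+equivalent. For illustration, both of the following calls should convert to the
+same timestamp:
+
+```aql
+DATE_TIMESTAMP("2014-05-07T14:19:09.522Z")      // 1399472349522
+DATE_TIMESTAMP("2014-05-07T15:19:09.522+01:00") // 1399472349522 (same instant)
+```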
+
+Any date/time values outside the valid range that are passed into an AQL date
+function make the function return `null` and trigger a warning for the query,
+which can optionally be escalated to an error and abort the query. This also
+applies to operations which produce an invalid value.
+
+```aql
+DATE_HOUR( 2 * 60 * 60 * 1000 ) // 2
+DATE_HOUR("1970-01-01T02:00:00") // 2
+```
+
+You are free to store age determinations of specimens, incomplete or fuzzy dates and
+the like in different, more appropriate ways of course. AQL's date functions are
+most certainly not of any help for such dates, but you can still use language
+constructs like [SORT](../high-level-operations/sort.md) (which also supports sorting of arrays)
+and [indexes](../../index-and-search/indexing/_index.md).
+
+## Current date and time
+
+### DATE_NOW()
+
+`DATE_NOW() → timestamp`
+
+Get the current Unix time as a numeric timestamp.
+
+- returns **timestamp** (number): the current Unix time as a timestamp.
+  The return value has millisecond precision. To convert the return value to
+  seconds, divide it by 1000.
+
+Note that this function is evaluated on every invocation and may return
+different values when invoked multiple times in the same query. Assign it
+to a variable to use the exact same timestamp multiple times.
+
+## Conversion
+
+`DATE_TIMESTAMP()` and `DATE_ISO8601()` can be used to convert ISO 8601 date time
+strings to numeric timestamps and numeric timestamps to ISO 8601 date time strings.
+
+Both also support individual date components as separate function arguments,
+in the following order:
+
+- year
+- month
+- day
+- hour
+- minute
+- second
+- millisecond
+
+All components following the *day* are optional and can be omitted. Note that no
+time offset can be specified when using separate date components, and UTC /
+Zulu time is used.
+
+The following calls to `DATE_TIMESTAMP()` are equivalent and all return
+`1399472349522`:
+
+```aql
+DATE_TIMESTAMP("2014-05-07T14:19:09.522")
+DATE_TIMESTAMP("2014-05-07T14:19:09.522Z")
+DATE_TIMESTAMP("2014-05-07 14:19:09.522")
+DATE_TIMESTAMP("2014-05-07 14:19:09.522Z")
+DATE_TIMESTAMP(2014, 5, 7, 14, 19, 9, 522)
+DATE_TIMESTAMP(1399472349522)
+```
+
+The same is true for calls to `DATE_ISO8601()`, which also accepts variable input
+formats:
+
+```aql
+DATE_ISO8601("2014-05-07T14:19:09.522Z")
+DATE_ISO8601("2014-05-07 14:19:09.522Z")
+DATE_ISO8601(2014, 5, 7, 14, 19, 9, 522)
+DATE_ISO8601(1399472349522)
+```
+
+The above calls are all equivalent and return `"2014-05-07T14:19:09.522Z"`.
+
+### DATE_ISO8601()
+
+`DATE_ISO8601(date) → dateString`
+
+Return an ISO 8601 date time string from `date`.
+The date time string always uses UTC / Zulu time, indicated by the `Z` at its end.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **dateString**: date and time expressed according to ISO 8601, in Zulu time
+
+---
+
+`DATE_ISO8601(year, month, day, hour, minute, second, millisecond) → dateString`
+
+Return an ISO 8601 date time string from `date`, but allows you to specify the
+individual date components separately. All parameters after `day` are optional.
+
+- **year** (number): typically in the range 0..9999, e.g. `2017`
+- **month** (number): 1..12 for January through December
+- **day** (number): 1..31 (upper bound depends on number of days in month)
+- **hour** (number, *optional*): 0..23
+- **minute** (number, *optional*): 0..59
+- **second** (number, *optional*): 0..59
+- **millisecond** (number, *optional*): 0..999
+- returns **dateString**: date and time expressed according to ISO 8601, in Zulu time
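+
+For example, building a date time string from separate components (a small
+illustration; seconds and milliseconds default to zero when omitted):
+
+```aql
+RETURN DATE_ISO8601(2023, 7, 15, 12, 30) // "2023-07-15T12:30:00.000Z"
+```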
+
+### DATE_TIMESTAMP()
+
+`DATE_TIMESTAMP(date) → timestamp`
+
+Create a timestamp value from `date`. The return value has millisecond precision.
+To convert the return value to seconds, divide it by 1000.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **timestamp** (number): numeric timestamp
+
+---
+
+`DATE_TIMESTAMP(year, month, day, hour, minute, second, millisecond) → timestamp`
+
+Create a timestamp value, but allows you to specify the individual date components
+separately. All parameters after `day` are optional.
+
+- **year** (number): typically in the range 0..9999, e.g. `2017`
+- **month** (number): 1..12 for January through December
+- **day** (number): 1..31 (upper bound depends on number of days in month)
+- **hour** (number, *optional*): 0..23
+- **minute** (number, *optional*): 0..59
+- **second** (number, *optional*): 0..59
+- **millisecond** (number, *optional*): 0..999
+- returns **timestamp** (number): numeric timestamp
+
+Negative values are not allowed, result in `null` and cause a warning.
+Values greater than the upper range bound overflow to the larger components
+(e.g. an hour of 26 is automatically turned into an additional day and two hours):
+
+```aql
+DATE_TIMESTAMP(2016, 12, -1) // returns null and issues a warning
+DATE_TIMESTAMP(2016, 2, 32) // returns 1456963200000, which is March 3rd, 2016
+DATE_TIMESTAMP(1970, 1, 1, 26) // returns 93600000, which is January 2nd, 1970, at 2 a.m.
+```
+
+### IS_DATESTRING()
+
+`IS_DATESTRING(value) → bool`
+
+Check if an arbitrary string is suitable for interpretation as a date time string.
+
+- **value** (string): an arbitrary string
+- returns **bool** (bool): `true` if `value` is a string that can be used
+  in a date function. This includes partial dates such as `2015` or `2015-10` and
+  strings containing invalid dates such as `2015-02-31`. The function returns
+  `false` for all non-string values, even if some of them may be usable in date
+  functions.
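+
+For instance (a small illustration):
+
+```aql
+IS_DATESTRING("2015-02-31")  // true (valid format, although not an actual date)
+IS_DATESTRING("foo")         // false
+IS_DATESTRING(1399472349522) // false (a number, not a string)
+```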
+
+## Processing
+
+### DATE_DAYOFWEEK()
+
+`DATE_DAYOFWEEK(date) → weekdayNumber`
+
+Return the weekday number of `date`.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **weekdayNumber** (number): 0..6 as follows:
+  - `0` – Sunday
+  - `1` – Monday
+  - `2` – Tuesday
+  - `3` – Wednesday
+  - `4` – Thursday
+  - `5` – Friday
+  - `6` – Saturday
+
+**Examples**
+
+```aql
+---
+name: datedyofwk1
+description: |
+  The 29th of August in 2020 was a Saturday:
+---
+RETURN DATE_DAYOFWEEK("2020-08-29")
+```
+
+```aql
+---
+name: datedyofwk2
+description: |
+  The Unix epoch began on the 1st of January 1970, which was a Thursday:
+---
+RETURN DATE_DAYOFWEEK(0)
+```
+
+### DATE_YEAR()
+
+`DATE_YEAR(date) → year`
+
+Return the year of `date`.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **year** (number): the year part of `date` as a number
+
+**Examples**
+
+```aql
+---
+name: dateyr1
+description: |
+  Extract the year from a date time string:
+---
+RETURN DATE_YEAR("2020-08-29")
+```
+
+```aql
+---
+name: dateyr2
+description: |
+  Extract the year from a Unix timestamp:
+---
+RETURN DATE_YEAR(0)
+```
+
+### DATE_MONTH()
+
+`DATE_MONTH(date) → month`
+
+Return the month of `date`.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **month** (number): the month part of `date` as a number
+
+**Examples**
+
+```aql
+---
+name: datemn1
+description: |
+  Extract the month from a date time string:
+---
+RETURN DATE_MONTH("2020-08-29")
+```
+
+```aql
+---
+name: datemn2
+description: |
+  Extract the month from a Unix timestamp:
+---
+RETURN DATE_MONTH(0)
+```
+
+### DATE_DAY()
+
+`DATE_DAY(date) → day`
+
+Return the day of `date`.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **day** (number): the day part of `date` as a number
+
+**Examples**
+
+```aql
+---
+name: datedy1
+description: |
+  Extract the day from a date time string:
+---
+RETURN DATE_DAY("2020-08-29")
+```
+
+```aql
+---
+name: datedy2
+description: |
+  Extract the day from a Unix timestamp:
+---
+RETURN DATE_DAY(0)
+```
+
+### DATE_HOUR()
+
+`DATE_HOUR(date) → hour`
+
+Return the hour of `date`.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **hour** (number): the hour part of `date` as a number
+
+**Examples**
+
+```aql
+---
+name: datehr1
+description: |
+  Extract the hour of a date time string:
+---
+RETURN DATE_HOUR("2020-08-29T16:30:05.123")
+```
+
+```aql
+---
+name: datehr2
+description: |
+  Extract the hour of a Unix timestamp:
+---
+RETURN DATE_HOUR(14400000)
+```
+
+### DATE_MINUTE()
+
+`DATE_MINUTE(date) → minute`
+
+Return the minute of `date`.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **minute** (number): the minute part of `date` as a number
+
+**Examples**
+
+```aql
+---
+name: datemin1
+description: |
+  Extract the minute of a date time string:
+---
+RETURN DATE_MINUTE("2020-08-29T16:30:05.123")
+```
+
+```aql
+---
+name: datemin2
+description: |
+  Extract the minute of a Unix timestamp:
+---
+RETURN DATE_MINUTE(2520000)
+```
+
+### DATE_SECOND()
+
+`DATE_SECOND(date) → second`
+
+Return the second of `date`.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **second** (number): the seconds part of `date` as a number
+
+**Examples**
+
+```aql
+---
+name: datesec1
+description: |
+  Extract the second of a date time string:
+---
+RETURN DATE_SECOND("2020-08-29T16:30:05.123")
+```
+
+```aql
+---
+name: datesec2
+description: |
+  Extract the second of a Unix timestamp:
+---
+RETURN DATE_SECOND(1234567890)
+```
+
+### DATE_MILLISECOND()
+
+`DATE_MILLISECOND(date) → millisecond`
+
+Return the millisecond of `date`.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **millisecond** (number): the milliseconds part of `date` as a number
+
+**Examples**
+
+```aql
+---
+name: datemilsec1
+description: |
+  Extract the millisecond of a date time string:
+---
+RETURN DATE_MILLISECOND("2020-08-29T16:30:05.123")
+```
+
+```aql
+---
+name: datemilsec2
+description: |
+  Extract the millisecond of a Unix timestamp:
+---
+RETURN DATE_MILLISECOND(1234567890)
+```
+
+### DATE_DAYOFYEAR()
+
+`DATE_DAYOFYEAR(date) → dayOfYear`
+
+Return the day of year of `date`.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **dayOfYear** (number): the day of year number of `date`.
+  The return values range from 1 to 365 (or 366 in a leap year).
+
+**Examples**
+
+```aql
+---
+name: datedyofyr1
+description: |
+  Extract the day of year from a date time string:
+---
+RETURN DATE_DAYOFYEAR("2020-08-29")
+```
+
+```aql
+---
+name: datedyofyr2
+description: |
+  Extract the day of year from a Unix timestamp:
+---
+RETURN DATE_DAYOFYEAR(86400000)
+```
+
+### DATE_ISOWEEK()
+
+`DATE_ISOWEEK(date) → weekDate`
+
+Return the week number in the year of `date` according to ISO 8601.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **weekDate** (number): the ISO week number of `date`. The return values
+  range from 1 to 53. Monday is considered the first day of the week. There are no
+  fractional weeks, thus the last days in December may belong to the first week of
+  the next year, and the first days in January may be part of the previous year's
+  last week.
+
+**Examples**
+
+```aql
+---
+name: dateisofwk1
+description: |
+  Determine the week number from a date time string:
+---
+RETURN DATE_ISOWEEK("2020-08-29")
+```
+
+```aql
+---
+name: dateisofwk2
+description: |
+  Determine the week number from a Unix timestamp:
+---
+RETURN DATE_ISOWEEK(1234567890)
+```
+
+### DATE_ISOWEEKYEAR()
+
+`DATE_ISOWEEKYEAR(date) → weekAndYear`
+
+Return the week number of `date` according to ISO 8601 and the year the
+week belongs to.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **weekAndYear** (object): an object with two attributes
+  - **week** (number): the ISO week number of `date`. The values range from 1 to 53.
+    Monday is considered the first day of the week. There are no fractional weeks,
+    thus the last days in December may belong to the first week of the next year,
+    and the first days in January may be part of the previous year's last week.
+  - **year** (number): the year that the ISO week number belongs to
+
+**Examples**
+
+```aql
+---
+name: aqlDateIsoWeekYear1
+description: |
+  January 1st of 2023 is part of the previous year's last week:
+---
+RETURN DATE_ISOWEEKYEAR("2023-01-01")
+```
+
+```aql
+---
+name: aqlDateIsoWeekYear2
+description: |
+  The last two days of 2019 are part of the next year's first week:
+---
+RETURN DATE_ISOWEEKYEAR("2019-12-30")
+```
+
+### DATE_LEAPYEAR()
+
+`DATE_LEAPYEAR(date) → leapYear`
+
+Return whether `date` is in a leap year.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **leapYear** (bool): `true` if `date` is in a leap year, `false` otherwise
+
+**Examples**
+
+```aql
+---
+name: datelpyr1
+description: |
+  2020 was a leap year:
+---
+RETURN DATE_LEAPYEAR("2020-01-01")
+```
+
+```aql
+---
+name: datelpyr2
+description: |
+  2021 was not a leap year:
+---
+RETURN DATE_LEAPYEAR("2021-01-01")
+```
+
+### DATE_QUARTER()
+
+`DATE_QUARTER(date) → quarter`
+
+Return which quarter `date` belongs to.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **quarter** (number): the quarter of the given date (1-based):
+  - `1` – January, February, March
+  - `2` – April, May, June
+  - `3` – July, August, September
+  - `4` – October, November, December
+
+**Examples**
+
+```aql
+---
+name: dateqtr1
+description: |
+  Determine the quarter of a date time string:
+---
+RETURN DATE_QUARTER("2020-08-29")
+```
+
+### DATE_DAYS_IN_MONTH()
+
+`DATE_DAYS_IN_MONTH(date) → daysInMonth`
+
+Return the number of days in the month of `date`.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- returns **daysInMonth** (number): the number of days in `date`'s month (28..31)
+
+**Examples**
+
+```aql
+---
+name: datedysmn1
+description: |
+  Determine the number of days in August using a date time string:
+---
+RETURN DATE_DAYS_IN_MONTH("2020-08-01")
+```
+
+```aql
+---
+name: datedysmn2
+description: |
+  Determine the number of days in September using a date time string:
+---
+RETURN DATE_DAYS_IN_MONTH("2020-09-01")
+```
+
+```aql
+---
+name: datedysmn3
+description: |
+  Determine the number of days in February in a leap year using a date time string:
+---
+RETURN DATE_DAYS_IN_MONTH("2020-02-01")
+```
+
+```aql
+---
+name: datedysmn4
+description: |
+  Determine the number of days in February in a non-leap year using a date time string:
+---
+RETURN DATE_DAYS_IN_MONTH("2021-02-01")
+```
+
+```aql
+---
+name: datedysmn5
+description: |
+  Determine the number of days in the month using a Unix timestamp:
+---
+RETURN DATE_DAYS_IN_MONTH(3045600000)
+```
+
+### DATE_TRUNC()
+
+`DATE_TRUNC(date, unit) → isoDate`
+
+Truncates the given date after `unit` and returns the modified date.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- **unit** (string): either of the following to specify the time unit (case-insensitive):
+  - `"y"`, `"year"`, `"years"`
+  - `"m"`, `"month"`, `"months"`
+  - `"d"`, `"day"`, `"days"`
+  - `"h"`, `"hour"`, `"hours"`
+  - `"i"`, `"minute"`, `"minutes"`
+  - `"s"`, `"second"`, `"seconds"`
+  - `"f"`, `"millisecond"`, `"milliseconds"`
+- returns **isoDate** (string): the truncated ISO 8601 date time string
+
+**Examples**
+
+```aql
+DATE_TRUNC('2017-02-03', 'month') // 2017-02-01T00:00:00.000Z
+DATE_TRUNC('2017-02-03 04:05:06', 'hours') // 2017-02-03T04:00:00.000Z
+DATE_TRUNC('2023-03-25 23:00:00', 'day') // 2023-03-25T00:00:00.000Z
+```
+
+```aql
+---
+name: dateTruncGroup
+description: |
+  Truncate date time strings comprised of a year, month, and day to the year and
+  group another attribute by it:
+bindVars:
+  {
+    "data": [
+      { "date": "2018-03-05", "value": "Spring" },
+      { "date": "2018-07-11", "value": "Summer" },
+      { "date": "2018-10-26", "value": "Autumn" },
+      { "date": "2019-01-09", "value": "Winter" },
+      { "date": "2019-04-02", "value": "Spring" }
+    ]
+  }
+---
+RETURN MERGE(
+  FOR doc IN @data
+    COLLECT q = DATE_TRUNC(doc.date, "year") INTO bucket
+    RETURN { [DATE_YEAR(q)]: bucket[*].doc.value }
+)
+```
+
+### DATE_ROUND()
+
+`DATE_ROUND(date, amount, unit) → isoDate`
+
+Bin a date/time into a set of equal-distance buckets, to be used for
+grouping.
+
+- **date** (string\|number): a date string or timestamp
+- **amount** (number): number of `unit`s. Must be a positive integer value.
+- **unit** (string): either of the following to specify the time unit (case-insensitive): + - `"d"`, `"day"`, `"days"` + - `"h"`, `"hour"`, `"hours"` + - `"i"`, `"minute"`, `"minutes"` + - `"s"`, `"second"`, `"seconds"` + - `"f"`, `"millisecond"`, `"milliseconds"` +- returns **isoDate** (string): the rounded ISO 8601 date time string + +**Examples** + +```aql +DATE_ROUND('2000-04-28T11:11:11.111Z', 1, 'day') // 2000-04-28T00:00:00.000Z +DATE_ROUND('2000-04-10T11:39:29Z', 15, 'minutes') // 2000-04-10T11:30:00.000Z +DATE_ROUND('2023-03-25T23:55:55.555Z', 1, 'day') // 2023-03-25T00:00:00.000Z +``` + +```aql +--- +name: dateRoundAggregate +description: | + Round full date time strings to 5 minutes and aggregate temperature readings + by these time buckets: + +bindVars: + { + "sensorData": [ + { "timestamp": "2019-12-04T21:17:52.583Z", "temp": 20.6 }, + { "timestamp": "2019-12-04T21:19:53.516Z", "temp": 20.2 }, + { "timestamp": "2019-12-04T21:21:53.610Z", "temp": 19.9 }, + { "timestamp": "2019-12-04T21:23:52.522Z", "temp": 19.8 }, + { "timestamp": "2019-12-04T21:25:52.988Z", "temp": 19.8 }, + { "timestamp": "2019-12-04T21:27:54.005Z", "temp": 19.7 } + ] + } +--- +FOR doc IN @sensorData + COLLECT + date = DATE_ROUND(doc.timestamp, 5, "minutes") + AGGREGATE + count = COUNT(1), + avg = AVG(doc.temp), + min = MIN(doc.temp), + max = MAX(doc.temp) + RETURN { date, count, avg, min, max } +``` + +### DATE_FORMAT() + +`DATE_FORMAT(date, format) → str` + +Format a date according to the given format string. + +- **date** (string\|number): a date string or timestamp +- **format** (string): a format string, see below +- returns **str** (string): a formatted date string + +The `format` parameter supports the following placeholders (case-insensitive): + +- `%t` – timestamp, in milliseconds since midnight 1970-01-01 +- `%z` – ISO date (0000-00-00T00:00:00.000Z) +- `%w` – day of week (0..6) +- `%y` – year (0..9999) +- `%yy` – year (00..99), abbreviated (last two digits) +- `%yyyy` – year (0000..9999), padded to length of 4 +- `%yyyyyy` – year (-009999 .. +009999), with sign prefix and padded to length of 6 +- `%m` – month (1..12) +- `%mm` – month (01..12), padded to length of 2 +- `%d` – day (1..31) +- `%dd` – day (01..31), padded to length of 2 +- `%h` – hour (0..23) +- `%hh` – hour (00..23), padded to length of 2 +- `%i` – minute (0..59) +- `%ii` – minute (00..59), padded to length of 2 +- `%s` – second (0..59) +- `%ss` – second (00..59), padded to length of 2 +- `%f` – millisecond (0..999) +- `%fff` – millisecond (000..999), padded to length of 3 +- `%x` – day of year (1..366) +- `%xxx` – day of year (001..366), padded to length of 3 +- `%k` – ISO week number of year (1..53) +- `%kk` – ISO week number of year (01..53), padded to length of 2 +- `%l` – leap year (0 or 1) +- `%q` – quarter (1..4) +- `%a` – days in month (28..31) +- `%mmm` – abbreviated English name of month (Jan..Dec) +- `%mmmm` – English name of month (January..December) +- `%www` – abbreviated English name of weekday (Sun..Sat) +- `%wwww` – English name of weekday (Sunday..Saturday) +- `%&` – special escape sequence for rare occasions +- `%%` – literal % +- `%` – ignored + +`%yyyy` does not enforce a length of 4 for years before 0 and past 9999. +The same format as for `%yyyyyy` is used instead. `%yy` preserves the +sign for negative years and may thus return 3 characters in total. + +Single `%` characters are ignored. Use `%%` for a literal `%`. 
+To resolve ambiguities like in `%mmonth` (unpadded month number + the string `month`)
+between `%mm` + `onth` and `%m` + `month`, use the escape sequence `%&`:
+`%m%&month`.
+
+Note that `DATE_FORMAT()` is a rather costly operation and may not be suitable for large
+datasets (like over 1 million dates). If possible, avoid formatting dates on
+the server side and leave it up to the client to do so. This function should only
+be used for special date comparisons or to store the formatted dates in the
+database. For better performance, use the primitive `DATE_*()` functions
+together with `CONCAT()` if possible (see the sketch after the examples below).
+
+**Examples**
+
+```aql
+DATE_FORMAT(DATE_NOW(), "%q/%yyyy") // quarter and year (e.g. "3/2015")
+DATE_FORMAT(DATE_NOW(), "%dd.%mm.%yyyy %hh:%ii:%ss,%fff") // e.g. "18.09.2015 15:30:49,374"
+DATE_FORMAT("1969", "Summer of '%yy") // "Summer of '69"
+DATE_FORMAT("2016", "%%l = %l") // "%l = 1" (2016 is a leap year)
+DATE_FORMAT("2016-03-01", "%xxx%") // "061", trailing % ignored
+```
+
+```aql
+---
+name: dateFormat
+description: |
+  Example calls of the formatting function and their results:
+bindVars:
+  {
+    "formats": [
+      { "date": "2023-03-25T23:00:00.000Z", "placeholder": "%w", "equalTo": "DATE_DAYOFWEEK" },
+      { "date": "2023-12-31T23:00:00.000Z", "placeholder": "%yyyy", "equalTo": "DATE_YEAR" },
+      { "date": "2023-12-31T23:00:00.000Z", "placeholder": "%m", "equalTo": "DATE_MONTH" },
+      { "date": "2023-12-31T23:00:00.000Z", "placeholder": "%d", "equalTo": "DATE_DAY" },
+      { "date": "2023-12-31T23:00:00.000Z", "placeholder": "%h", "equalTo": "DATE_HOUR" },
+      { "date": "2023-12-31T23:00:00.000Z", "placeholder": "%i", "equalTo": "DATE_MINUTE" },
+      { "date": "2023-12-31T23:00:23.000Z", "placeholder": "%s", "equalTo": "DATE_SECOND" },
+      { "date": "2023-12-31T23:00:00.031Z", "placeholder": "%f", "equalTo": "DATE_MILLISECOND" },
+      { "date": "2023-12-31T23:00:00.000Z", "placeholder": "%x", "equalTo": "DATE_DAYOFYEAR" },
+      { "date": "2023-12-31T23:00:00.000Z", "placeholder": "%k", "equalTo": "DATE_ISOWEEK" },
+      { "date": "2016-12-31T23:00:00.000Z", "placeholder": "%l", "equalTo": "DATE_LEAPYEAR" },
+      { "date": "2023-12-31T23:00:00.000Z", "placeholder": "%q", "equalTo": "DATE_QUARTER" },
+      { "date": "2023-11-30T23:00:00.000Z", "placeholder": "%a", "equalTo": "DATE_DAYS_IN_MONTH" },
+      { "date": "2023-11-30T23:00:00.000Z", "placeholder": "%t", "equalTo": "DATE_TIMESTAMP" }
+    ]
+  }
+---
+FOR format IN @formats
+  RETURN CONCAT(
+    format.equalTo,
+    "('",
+    format.date,
+    "') = ",
+    DATE_FORMAT(format.date, format.placeholder)
+  )
+```
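+
+A minimal sketch of that alternative, composing primitive date functions with
+`CONCAT()` instead of calling `DATE_FORMAT()` (note that the components are not
+zero-padded this way):
+
+```aql
+LET d = "2015-09-18T15:30:49.374Z"
+RETURN CONCAT(DATE_YEAR(d), "-", DATE_MONTH(d)) // "2015-9"
+```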
+
+## Comparison and calculation
+
+### DATE_ADD()
+
+`DATE_ADD(date, amount, unit) → isoDate`
+
+Add `amount` given in `unit` to `date` and return the calculated date.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- **amount** (number\|string): number of `unit`s to add (positive value) or
+  subtract (negative value). It is recommended to use positive values only,
+  and use [`DATE_SUBTRACT()`](#date_subtract) for subtractions instead.
+- **unit** (string): either of the following to specify the time unit to add or
+  subtract (case-insensitive):
+  - `"y"`, `"year"`, `"years"`
+  - `"m"`, `"month"`, `"months"`
+  - `"w"`, `"week"`, `"weeks"`
+  - `"d"`, `"day"`, `"days"`
+  - `"h"`, `"hour"`, `"hours"`
+  - `"i"`, `"minute"`, `"minutes"`
+  - `"s"`, `"second"`, `"seconds"`
+  - `"f"`, `"millisecond"`, `"milliseconds"`
+- returns **isoDate** (string): the calculated ISO 8601 date time string
+
+```aql
+DATE_ADD(DATE_NOW(), -1, "day") // yesterday; also see DATE_SUBTRACT()
+DATE_ADD(DATE_NOW(), 3, "months") // in three months
+DATE_ADD(DATE_ADD("2015-04-01", 5, "years"), 1, "month") // May 1st 2020
+DATE_ADD("2015-04-01", 12*5 + 1, "months") // also May 1st 2020
+DATE_ADD(DATE_TIMESTAMP(DATE_YEAR(DATE_NOW()), 12, 24), -4, "years") // Christmas four years ago
+DATE_ADD(DATE_ADD("2016-02", 1, "month"), -1, "day") // last day of February (29th, because 2016 is a leap year!)
+```
+
+---
+
+`DATE_ADD(date, isoDuration) → isoDate`
+
+You may also pass an ISO duration string as `amount` and leave out `unit`.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- **isoDuration** (string): an ISO 8601 duration string to add to `date`, see below
+- returns **isoDate** (string): the calculated ISO 8601 date time string
+
+The format is `P_Y_M_W_DT_H_M_._S`, where underscores stand for digits and
+letters for time intervals - except for the separators `P` (period) and `T` (time).
+The meanings of the other letters are:
+- `Y` – years
+- `M` – months (if before T)
+- `W` – weeks
+- `D` – days
+- `H` – hours
+- `M` – minutes (if after T)
+- `S` – seconds (optionally with 3 decimal places for milliseconds)
+
+The string must be prefixed by a `P`. A separating `T` is only required if
+`H`, `M` and/or `S` are specified. You only need to specify the needed pairs
+of letters and numbers.
+
+```aql
+DATE_ADD(DATE_NOW(), "P1Y") // add 1 year
+DATE_ADD(DATE_NOW(), "P3M2W") // add 3 months and 2 weeks
+DATE_ADD(DATE_NOW(), "P5DT26H") // add 5 days and 26 hours (=6 days and 2 hours)
+DATE_ADD("2000-01-01", "PT4H") // add 4 hours
+DATE_ADD("2000-01-01", "PT30M44.4S") // add 30 minutes, 44 seconds and 400 ms
+DATE_ADD("2000-01-01", "P1Y2M3W4DT5H6M7.89S") // add a bit of everything
+```
+
+### DATE_SUBTRACT()
+
+`DATE_SUBTRACT(date, amount, unit) → isoDate`
+
+Subtract `amount` given in `unit` from `date` and return the calculated date.
+
+It works the same as [`DATE_ADD()`](#date_add), except that it subtracts. It is
+equivalent to calling `DATE_ADD()` with a negative amount, except that
+`DATE_SUBTRACT()` can also subtract ISO durations. Note that negative ISO
+durations are not supported (i.e. starting with `-P`, like `-P1Y`).
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- **amount** (number\|string): number of `unit`s to subtract (positive value) or
+  add (negative value). It is recommended to use positive values only,
+  and use [`DATE_ADD()`](#date_add) for additions instead.
+- **unit** (string): either of the following to specify the time unit to add or
+  subtract (case-insensitive):
+  - `"y"`, `"year"`, `"years"`
+  - `"m"`, `"month"`, `"months"`
+  - `"w"`, `"week"`, `"weeks"`
+  - `"d"`, `"day"`, `"days"`
+  - `"h"`, `"hour"`, `"hours"`
+  - `"i"`, `"minute"`, `"minutes"`
+  - `"s"`, `"second"`, `"seconds"`
+  - `"f"`, `"millisecond"`, `"milliseconds"`
+- returns **isoDate** (string): the calculated ISO 8601 date time string
+
+---
+
+`DATE_SUBTRACT(date, isoDuration) → isoDate`
+
+You may also pass an ISO duration string as `amount` and leave out `unit`.
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- **isoDuration** (string): an ISO 8601 duration string to subtract from `date`,
+  see below
+- returns **isoDate** (string): the calculated ISO 8601 date time string
+
+The format is `P_Y_M_W_DT_H_M_._S`, where underscores stand for digits and
+letters for time intervals - except for the separators `P` (period) and `T` (time).
+The meanings of the other letters are:
+- `Y` – years
+- `M` – months (if before T)
+- `W` – weeks
+- `D` – days
+- `H` – hours
+- `M` – minutes (if after T)
+- `S` – seconds (optionally with 3 decimal places for milliseconds)
+
+The string must be prefixed by a `P`. A separating `T` is only required if
+`H`, `M` and/or `S` are specified. You only need to specify the needed pairs
+of letters and numbers.
+
+```aql
+DATE_SUBTRACT(DATE_NOW(), 1, "day") // yesterday
+DATE_SUBTRACT(DATE_TIMESTAMP(DATE_YEAR(DATE_NOW()), 12, 24), 4, "years") // Christmas four years ago
+DATE_SUBTRACT(DATE_ADD("2016-02", 1, "month"), 1, "day") // last day of February (29th, because 2016 is a leap year!)
+DATE_SUBTRACT(DATE_NOW(), "P4D") // four days ago
+DATE_SUBTRACT(DATE_NOW(), "PT1H3M") // 1 hour and 3 minutes ago
+```
+
+### DATE_DIFF()
+
+`DATE_DIFF(date1, date2, unit, asFloat) → diff`
+
+Calculate the difference between two dates in given time `unit`, optionally
+with decimal places.
+
+- **date1** (number\|string): numeric timestamp or ISO 8601 date time string
+- **date2** (number\|string): numeric timestamp or ISO 8601 date time string
+- **unit** (string): either of the following to specify the time unit to return the
+  difference in (case-insensitive):
+  - `"y"`, `"year"`, `"years"`
+  - `"m"`, `"month"`, `"months"`
+  - `"w"`, `"week"`, `"weeks"`
+  - `"d"`, `"day"`, `"days"`
+  - `"h"`, `"hour"`, `"hours"`
+  - `"i"`, `"minute"`, `"minutes"`
+  - `"s"`, `"second"`, `"seconds"`
+  - `"f"`, `"millisecond"`, `"milliseconds"`
+- **asFloat** (boolean, *optional*): if set to `true`, decimal places are
+  preserved in the result. The default is `false` and an integer is returned.
+- returns **diff** (number): the calculated difference as number in `unit`.
+  The value is negative if `date2` is before `date1`.
+
+```aql
+---
+name: datediff1
+description: |
+  Determine how many days it is from the 1st of December 2023 until April Fools' Day 2024:
+---
+RETURN DATE_DIFF("2023-12-01", "2024-04-01", "days")
+```
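+
+With `asFloat` enabled, partial units are preserved; for example, a difference
+of twelve hours is half a day (a small illustration):
+
+```aql
+RETURN DATE_DIFF("2023-01-01", "2023-01-01T12:00:00", "days", true) // 0.5
+```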
+
+### DATE_COMPARE()
+
+`DATE_COMPARE(date1, date2, unitRangeStart, unitRangeEnd) → bool`
+
+Check if two partial dates match.
+
+- **date1** (number\|string): numeric timestamp or ISO 8601 date time string
+- **date2** (number\|string): numeric timestamp or ISO 8601 date time string
+- **unitRangeStart** (string): unit to start from, see below
+- **unitRangeEnd** (string, *optional*): unit to end with, leave out to only
+  compare the component as specified by `unitRangeStart`. An error is raised if
+  `unitRangeEnd` is a unit before `unitRangeStart`.
+- returns **bool** (bool): `true` if the dates match, `false` otherwise
+
+The parts to compare are defined by a range of time units. The full range is:
+years, months, days, hours, minutes, seconds, milliseconds (in this order).
+
+All components of `date1` and `date2` as specified by the range are compared.
+You can refer to the units as:
+
+- `"y"`, `"year"`, `"years"`
+- `"m"`, `"month"`, `"months"`
+- `"d"`, `"day"`, `"days"`
+- `"h"`, `"hour"`, `"hours"`
+- `"i"`, `"minute"`, `"minutes"`
+- `"s"`, `"second"`, `"seconds"`
+- `"f"`, `"millisecond"`, `"milliseconds"`
+
+**Examples**
+
+```aql
+// Compare months and days, true on birthdays if you're born on 4th of April
+DATE_COMPARE("1985-04-04", DATE_NOW(), "months", "days")
+
+// Only matches on one day if the current year is a leap year!
+// You may want to add or subtract one day from date1 to match every year.
+DATE_COMPARE("1984-02-29", DATE_NOW(), "months", "days")
+
+// compare years, months and days (true, because it's the same day)
+DATE_COMPARE("2001-01-01T15:30:45.678Z", "2001-01-01T08:08:08.008Z", "years", "days")
+```
+
+You can directly compare ISO date **strings** if you want to find dates before or
+after a certain date, or in between two dates (`>=`, `>`, `<`, `<=`).
+No special date function is required. Equality tests (`==` and `!=`) only
+match the exact same date and time, however. You may use `SUBSTRING()` to
+compare partial date strings; `DATE_COMPARE()` is basically a convenience
+function for that. However, neither is really required to limit a search to a
+certain day as demonstrated here:
+
+```aql
+FOR doc IN coll
+  FILTER doc.date >= "2015-05-15" AND doc.date < "2015-05-16"
+  RETURN doc
+```
+
+Every ISO date on that day is greater than or equal to `2015-05-15` in a string
+comparison (e.g. `2015-05-15T11:30:00.000Z`). Dates before `2015-05-15` are smaller
+and therefore filtered out by the first condition. Every date from `2015-05-16`
+onward is greater than or equal to `2015-05-16` in a string comparison, and
+therefore filtered out by the second condition. The result is that the time
+components in the dates you compare with are "ignored". The query returns every
+document with `date` ranging from `2015-05-15T00:00:00.000Z` to
+`2015-05-15T23:59:59.999Z`. It would also include `2015-05-15T24:00:00.000Z`,
+but that date is actually `2015-05-16T00:00:00.000Z` and can only occur if
+inserted manually (you may want to pass dates through
+[`DATE_ISO8601()`](#date_iso8601) to ensure a correct date representation).
+
+Leap days in leap years (29th of February) must always be handled manually
+if you require this (e.g. for birthday checks):
+
+```aql
+LET today = DATE_NOW()
+LET noLeapYear = NOT DATE_LEAPYEAR(today)
+
+FOR user IN users
+  LET birthday = noLeapYear AND
+                 DATE_MONTH(user.birthday) == 2 AND
+                 DATE_DAY(user.birthday) == 29
+                 ? DATE_SUBTRACT(user.birthday, 1, "day") /* treat like 28th in non-leap years */
+                 : user.birthday
+  FILTER DATE_COMPARE(today, birthday, "month", "day")
+  /* includes leaplings on the 28th of February in non-leap years,
+   * but excludes them in leap years which do have a 29th February.
+   * Replace DATE_SUBTRACT() by DATE_ADD() to include them on the 1st of March
+   * in non-leap years instead (depends on local jurisdiction).
+   */
+  RETURN user
+```
+
+### DATE_UTCTOLOCAL()
+
+Introduced in: v3.8.0
+
+Converts `date` assumed in Zulu time (UTC) to local `timezone`.
+
+It takes historic daylight saving times into account.
+
+`DATE_UTCTOLOCAL(date, timezone, zoneinfo) → date`
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- **timezone** (string):
+  [IANA timezone name](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones),
+  e.g. `"America/New_York"`, `"Europe/Berlin"` or `"UTC"`.
+  Use `"America/Los_Angeles"` for Pacific time (PST/PDT).
+  Throws an error if the timezone is not known to ArangoDB.
+- **zoneinfo** (boolean, *optional*): if set to `true`, an object with timezone
+  information is returned. The default is `false` and a date string is returned
+- returns **date** (string\|object): an ISO 8601 date time string in
+  unqualified local time, or an object with the following attributes:
+  - **local** (string): ISO 8601 date time string in unqualified local time
+  - **tzdb** (string): version of the timezone database used (e.g. `"2020f"`)
+  - **zoneInfo** (object): timezone information
+    - **name** (string): timezone abbreviation (GMT, PST, CET, ...)
+    - **begin** (string\|null): begin of the timezone effect as UTC date time string
+    - **end** (string\|null): end of the timezone effect as UTC date time string
+    - **dst** (boolean): `true` when daylight saving time (DST) is active,
+      `false` otherwise
+    - **offset** (number): offset to UTC in seconds
+
+```aql
+---
+name: aqlDateTimeToLocal_1
+description: |
+  Convert a date time string to different local timezones:
+---
+RETURN [
+  DATE_UTCTOLOCAL("2020-03-15T00:00:00.000", "Europe/Berlin"),
+  DATE_UTCTOLOCAL("2020-03-15T00:00:00.000", "America/New_York"),
+  DATE_UTCTOLOCAL("2020-03-15T00:00:00.000", "UTC")
+]
+```
+
+```aql
+---
+name: aqlDateTimeToLocal_2
+description: |
+  Convert date time strings with and without UTC indicator (`Z`), with a timezone
+  offset, and a Unix timestamp to local time:
+---
+RETURN [
+  DATE_UTCTOLOCAL("2020-03-15T00:00:00.000", "Asia/Shanghai"),
+  DATE_UTCTOLOCAL("2020-03-15T00:00:00.000Z", "Asia/Shanghai"),
+  DATE_UTCTOLOCAL("2020-03-15T00:00:00.000-02:00", "Asia/Shanghai"),
+  DATE_UTCTOLOCAL(1584230400000, "Asia/Shanghai")
+]
+```
+
+```aql
+---
+name: aqlDateTimeToLocal_3
+description: |
+  Convert to local time and include timezone information:
+---
+RETURN DATE_UTCTOLOCAL(DATE_NOW(), "Africa/Lagos", true)
+```
+
+### DATE_LOCALTOUTC()
+
+Introduced in: v3.8.0
+
+Converts `date` assumed in local `timezone` to Zulu time (UTC).
+
+It takes historic daylight saving times into account.
+
+`DATE_LOCALTOUTC(date, timezone, zoneinfo) → date`
+
+- **date** (number\|string): numeric timestamp or ISO 8601 date time string
+- **timezone** (string):
+  [IANA timezone name](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones),
+  e.g. `"America/New_York"`, `"Europe/Berlin"` or `"UTC"`.
+  Use `"America/Los_Angeles"` for Pacific time (PST/PDT).
+  Throws an error if the timezone is not known to ArangoDB.
+- **zoneinfo** (boolean, *optional*): if set to `true`, an object with timezone
+  information is returned. The default is `false` and a date string is returned
+- returns **date** (string\|object): an ISO 8601 date time string in
+  Zulu time (UTC), or an object with the following attributes:
+  - **utc** (string): ISO 8601 date time string in Zulu time (UTC)
+  - **tzdb** (string): version of the timezone database used (e.g. `"2020f"`)
+  - **zoneInfo** (object): timezone information
+    - **name** (string): timezone abbreviation (GMT, PST, CET, ...)
+    - **begin** (string\|null): begin of the timezone effect as UTC date time string
+    - **end** (string\|null): end of the timezone effect as UTC date time string
+    - **dst** (boolean): `true` when daylight saving time (DST) is active,
+      `false` otherwise
+    - **offset** (number): offset to UTC in seconds
+
+```aql
+---
+name: aqlDateTimeToUTC_1
+description: |
+  Convert a date time string from different local timezones to UTC:
+---
+RETURN [
+  DATE_LOCALTOUTC("2020-03-15T00:00:00.000", "Europe/Berlin"),
+  DATE_LOCALTOUTC("2020-03-15T00:00:00.000", "America/New_York"),
+  DATE_LOCALTOUTC("2020-03-15T00:00:00.000", "UTC")
+]
+```
+
+```aql
+---
+name: aqlDateTimeToUTC_2
+description: |
+  Convert date time strings with and without UTC indicator (`Z`), with a timezone
+  offset, and a Unix timestamp to UTC time:
+---
+RETURN [
+  DATE_LOCALTOUTC("2020-03-15T00:00:00.000", "Asia/Shanghai"),
+  DATE_LOCALTOUTC("2020-03-15T00:00:00.000Z", "Asia/Shanghai"),
+  DATE_LOCALTOUTC("2020-03-15T00:00:00.000-02:00", "Asia/Shanghai"),
+  DATE_LOCALTOUTC(1584230400000, "Asia/Shanghai")
+]
+```
+
+```aql
+---
+name: aqlDateTimeToUTC_3
+description: |
+  Convert to UTC time and include timezone information:
+---
+RETURN DATE_LOCALTOUTC("2021-03-16T12:00:00.000", "Africa/Lagos", true)
+```
+
+### DATE_TIMEZONE()
+
+Introduced in: v3.8.0
+
+Returns the system timezone that ArangoDB is running on.
+
+For cloud servers, this is most likely `"Etc/UTC"`.
+
+`DATE_TIMEZONE() → timezone`
+
+- returns **timezone** (string):
+  [IANA timezone name](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
+  of the server timezone.
+
+### DATE_TIMEZONES()
+
+Introduced in: v3.8.0
+
+Returns all valid timezone names.
+
+`DATE_TIMEZONES() → timezones`
+
+- returns **timezones** (array): an array of
+  [IANA timezone names](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
+
+## Working with dates and indexes
+
+There are two recommended ways to store timestamps in ArangoDB:
+  - string: UTC timestamp with [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601)
+  - number: [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time) with millisecond precision
+
+The sort order of both is identical due to the sort properties of ISO date strings.
+However, you can't mix the two types (numbers and strings) in a single attribute.
+
+You can use [persistent indexes](../../index-and-search/indexing/working-with-indexes/persistent-indexes.md) with both date types.
+When choosing string representations, you can work with string comparisons (less than,
+greater than etc.) to express time ranges in your queries while still utilizing
+persistent indexes:
+
+```js
+---
+name: working_with_date_time
+description: ''
+---
+db._create("exampleTime");
+var timestamps = [
+  "2014-05-07T14:19:09.522",
+  "2014-05-07T21:19:09.522",
+  "2014-05-08T04:19:09.522",
+  "2014-05-08T11:19:09.522",
+  "2014-05-08T18:19:09.522"
+];
+for (i = 0; i < 5; i++) {
+  db.exampleTime.save({value:i, ts: timestamps[i]});
+}
+db._query(`
+  FOR d IN exampleTime
+    FILTER d.ts > '2014-05-07T14:19:09.522' AND d.ts < '2014-05-08T18:19:09.522'
+    RETURN d
+`).toArray()
+~addIgnoreCollection("example")
+~db._drop("exampleTime")
+```
+
+The first and the last timestamp in the array are excluded from the result by the `FILTER`.
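+
+A minimal sketch of a matching index for the `exampleTime` collection from
+above, so that the range `FILTER` on the `ts` attribute can be satisfied by a
+persistent index (assuming arangosh):
+
+```js
+db.exampleTime.ensureIndex({ type: "persistent", fields: ["ts"] });
+```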
+
+## Limitations
+
+Note that dates before the year 1583 aren't allowed by the
+[ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) standard by default, because
+they lie before the official introduction of the Gregorian calendar and may thus
+be incorrect or invalid. All AQL date functions apply the same rules to every
+date according to the Gregorian calendar system, even if inappropriate. That
+does not constitute a problem, unless you deal with dates prior to 1583 and
+especially years before Christ. The standard allows negative years, but requires
+special treatment of positive years too, if negative years are used (e.g.
+`+002015-05-15` and `-000753-01-01`). This is rarely used however, and AQL does
+not use the 7-character version for years between 0 and 9999 in ISO strings.
+Keep in mind that such dates can't be properly compared to dates outside that
+range. Sorting negative dates does not result in a meaningful order: years
+further in the past come last, while months, days and the time components are
+in otherwise correct order.
+
+Leap seconds are ignored, just as they are in JavaScript as per
+[ECMAScript Language Specifications](http://www.ecma-international.org/ecma-262/5.1/#sec-15.9.1.1).
diff --git a/site/content/arangodb/oem/aql/functions/document-object.md b/site/content/arangodb/oem/aql/functions/document-object.md
new file mode 100644
index 0000000000..4394f6fc4c
--- /dev/null
+++ b/site/content/arangodb/oem/aql/functions/document-object.md
@@ -0,0 +1,1023 @@
+---
+title: Document and object functions in AQL
+menuTitle: Document / Object
+weight: 25
+description: >-
+  AQL provides functions to operate on objects / document values
+---
+You can use the functions listed below with the
+[object data type](../../concepts/data-structure/documents/_index.md#data-types).
+Also see [object access](../fundamentals/data-types.md#objects--documents) for
+additional language constructs for objects.
+
+## ATTRIBUTES()
+
+`ATTRIBUTES(document, removeSystemAttrs, sort) → strArray`
+
+Return the top-level attribute keys of the `document` as an array.
+Optionally omit system attributes and sort the array.
+
+To return the attribute values instead, see the [`VALUES()` function](#values).
+
+- **document** (object): an arbitrary document / object
+- **removeSystemAttrs** (bool, *optional*): whether all system attributes
+  (starting with an underscore, such as `_key` and `_id`) shall be omitted in
+  the result. The default is `false`.
+- **sort** (bool, *optional*): whether to sort the resulting array alphabetically.
+  The default is `false`, which returns the attribute names in any order.
+- returns **strArray** (array): the attribute keys of the input `document` as an
+  array of strings
+
+**Examples**
+
+Return the attribute keys of an object:
+
+```aql
+---
+name: aqlAttributes
+description: ''
+---
+RETURN ATTRIBUTES( { "foo": "bar", "_key": "123", "_custom": "yes" } )
+```
+
+Return the attribute keys of an object but omit system attributes:
+
+```aql
+---
+name: aqlAttributesRemoveInternal
+description: ''
+---
+RETURN ATTRIBUTES( { "foo": "bar", "_key": "123", "_custom": "yes" }, true )
+```
+
+Return the attribute keys of an object in alphabetic order:
+
+```aql
+---
+name: aqlAttributesSort
+description: ''
+---
+RETURN ATTRIBUTES( { "foo": "bar", "_key": "123", "_custom": "yes" }, false, true )
+```
+
+Complex example to count how often every top-level attribute key occurs in the
+documents of a collection (expensive on large collections):
+
+```aql
+LET attributesPerDocument = (
+  FOR doc IN collection RETURN ATTRIBUTES(doc, true)
+)
+FOR attributeArray IN attributesPerDocument
+  FOR attribute IN attributeArray
+    COLLECT attr = attribute WITH COUNT INTO count
+    SORT count DESC
+    RETURN {attr, count}
+```
+
+## COUNT()
+
+This is an alias for [`LENGTH()`](#length).
+
+## HAS()
+
+`HAS(document, attributeName) → isPresent`
+
+Test whether an attribute is present in the provided document.
+
+- **document** (object): an arbitrary document / object
+- **attributeName** (string): the attribute key to test for
+- returns **isPresent** (bool): `true` if `document` has an attribute named
+  `attributeName`, and `false` otherwise. Also returns `true` if the attribute
+  has a falsy value (`null`, `0`, `false`, empty string `""`)
+
+The function checks if the specified attribute exists, regardless of its value.
+Other ways of testing for the existence of an attribute may behave differently
+if the attribute has a falsy value or is not present (implicitly `null` on
+object access):
+
+```aql
+!!{ name: "" }.name // false
+HAS( { name: "" }, "name") // true
+
+{ name: null }.name == null // true
+{ }.name == null // true
+HAS( { name: null }, "name" ) // true
+HAS( { }, "name" ) // false
+```
+
+Note that `HAS()` cannot utilize indexes. If it is not necessary to distinguish
+between explicit and implicit *null* values in your query, you may use an equality
+comparison to test for *null* and create a non-sparse index on the attribute you
+want to test against:
+
+```aql
+FILTER !HAS(doc, "name") // cannot use indexes
+FILTER IS_NULL(doc.name) // cannot use indexes
+FILTER doc.name == null // can utilize non-sparse indexes
+```
+
+**Examples**
+
+Check whether the example object has a `name` attribute key:
+
+```aql
+---
+name: aqlHas_1
+description: ''
+---
+RETURN HAS( { name: "Jane" }, "name" )
+```
+
+Check whether the example object has an `age` attribute key:
+
+```aql
+---
+name: aqlHas_2
+description: ''
+---
+RETURN HAS( { name: "Jane" }, "age" )
+```
+
+Falsy attribute values like `null` still count as the attribute being present:
+
+```aql
+---
+name: aqlHas_3
+description: ''
+---
+RETURN HAS( { name: null }, "name" )
+```
+
+## IS_SAME_COLLECTION()
+
+`IS_SAME_COLLECTION(collectionName, documentIdentifier) → isSame`
+
+Test whether the `documentIdentifier` has `collectionName` as collection.
+
+The function does not validate whether the collection actually contains the
+specified document. It only compares the name of the specified collection
+with the collection name part of the specified document.
+
+- **collectionName** (string): the name of a collection as string
+- **documentIdentifier** (string\|object): a document identifier string
+  (e.g. `_users/1234`) or an object with an `_id` attribute (e.g. a document
+  from a collection).
+- returns **isSame** (bool): `true` if the collection of `documentIdentifier` is the
+  same as `collectionName`, or `false` if it is not. If `documentIdentifier` is an
+  object without an `_id` attribute or anything other than a string or object,
+  then `null` is returned and a warning is raised.
+
+**Examples**
+
+```aql
+---
+name: aqlIsSameCollection
+description: ''
+---
+RETURN [
+  IS_SAME_COLLECTION( "_users", "_users/my-user" ),
+  IS_SAME_COLLECTION( "_users", { _id: "_users/my-user" } ),
+  IS_SAME_COLLECTION( "_users", "foobar/baz"),
+  IS_SAME_COLLECTION( "_users", { _id: "something/else" } )
+]
+```
+
+## KEEP()
+
+`KEEP(document, attributeName1, attributeName2, ... attributeNameN) → doc`
+
+Keep only the attributes `attributeName1` to `attributeNameN` of `document`.
+All other attributes will be removed from the result.
+
+To do the opposite, see [`UNSET()`](#unset).
+
+- **document** (object): a document / object
+- **attributeNames** (string, *repeatable*): an arbitrary number of attribute
+  names as multiple arguments
+- returns **doc** (object): a document with only the specified attributes at
+  the top-level
+
+**Examples**
+
+Keep the top-level `foo` attribute, preserving its nested object:
+
+```aql
+---
+name: aqlKeep_1
+description: ''
+---
+LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 }
+RETURN KEEP(doc, "foo")
+```
+
+Keep the top-level `bar` attribute, which the example object does not have,
+resulting in an empty object:
+
+```aql
+---
+name: aqlKeep_2
+description: ''
+---
+LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 }
+RETURN KEEP(doc, "bar")
+```
+
+Keep the top-level `baz` attribute:
+
+```aql
+---
+name: aqlKeep_3
+description: ''
+---
+LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 }
+RETURN KEEP(doc, "baz")
+```
+
+Keep multiple top-level attributes (`foo` and `baz`):
+
+```aql
+---
+name: aqlKeep_4
+description: ''
+---
+LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 }
+RETURN KEEP(doc, "foo", "baz")
+```
+
+---
+
+`KEEP(document, attributeNameArray) → doc`
+
+- **document** (object): a document / object
+- **attributeNameArray** (array): an array of attribute names as strings
+- returns **doc** (object): a document with only the specified attributes at
+  the top-level
+
+**Examples**
+
+Keep multiple top-level attributes (`foo` and `baz`), by passing an array of the
+attribute keys instead of individual arguments:
+
+```aql
+---
+name: aqlKeep_5
+description: ''
+---
+LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 }
+RETURN KEEP(doc, ["foo", "baz"])
+```
+
+## KEEP_RECURSIVE()
+
+`KEEP_RECURSIVE(document, attributeName1, attributeName2, ... attributeNameN) → doc`
+
+Recursively preserve the attributes `attributeName1` to `attributeNameN` from
+`document` and its sub-documents. All other attributes will be removed.
+
+To do the opposite, use [`UNSET_RECURSIVE()`](#unset_recursive).
+ +- **document** (object): a document / object +- **attributeNames** (string, *repeatable*): an arbitrary number of attribute + names as multiple arguments (at least 1) +- returns **doc** (object): `document` with only the specified attributes at + all levels (top-level as well as nested objects) + +**Examples** + +Recursively preserve `foo` attributes, but not nested attributes that have +parents with other names: + +```aql +--- +name: aqlKeepRecursive_1 +description: '' +--- +LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 } +RETURN KEEP_RECURSIVE(doc, "foo") +``` + +Recursively preserve `bar` attributes, but there is none at the top-level, leading +to an empty object: + +```aql +--- +name: aqlKeepRecursive_2 +description: '' +--- +LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 } +RETURN KEEP_RECURSIVE(doc, "bar") +``` + +Recursively preserve `baz` attributes, but not nested attributes that have +parents with other names: + +```aql +--- +name: aqlKeepRecursive_3 +description: '' +--- +LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 } +RETURN KEEP_RECURSIVE(doc, "baz") +``` + +Recursively preserve multiple attributes (`foo` and `bar`): + +```aql +--- +name: aqlKeepRecursive_4 +description: '' +--- +LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 } +RETURN KEEP_RECURSIVE(doc, "foo", "bar") +``` + +Recursively preserve multiple attributes (`foo` and `baz`), but not nested +attributes that have parents with other names: + +```aql +--- +name: aqlKeepRecursive_5 +description: '' +--- +LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 } +RETURN KEEP_RECURSIVE(doc, "foo", "baz") +``` + +Recursively preserve multiple attributes (`foo`, `bar`, and `baz`), preserving all +attributes of the example object: + +```aql +--- +name: aqlKeepRecursive_6 +description: '' +--- +LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 } +RETURN KEEP_RECURSIVE(doc, "foo", "bar", "baz") +``` + +--- + +`KEEP_RECURSIVE(document, attributeNameArray) → doc` + +- **document** (object): a document / object +- **attributeNameArray** (array): an array of attribute names as strings +- returns **doc** (object): *document* with only the specified attributes at + all levels (top-level as well as nested objects) + +**Examples** + +Recursively preserve multiple attributes (`foo` and `baz`), by passing an array of the +attribute keys instead of individual arguments: + +```aql +--- +name: aqlKeepRecursive_7 +description: '' +--- +LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 } +RETURN KEEP_RECURSIVE(doc, ["foo", "baz"]) +``` + +## KEYS() + +This is an alias for [`ATTRIBUTES()`](#attributes). + +## LENGTH() + +`LENGTH(doc) → attrCount` + +Determine the number of attribute keys of an object / document. + +`LENGTH()` can also determine the [number of elements](array.md#length) in an array, +the [amount of documents](miscellaneous.md#length) in a collection and +the [character length](string.md#length) of a string. + +- **doc** (object): a document / object +- returns **attrCount** (number): the number of attribute keys in `doc`, regardless + of their values + +**Examples** + +```aql +--- +name: aqlLengthObject +description: '' +--- +RETURN LENGTH({ name: "Emma", age: 36, phone: { mobile: "..." } }) +``` + +## MATCHES() + +`MATCHES(document, examples, returnIndex) → match` + +Compare the given `document` against each example document provided. The comparisons +will be started with the first example. 
All attributes of the example will be compared +against the attributes of `document`. If all attributes match, the comparison stops +and the result is returned. If there is a mismatch, the function will continue the +comparison with the next example until there are no more examples left. + +The `examples` can be an array of 1..n example documents or a single document, +with any number of attributes each. + +An attribute value of `null` will match documents with an explicit attribute value +of `null` as well as documents with this attribute missing (implicitly `null`). +Only [`HAS()`](#has) can differentiate between an attribute being absent and having +a stored `null` value. + +An empty object `{}` will match all documents. Be careful not to ask for all +documents accidentally. For example, the [arangojs](../../develop/drivers/javascript.md) driver +skips attributes with a value of `undefined`, turning `{attr: undefined}` into `{}`. + +{{< info >}} +`MATCHES()` cannot utilize indexes. You may use plain `FILTER` conditions instead +to potentially benefit from existing indexes: + +```aql +FOR doc IN coll + FILTER (cond1 AND cond2 AND cond3) OR (cond4 AND cond5) ... +``` +{{< /info >}} + +- **document** (object): document to determine whether it matches any example +- **examples** (object\|array): a single document, or an array of documents to compare + against. Specifying an empty array is not allowed. +- **returnIndex** (bool): by setting this flag to `true`, the index of the example that + matched will be returned (starting at offset 0), or `-1` if there was no match. + The default is `false` and makes the function return a boolean. +- returns **match** (bool\|number): if `document` matches one of the examples, `true` is + returned, otherwise `false`. A number is returned instead if `returnIndex` is enabled. + +**Examples** + +Check whether all attributes of the example are present in the document: + +```aql +--- +name: aqlMatches_1 +description: '' +--- +LET doc = { + name: "jane", + age: 27, + active: true +} +RETURN MATCHES(doc, { age: 27, active: true } ) +``` + +Check whether one of the examples matches the document and return the index of +the matching example: + +```aql +--- +name: aqlMatches_2 +description: '' +--- +RETURN MATCHES( + { "test": 1 }, + [ + { "test": 1, "foo": "bar" }, + { "foo": 1 }, + { "test": 1 } + ], +true) +``` + +## MERGE() + +`MERGE(document1, document2, ... documentN) → mergedDocument` + +Merge the documents `document1` to `documentN` into a single document. +If document attribute keys are ambiguous, the merged result will contain the values +of the documents contained later in the argument list. + +Note that merging will only be done for top-level attributes. If you wish to +merge sub-attributes, use [`MERGE_RECURSIVE()`](#merge_recursive) instead. 
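+
+As a quick sketch of this shallow behavior (using a made-up nested document),
+note how the nested `livesIn` object of the first argument is replaced as a
+whole instead of being combined:
+
+```aql
+// Top-level key "user" exists in both arguments, so the latter value wins
+RETURN MERGE(
+  { user: { name: "Jane", livesIn: { city: "LA" } } },
+  { user: { age: 42 } }
+)
+// [ { "user": { "age": 42 } } ]
+```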
+
+- **documents** (object, *repeatable*): an arbitrary number of documents as
+  multiple arguments (at least 2)
+- returns **mergedDocument** (object): a combined document
+
+**Examples**
+
+Two documents with distinct attribute names can easily be merged into one:
+
+```aql
+---
+name: aqlMerge_1
+description: ''
+---
+RETURN MERGE(
+  { "user1": { "name": "Jane" } },
+  { "user2": { "name": "Tom" } }
+)
+```
+
+When merging documents with identical attribute names, the attribute values of the
+latter documents will be used in the end result:
+
+```aql
+---
+name: aqlMerge_2
+description: ''
+---
+RETURN MERGE(
+  { "users": { "name": "Jane" } },
+  { "users": { "name": "Tom" } }
+)
+```
+
+---
+
+`MERGE(docArray) → mergedDocument`
+
+`MERGE()` also accepts a single array parameter. This variant allows combining the
+attributes of multiple objects in an array into a single object.
+
+- **docArray** (array): an array of documents, as sole argument
+- returns **mergedDocument** (object): a combined document
+
+**Examples**
+
+```aql
+---
+name: aqlMerge_3
+description: ''
+---
+RETURN MERGE(
+  [
+    { foo: "bar" },
+    { quux: "quetzalcoatl", ruled: true },
+    { bar: "baz", foo: "done" }
+  ]
+)
+```
+
+{{< tip >}}
+Consider using [`ZIP()`](#zip) instead of `MERGE()` if you want to merge a set
+of disjoint keys and their associated values into a single object.
+
+This could be a pattern like the following where objects with dynamic attribute
+keys are created and then merged together (here to return a map of distinct
+attribute values and how often they occur):
+
+```aql
+RETURN MERGE(
+  FOR doc IN coll
+    COLLECT value = doc.attr WITH COUNT INTO count
+    RETURN { [value]: count }
+)
+```
+
+This creates many temporary objects and can be slow if there are thousands of
+objects to merge. The following pattern using `ZIP()` is more efficient:
+
+```aql
+LET counts = (
+  FOR doc IN coll
+    COLLECT value = doc.attr WITH COUNT INTO count
+    RETURN [value, count]
+)
+RETURN ZIP(counts[*][0], counts[*][1])
+```
+{{< /tip >}}
+
+## MERGE_RECURSIVE()
+
+`MERGE_RECURSIVE(document1, document2, ... documentN) → mergedDocument`
+
+Recursively merge the documents `document1` to `documentN` into a single document.
+If document attribute keys overlap, the merged result contains the values
+of the documents contained later in the argument list.
+
+- **documents** (object, *repeatable*): an arbitrary number of documents as
+  multiple arguments (at least 1)
+- returns **mergedDocument** (object): a combined document
+
+**Examples**
+
+Merge two documents with the same top-level attribute, combining the `name`,
+`age`, and `livesIn` sub-attributes:
+
+```aql
+---
+name: aqlMergeRecursive_1
+description: ''
+---
+RETURN MERGE_RECURSIVE(
+  { "user-1": { "name": "Jane", "livesIn": { "city": "LA" } } },
+  { "user-1": { "age": 42, "livesIn": { "state": "CA" } } }
+)
+```
+
+---
+
+`MERGE_RECURSIVE(documents) → mergedDocument`
+
+Recursively merge the list of documents into a single document.
+If document attribute keys overlap, the merged result contains the values
+of the documents specified later in the list.
+
+- **documents** (array): an array with an arbitrary number of objects
+- returns **mergedDocument** (object): a combined document
+
+**Examples**
+
+Merge a list of two documents with the same top-level attribute, combining the
+`name` and `age` sub-attributes but overwriting the `city` value in the
+`livesIn` sub-attribute:
+
+```aql
+---
+name: aqlMergeRecursive_2
+description: ''
+---
+RETURN MERGE_RECURSIVE(
+  [
+    { "user-1": { "name": "Jane", "livesIn": { "city": "LA" } } },
+    { "user-1": { "age": 42, "livesIn": { "city": "NY" } } }
+  ]
+)
+```
+
+## PARSE_IDENTIFIER()
+
+`PARSE_IDENTIFIER(documentIdentifier) → parts`
+
+Parse a [document ID](../../concepts/data-structure/documents/_index.md#document-identifiers)
+and separately return the collection name and the document key.
+
+- **documentIdentifier** (string\|object): a document identifier string (e.g. `_users/1234`)
+  or a regular document from a collection. Passing either a non-string or a non-document
+  or a document without an `_id` attribute results in an error.
+- returns **parts** (object): an object with the attributes `collection` and `key`
+
+**Examples**
+
+Parse a document identifier string and extract both the collection name and the
+document key:
+
+```aql
+---
+name: aqlParseIdentifier_1
+description: ''
+---
+RETURN PARSE_IDENTIFIER("_users/my-user")
+```
+
+Parse the `_id` attribute of a document to extract both the collection name and
+the document key:
+
+```aql
+---
+name: aqlParseIdentifier_2
+description: ''
+---
+RETURN PARSE_IDENTIFIER( { "_id": "mycollection/mykey", "value": "some value" } )
+```
+
+## TRANSLATE()
+
+`TRANSLATE(value, lookupDocument, defaultValue) → mappedValue`
+
+Look up the specified `value` in the `lookupDocument`. If `value` is a key in
+`lookupDocument`, then `value` will be replaced with the lookup value found.
+If `value` is not present in `lookupDocument`, then `defaultValue` will be returned
+if specified. If no `defaultValue` is specified, `value` will be returned unchanged.
+
+- **value** (string): the value to encode according to the mapping
+- **lookupDocument** (object): a key/value mapping as document
+- **defaultValue** (any, *optional*): a fallback value in case `value` is not found
+- returns **mappedValue** (any): the encoded value, or the unaltered `value` or `defaultValue`
+  (if supplied) in case it could not be mapped
+
+**Examples**
+
+Translate a country code to a country name:
+
+```aql
+---
+name: aqlTranslate_1
+description: ''
+---
+RETURN TRANSLATE("FR", { US: "United States", UK: "United Kingdom", FR: "France" } )
+```
+
+The unaltered input value is returned if no match is found in the mapping:
+
+```aql
+---
+name: aqlTranslate_2
+description: ''
+---
+RETURN TRANSLATE(42, { foo: "bar", bar: "baz" } )
+```
+
+If you specify a fallback value and no match is found in the mapping, then the
+fallback value is returned instead of the input value:
+
+```aql
+---
+name: aqlTranslate_3
+description: ''
+---
+RETURN TRANSLATE(42, { foo: "bar", bar: "baz" }, "not found!")
+```
+
+Note that any non-string input value is implicitly cast to a string before the
+lookup:
+
+```aql
+---
+name: aqlTranslate_4
+description: ''
+---
+RETURN TRANSLATE(42, { "42": true } )
+```
+
+## UNSET()
+
+`UNSET(document, attributeName1, attributeName2, ... attributeNameN) → doc`
+
+Remove the attributes `attributeName1` to `attributeNameN` from `document`.
+All other attributes will be preserved.
+
+To do the opposite, see [`KEEP()`](#keep).
+
+- **document** (object): a document / object
+- **attributeNames** (string, *repeatable*): an arbitrary number of attribute
+  names as multiple arguments (at least 1)
+- returns **doc** (object): `document` without the specified attributes at the
+  top-level
+
+**Examples**
+
+Remove the top-level `foo` attribute, including its nested objects:
+
+```aql
+---
+name: aqlUnset_1
+description: ''
+---
+LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 }
+RETURN UNSET(doc, "foo")
+```
+
+Remove the top-level `bar` attribute, which the example object does not have,
+resulting in an unchanged object:
+
+```aql
+---
+name: aqlUnset_2
+description: ''
+---
+LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 }
+RETURN UNSET(doc, "bar")
+```
+
+Remove the top-level `baz` attribute:
+
+```aql
+---
+name: aqlUnset_3
+description: ''
+---
+LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 }
+RETURN UNSET(doc, "baz")
+```
+
+Remove multiple top-level attributes (`foo` and `baz`), resulting in an empty
+object in this example:
+
+```aql
+---
+name: aqlUnset_4
+description: ''
+---
+LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 }
+RETURN UNSET(doc, "foo", "baz")
+```
+
+---
+
+`UNSET(document, attributeNameArray) → doc`
+
+- **document** (object): a document / object
+- **attributeNameArray** (array): an array of attribute names as strings
+- returns **doc** (object): *document* without the specified attributes at the
+  top-level
+
+**Examples**
+
+Remove multiple top-level attributes (`foo` and `baz`), by passing an array of the
+attribute keys instead of individual arguments:
+
+```aql
+---
+name: aqlUnset_5
+description: ''
+---
+LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 }
+RETURN UNSET(doc, ["foo", "baz"])
+```
+
+## UNSET_RECURSIVE()
+
+`UNSET_RECURSIVE(document, attributeName1, attributeName2, ... attributeNameN) → doc`
+
+Recursively remove the attributes `attributeName1` to `attributeNameN` from
+`document` and its sub-documents. All other attributes will be preserved.
+
+To do the opposite, use [`KEEP_RECURSIVE()`](#keep_recursive).
+ +- **document** (object): a document / object +- **attributeNames** (string, *repeatable*): an arbitrary number of attribute + names as multiple arguments (at least 1) +- returns **doc** (object): `document` without the specified attributes at + all levels (top-level as well as nested objects) + +**Examples** + +Recursively remove `foo` attributes: + +```aql +--- +name: aqlUnsetRecursive_1 +description: '' +--- +LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 } +RETURN UNSET_RECURSIVE(doc, "foo") +``` + +Recursively remove `bar` attributes: + +```aql +--- +name: aqlUnsetRecursive_2 +description: '' +--- +LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 } +RETURN UNSET_RECURSIVE(doc, "bar") +``` + +Recursively remove `baz` attributes: + +```aql +--- +name: aqlUnsetRecursive_3 +description: '' +--- +LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 } +RETURN UNSET_RECURSIVE(doc, "baz") +``` + +Recursively remove multiple attributes (`foo` and `bar`): + +```aql +--- +name: aqlUnsetRecursive_4 +description: '' +--- +LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 } +RETURN UNSET_RECURSIVE(doc, "foo", "bar") +``` + +Recursively remove multiple attributes (`foo` and `baz`), removing all +attributes of the example object: + +```aql +--- +name: aqlUnsetRecursive_5 +description: '' +--- +LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 } +RETURN UNSET_RECURSIVE(doc, "foo", "baz") +``` + +Recursively remove multiple attributes (`foo`, `bar`, and `baz`), removing all +attributes of the example object: + +```aql +--- +name: aqlUnsetRecursive_6 +description: '' +--- +LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 } +RETURN UNSET_RECURSIVE(doc, "foo", "bar", "baz") +``` + +--- + +`UNSET_RECURSIVE(document, attributeNameArray) → doc` + +- **document** (object): a document / object +- **attributeNameArray** (array): an array of attribute names as strings +- returns **doc** (object): *document* without the specified attributes at + all levels (top-level as well as nested objects) + +**Examples** + +Recursively remove `baz` attributes, by passing an array with the attribute key: + +```aql +--- +name: aqlUnsetRecursive_7 +description: '' +--- +LET doc = { foo: { bar: { foo: 1, baz: 2 }, baz: 3 }, baz: 4 } +RETURN UNSET_RECURSIVE(doc, ["baz"]) +``` + +## VALUE() + +`VALUE(document, path) → value` + +Return the specified attribute value of the `document`. + +- **document** (object): a document / object +- **path** (array): an array of strings and numbers that describes the + attribute path. You can select object keys with strings and array elements + with numbers. +- returns **value** (any): the selected value of `document` + +**Examples** + +Dynamically get the inner string, like `obj.foo.bar` would: + +```aql +--- +name: aqlValue_1 +description: '' +--- +LET obj = { foo: { bar: "baz" } } +RETURN VALUE(obj, ["foo", "bar"]) +``` + +Dynamically get the inner object of the second array element of a top-level +attribute, like `obj.foo[1].bar` would: + +```aql +--- +name: aqlValue_2 +description: '' +--- +LET obj = { foo: [ { bar: "baz" }, { bar: { inner: true } } ] } +RETURN VALUE(obj, ["foo", 1, "bar"]) +``` + +## VALUES() + +`VALUES(document, removeSystemAttrs) → anyArray` + +Return the attribute values of the `document` as an array. Optionally omit +system attributes. + +To return the attribute keys instead, see the [`ATTRIBUTES()` function](#attributes). 
+ +- **document** (object): a document / object +- **removeSystemAttrs** (bool, *optional*): if set to `true`, then all + system attributes (starting with an underscore, such as `_id`, `_key` etc.) + are removed from the result +- returns **anyArray** (array): the values of `document` returned in any order + +**Examples** + +Get the attribute values of an object: + +```aql +--- +name: aqlValues_1 +description: '' +--- +RETURN VALUES( { "_id": "users/jane", "name": "Jane", "age": 35 } ) +``` + +Get the attribute values of an object, omitting system attributes: + +```aql +--- +name: aqlValues_2 +description: '' +--- +RETURN VALUES( { "_id": "users/jane", "name": "Jane", "age": 35 }, true ) +``` + +## ZIP() + +`ZIP(keys, values) → doc` + +Return a document object assembled from the separate parameters `keys` and `values`. + +`keys` and `values` must be arrays and have the same length. + +- **keys** (array): an array of strings, to be used as attribute names in the result +- **values** (array): an array with elements of arbitrary types, to be used as + attribute values +- returns **doc** (object): a document with the keys and values assembled + +**Examples** + +```aql +--- +name: aqlZip +description: '' +--- +RETURN ZIP( [ "name", "active", "hobbies" ], [ "some user", true, [ "swimming", "riding" ] ] ) +``` diff --git a/site/content/arangodb/oem/aql/functions/fulltext.md b/site/content/arangodb/oem/aql/functions/fulltext.md new file mode 100644 index 0000000000..54a0cd35bc --- /dev/null +++ b/site/content/arangodb/oem/aql/functions/fulltext.md @@ -0,0 +1,94 @@ +--- +title: Fulltext functions in AQL +menuTitle: Fulltext +weight: 30 +description: >- + AQL offers functions to filter data using fulltext indexes +--- +See [fulltext indexes](../../index-and-search/indexing/working-with-indexes/fulltext-indexes.md) +for details. + +{{< warning >}} +The fulltext index type is deprecated from version 3.10 onwards. +It is recommended to use [Inverted indexes](../../index-and-search/indexing/working-with-indexes/inverted-indexes.md) or +[ArangoSearch](../../index-and-search/arangosearch/_index.md) for advanced full-text search capabilities. +{{< /warning >}} + +## FULLTEXT() + +`FULLTEXT(coll, attribute, query, limit) → docArray` + +Return all documents from collection *coll*, for which the attribute *attribute* +matches the fulltext search phrase *query*, optionally capped to *limit* results. + +**Note**: the `FULLTEXT()` function requires the collection *coll* to have a +fulltext index on *attribute*. If no fulltext index is available, this function +will fail with an error at runtime. It doesn't fail when explaining the query however. + +- **coll** (collection): a collection +- **attribute** (string): the attribute name of the attribute to search in +- **query** (string): a fulltext search expression as described below +- **limit** (number, *optional*): if set to a non-zero value, it will cap the result + to at most this number of documents +- returns **docArray** (array): an array of documents + +`FULLTEXT()` is not meant to be used as an argument to `FILTER`, +but rather to be used as the expression of a `FOR` statement: + +```aql +FOR oneMail IN FULLTEXT(emails, "body", "banana,-apple") + RETURN oneMail._id +``` + +*query* is a comma-separated list of sought words (or prefixes of sought words). To +distinguish between prefix searches and complete-match searches, each word can optionally be +prefixed with either the `prefix:` or `complete:` qualifier. 
Different qualifiers can
+be mixed in the same query. Not specifying a qualifier for a search word will implicitly
+execute a complete-match search for the given word:
+
+- `FULLTEXT(emails, "body", "banana")`\
+  Will look for the word *banana* in the
+  attribute *body* of the collection *emails*.
+
+- `FULLTEXT(emails, "body", "banana,orange")`\
+  Will look for both words
+  *banana* and *orange* in the mentioned attribute. Only those documents will be
+  returned that contain both words.
+
+- `FULLTEXT(emails, "body", "prefix:head")`\
+  Will look for documents that contain any
+  words starting with the prefix *head*.
+
+- `FULLTEXT(emails, "body", "prefix:head,complete:aspirin")`\
+  Will look for all
+  documents that contain a word starting with the prefix *head* and that also contain
+  the (complete) word *aspirin*. Note: specifying `complete:` is optional here.
+
+- `FULLTEXT(emails, "body", "prefix:cent,prefix:subst")`\
+  Will look for all documents
+  that contain a word starting with the prefix *cent* and that also contain a word
+  starting with the prefix *subst*.
+
+If multiple search words (or prefixes) are given, then by default the results will be
+AND-combined, meaning only the logical intersection of all searches will be returned.
+It is also possible to combine partial results with a logical OR, and with a logical NOT:
+
+- `FULLTEXT(emails, "body", "+this,+text,+document")`\
+  Will return all documents that
+  contain all the mentioned words. Note: specifying the `+` symbols is optional here.
+
+- `FULLTEXT(emails, "body", "banana,|apple")`\
+  Will return all documents that contain
+  either (or both) words *banana* or *apple*.
+
+- `FULLTEXT(emails, "body", "banana,-apple")`\
+  Will return all documents that contain
+  the word *banana*, but do not contain the word *apple*.
+
+- `FULLTEXT(emails, "body", "banana,pear,-cranberry")`\
+  Will return all documents that
+  contain both the words *banana* and *pear*, but do not contain the word
+  *cranberry*.
+
+No precedence of logical operators will be honored in a fulltext query. The query will simply
+be evaluated from left to right.
diff --git a/site/content/arangodb/oem/aql/functions/geo.md b/site/content/arangodb/oem/aql/functions/geo.md
new file mode 100644
index 0000000000..cf5b3f8a2d
--- /dev/null
+++ b/site/content/arangodb/oem/aql/functions/geo.md
@@ -0,0 +1,964 @@
+---
+title: Geo-spatial functions in AQL
+menuTitle: Geo
+weight: 35
+description: >-
+  AQL supports functions for geo-spatial queries and a subset of calls can be
+  accelerated by geo-spatial indexes
+---
+## Geo-spatial data representations
+
+You can model geo-spatial information in different ways using the data types
+available in ArangoDB. The recommended way is to use objects with **GeoJSON**
+geometry but you can also use **longitude and latitude coordinate pairs**
+for points. Both models are supported by
+[Geo-Spatial Indexes](../../index-and-search/indexing/working-with-indexes/geo-spatial-indexes.md).
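+
+As a rough sketch of how the two models are typically queried (the collection
+and attribute names are made up; `DISTANCE()`, `GEO_DISTANCE()`, and
+`GEO_POINT()` are described further below):
+
+```aql
+// Coordinate pairs stored in separate attributes
+FOR doc IN places
+  SORT DISTANCE(doc.location.lat, doc.location.lng, 40.764, -73.983)
+  LIMIT 5
+  RETURN doc
+
+// A GeoJSON object compared against a constructed GeoJSON Point
+FOR doc IN places
+  SORT GEO_DISTANCE(doc.geometry, GEO_POINT(-73.983, 40.764))
+  LIMIT 5
+  RETURN doc
+```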
+ +### Coordinate pairs + +Longitude and latitude coordinates are numeric values and can be stored in the +following ways: + +- Coordinates using an array with two numbers in `[longitude, latitude]` order, + for example, in a user-chosen attribute called `location`: + + ```json + { + "location": [ -73.983, 40.764 ] + } + ``` + +- Coordinates using an array with two numbers in `[latitude, longitude]` order, + for example, in a user-chosen attribute called `location`: + + ```json + { + "location": [ 40.764, -73.983 ] + } + ``` + +- Coordinates using two separate numeric attributes, for example, in two + user-chosen attributes called `lat` and `lng` as sub-attributes of a `location` + attribute: + + ```json + { + "location": { + "lat": 40.764, + "lng": -73.983 + } + } + ``` + +### GeoJSON + +GeoJSON is a geospatial data format based on JSON. It defines several different +types of JSON objects and the way in which they can be combined to represent +data about geographic shapes on the Earth surface. + +Example of a document with a GeoJSON Point stored in a user-chosen attribute +called `location` (with coordinates in `[longitude, latitude]` order): + +```json +{ + "location": { + "type": "Point", + "coordinates": [ -73.983, 40.764 ] + } +} +``` + +GeoJSON uses a geographic coordinate reference system, +World Geodetic System 1984 (WGS 84), and units of decimal degrees. + +Internally ArangoDB maps all coordinate pairs onto a unit sphere. Distances are +projected onto a sphere with the Earth's *Volumetric mean radius* of *6371 +km*. ArangoDB implements a useful subset of the GeoJSON format +[(RFC 7946)](https://tools.ietf.org/html/rfc7946). +Feature Objects and the GeometryCollection type are not supported. +Supported geometry object types are: + +- Point +- MultiPoint +- LineString +- MultiLineString +- Polygon +- MultiPolygon + +#### Point + +A [GeoJSON Point](https://tools.ietf.org/html/rfc7946#section-3.1.2) is a +[position](https://tools.ietf.org/html/rfc7946#section-3.1.1) comprised of +a longitude and a latitude: + +```json +{ + "type": "Point", + "coordinates": [100.0, 0.0] +} +``` + +#### MultiPoint + +A [GeoJSON MultiPoint](https://tools.ietf.org/html/rfc7946#section-3.1.7) is +an array of positions: + +```json +{ + "type": "MultiPoint", + "coordinates": [ + [100.0, 0.0], + [101.0, 1.0] + ] +} +``` + +#### LineString + +A [GeoJSON LineString](https://tools.ietf.org/html/rfc7946#section-3.1.4) is +an array of two or more positions: + +```json +{ + "type": "LineString", + "coordinates": [ + [100.0, 0.0], + [101.0, 1.0] + ] +} +``` + +#### MultiLineString + +A [GeoJSON MultiLineString](https://tools.ietf.org/html/rfc7946#section-3.1.5) is +an array of LineString coordinate arrays: + +```json +{ + "type": "MultiLineString", + "coordinates": [ + [ + [100.0, 0.0], + [101.0, 1.0] + ], + [ + [102.0, 2.0], + [103.0, 3.0] + ] + ] +} +``` + +#### Polygon + +A [GeoJSON Polygon](https://tools.ietf.org/html/rfc7946#section-3.1.6) consists +of a series of closed `LineString` objects (ring-like). These *Linear Ring* +objects consist of four or more coordinate pairs with the first and last +coordinate pair being equal. Coordinate pairs of a Polygon are an array of +linear ring coordinate arrays. The first element in the array represents +the exterior ring. Any subsequent elements represent interior rings +(holes within the surface). 
+
+The orientation of the first linear ring is crucial: the right-hand-rule
+is applied, so that the area to the left of the path of the linear ring
+(when walking on the surface of the Earth) is considered to be the
+"interior" of the polygon. All other linear rings must be contained
+within this interior. According to the GeoJSON standard, the subsequent
+linear rings must be oriented following the right-hand-rule, too,
+that is, they must run **clockwise** around the hole (viewed from
+above). However, ArangoDB is tolerant here (as suggested by the
+[GeoJSON standard](https://datatracker.ietf.org/doc/html/rfc7946#section-3.1.6)):
+all but the first linear ring are inverted if their orientation is wrong.
+
+In the end, a point is considered to be in the interior of the polygon
+if and only if one has to cross an odd number of linear rings to reach the
+exterior of the polygon prescribed by the first linear ring.
+
+A number of additional rules apply (and are enforced by the GeoJSON
+parser):
+
+- A polygon must contain at least one linear ring, i.e., it must not be
+  empty.
+- A linear ring may not be empty; it needs at least three _distinct_
+  coordinate pairs, that is, at least 4 coordinate pairs (since the first and
+  last must be the same).
+- No two edges of linear rings in the polygon may intersect; in
+  particular, no linear ring may be self-intersecting.
+- Within the same linear ring, consecutive coordinate pairs may be the same,
+  otherwise all coordinate pairs need to be distinct (except the first and last one).
+- Linear rings of a polygon must not share edges, but they may share coordinate pairs.
+- A linear ring defines two regions on the sphere. ArangoDB always
+  interprets the region that lies to the left of the boundary ring (in
+  the direction of its travel on the surface of the Earth) as the
+  interior of the ring. This is in contrast to earlier versions of
+  ArangoDB before 3.10, which always took the **smaller** of the two
+  regions as the interior. Therefore, from 3.10 on one can now have
+  polygons whose outer ring encloses more than half the Earth's surface.
+- The interior rings must be contained in the interior of the outer ring.
+- Interior rings should follow the above rule for orientation
+  (counterclockwise external rings, clockwise internal rings, interior
+  always to the left of the line).
+
+Here is an example with no holes:
+
+```json
+{
+  "type": "Polygon",
+  "coordinates": [
+    [
+      [100.0, 0.0],
+      [101.0, 0.0],
+      [101.0, 1.0],
+      [100.0, 1.0],
+      [100.0, 0.0]
+    ]
+  ]
+}
+```
+
+Here is an example with a hole:
+
+```json
+{
+  "type": "Polygon",
+  "coordinates": [
+    [
+      [100.0, 0.0],
+      [101.0, 0.0],
+      [101.0, 1.0],
+      [100.0, 1.0],
+      [100.0, 0.0]
+    ],
+    [
+      [100.8, 0.8],
+      [100.8, 0.2],
+      [100.2, 0.2],
+      [100.2, 0.8],
+      [100.8, 0.8]
+    ]
+  ]
+}
+```
+
+#### MultiPolygon
+
+A [GeoJSON MultiPolygon](https://tools.ietf.org/html/rfc7946#section-3.1.6) consists
+of multiple polygons. The "coordinates" member is an array of
+_Polygon_ coordinate arrays. See [above](#polygon) for the rules and
+the meaning of polygons.
+
+If the polygons in a MultiPolygon are disjoint, then a point is in the
+interior of the MultiPolygon if and only if it is
+contained in one of the polygons. If some polygon P2 in a MultiPolygon
+is contained in another polygon P1, then P2 is treated like a hole
+in P1 and containment of points is defined with the even-odd-crossings rule
+(see [Polygon](#polygon)).
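+
+As a sketch of the even-odd rule (made-up coordinates; `GEO_MULTIPOLYGON()` and
+`GEO_CONTAINS()` are described further below), the second polygon lies inside
+the first one and therefore acts as a hole:
+
+```aql
+LET mp = GEO_MULTIPOLYGON([
+  [ [[0, 0], [10, 0], [10, 10], [0, 10], [0, 0]] ],
+  [ [[4, 4], [6, 4], [6, 6], [4, 6], [4, 4]] ]
+])
+RETURN [
+  GEO_CONTAINS(mp, [2, 2]), // true: inside the outer polygon only
+  GEO_CONTAINS(mp, [5, 5])  // false: inside the hole formed by the second polygon
+]
+```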
+ +Additionally, the following rules apply and are enforced for +MultiPolygons: + +- No two edges in the linear rings of the polygons of a MultiPolygon + may intersect. +- Polygons in the same MultiPolygon may not share edges, but they may share + coordinate pairs. + +Example with two polygons, the second one with a hole: + +```json +{ + "type": "MultiPolygon", + "coordinates": [ + [ + [ + [102.0, 2.0], + [103.0, 2.0], + [103.0, 3.0], + [102.0, 3.0], + [102.0, 2.0] + ] + ], + [ + [ + [100.0, 0.0], + [101.0, 0.0], + [101.0, 1.0], + [100.0, 1.0], + [100.0, 0.0] + ], + [ + [100.2, 0.2], + [100.2, 0.8], + [100.8, 0.8], + [100.8, 0.2], + [100.2, 0.2] + ] + ] + ] +} +``` + +## GeoJSON interpretation + +Note the following technical detail about GeoJSON: The +[GeoJSON standard, Section 3.1.1 Position](https://datatracker.ietf.org/doc/html/rfc7946#section-3.1.1) +prescribes that lines are **cartesian lines in cylindrical coordinates** +(longitude/latitude). However, this definition is inconvenient in practice, +since such lines are not geodesic on the surface of the Earth. +Furthermore, the best available algorithms for geospatial computations on Earth +typically use geodesic lines as the boundaries of polygons on Earth. + +Therefore, ArangoDB uses the **syntax of the GeoJSON** standard, +but then interprets lines (and boundaries of polygons) as +**geodesic lines (pieces of great circles) on Earth**. This is a +violation of the GeoJSON standard, but serving a practical purpose. + +Note in particular that this can sometimes lead to unexpected results. +Consider the following polygon (remember that GeoJSON has +**longitude before latitude** in coordinate pairs): + +```json +{ "type": "Polygon", "coordinates": [[ + [4, 54], [4, 47], [16, 47], [16, 54], [4, 54] +]] } +``` + +![GeoJSON Polygon Geodesic](../../../../images/geojson-polygon-geodesic.webp) + +It does not contain the point `[10, 47]` since the shortest path (geodesic) +from `[4, 47]` to `[16, 47]` lies North relative to the parallel of latitude at +47 degrees. On the contrary, the polygon does contain the point `[10, 54]` as it +lies South of the parallel of latitude at 54 degrees. + +{{< info >}} +ArangoDB version before 3.10 did an inconsistent special detection of "rectangle" +polygons that later versions from 3.10 onward no longer do, see +[Legacy Polygons](../../index-and-search/indexing/working-with-indexes/geo-spatial-indexes.md#legacy-polygons). +{{< /info >}} + +Furthermore, there is an issue with the interpretation of linear rings +(boundaries of polygons) according to +[GeoJSON standard, Section 3.1.6 Polygon](https://datatracker.ietf.org/doc/html/rfc7946#section-3.1.6). +This section states explicitly: + +> A linear ring MUST follow the right-hand rule with respect to the +> area it bounds, i.e., exterior rings are counter-clockwise, and +> holes are clockwise. + +This rather misleading phrase means that when a linear ring is used as +the boundary of a polygon, the "interior" of the polygon lies **to the left** +of the boundary when one travels on the surface of the Earth and +along the linear ring. For +example, the polygon below travels **counter-clockwise** around the point +`[10, 50]`, and thus the interior of the polygon contains this point and +its surroundings, but not, for example, the North Pole and the South +Pole. 
+ +```json +{ "type": "Polygon", "coordinates": [[ + [4, 54], [4, 47], [16, 47], [16, 54], [4, 54] +]] } +``` + +![GeoJSON Polygon Counter-clockwise](../../../../images/geojson-polygon-ccw.webp) + +On the other hand, the following polygon travels **clockwise** around the point +`[10, 50]`, and thus its "interior" does not contain `[10, 50]`, but does +contain the North Pole and the South Pole: + +```json +{ "type": "Polygon", "coordinates": [[ + [4, 54], [16, 54], [16, 47], [4, 47], [4, 54] +]] } +``` + +![GeoJSON Polygon Clockwise](../../../../images/geojson-polygon-cw.webp) + +Remember that the "interior" is to the left of the given +linear ring, so this second polygon is basically the complement on Earth +of the previous polygon! + +ArangoDB versions before 3.10 did not follow this rule and always took the +"smaller" connected component of the surface as the "interior" of the polygon. +This made it impossible to specify polygons which covered more than half of the +sphere. From version 3.10 onward, ArangoDB recognizes this correctly. +See [Legacy Polygons](../../index-and-search/indexing/working-with-indexes/geo-spatial-indexes.md#legacy-polygons) +for how to deal with this issue. + +## Geo utility functions + +The following helper functions **can** use geo indexes, but do not have to in +all cases. You can use all of these functions in combination with each other, +and if you have configured a geo index it may be utilized, +see [Geo Indexing](../../index-and-search/indexing/working-with-indexes/geo-spatial-indexes.md). + +### DISTANCE() + +`DISTANCE(latitude1, longitude1, latitude2, longitude2) → distance` + +Calculate the distance between two arbitrary points in meters (as birds +would fly). The value is computed using the haversine formula, which is based +on a spherical Earth model. It's fast to compute and is accurate to around 0.3%, +which is sufficient for most use cases such as location-aware services. + +- **latitude1** (number): the latitude of the first point +- **longitude1** (number): the longitude of the first point +- **latitude2** (number): the latitude of the second point +- **longitude2** (number): the longitude of the second point +- returns **distance** (number): the distance between both points in **meters** + +```aql +// Distance from Brandenburg Gate (Berlin) to ArangoDB headquarters (Cologne) +DISTANCE(52.5163, 13.3777, 50.9322, 6.94) // 476918.89688380965 (~477km) + +// Sort a small number of documents based on distance to Central Park (New York) +FOR doc IN coll // e.g. documents returned by a traversal + SORT DISTANCE(doc.latitude, doc.longitude, 40.78, -73.97) + RETURN doc +``` + +### GEO_CONTAINS() + +`GEO_CONTAINS(geoJsonA, geoJsonB) → bool` + +Checks whether the [GeoJSON object](#geojson) `geoJsonA` +fully contains `geoJsonB` (every point in B is also in A). The object `geoJsonA` +has to be of type _Polygon_ or _MultiPolygon_. For other types containment is +not well-defined because of numerical stability problems. + +- **geoJsonA** (object): first GeoJSON object +- **geoJsonB** (object): second GeoJSON object, or a coordinate array in + `[longitude, latitude]` order +- returns **bool** (bool): `true` if every point in B is also contained in A, + otherwise `false` + +{{< info >}} +ArangoDB follows and exposes the same behavior as the underlying +S2 geometry library. As stated in the S2 documentation: + +> Point containment is defined such that if the sphere is subdivided +> into faces (loops), every point is contained by exactly one face. 
+> This implies that linear rings do not necessarily contain their vertices. + +As a consequence, a linear ring or polygon does not necessarily contain its +boundary edges! +{{< /info >}} + +You can optimize queries that contain a `FILTER` expression of the following +form with an S2-based [geospatial index](../../index-and-search/indexing/working-with-indexes/geo-spatial-indexes.md): + +```aql +FOR doc IN coll + FILTER GEO_CONTAINS(geoJson, doc.geo) + ... +``` + +In this example, you would create the index for the collection `coll`, on the +attribute `geo`. You need to set the `geoJson` index option to `true`. +The `geoJson` variable needs to evaluate to a valid GeoJSON object. Also note +the argument order: the stored document attribute `doc.geo` is passed as the +second argument. Passing it as the first argument, like +`FILTER GEO_CONTAINS(doc.geo, geoJson)` to test whether `doc.geo` contains +`geoJson`, cannot utilize the index. + +### GEO_DISTANCE() + +`GEO_DISTANCE(geoJsonA, geoJsonB, ellipsoid) → distance` + +Return the distance between two GeoJSON objects in meters, measured from the +**centroid** of each shape. For a list of supported types see the +[geo index page](#geojson). + +- **geoJsonA** (object): first GeoJSON object, or a coordinate array in + `[longitude, latitude]` order +- **geoJsonB** (object): second GeoJSON object, or a coordinate array in + `[longitude, latitude]` order +- **ellipsoid** (string, *optional*): reference ellipsoid to use. + Supported are `"sphere"` (default) and `"wgs84"`. +- returns **distance** (number): the distance between the centroid points of + the two objects on the reference ellipsoid in **meters** + +```aql +LET polygon = { + type: "Polygon", + coordinates: [[[-11.5, 23.5], [-10.5, 26.1], [-11.2, 27.1], [-11.5, 23.5]]] +} +FOR doc IN collectionName + LET distance = GEO_DISTANCE(doc.geometry, polygon) // calculates the distance + RETURN distance +``` + +You can optimize queries that contain a `FILTER` expression of the following +form with an S2-based [geospatial index](../../index-and-search/indexing/working-with-indexes/geo-spatial-indexes.md): + +```aql +FOR doc IN coll + FILTER GEO_DISTANCE(geoJson, doc.geo) <= limit + ... +``` + +In this example, you would create the index for the collection `coll`, on the +attribute `geo`. You need to set the `geoJson` index option to `true`. +`geoJson` needs to evaluate to a valid GeoJSON object. `limit` must be a +distance in meters; it cannot be an expression. An upper bound with `<`, +a lower bound with `>` or `>=`, or both, are equally supported. + +You can also optimize queries that use a `SORT` condition of the following form +with a geospatial index: + +```aql + SORT GEO_DISTANCE(geoJson, doc.geo) +``` + +The index covers returning matches from closest to furthest away, or vice versa. +You may combine such a `SORT` with a `FILTER` expression that utilizes the +geospatial index, too, via the [`GEO_DISTANCE()`](#geo_distance), +[`GEO_CONTAINS()`](#geo_contains), and [`GEO_INTERSECTS()`](#geo_intersects) +functions. + +### GEO_AREA() + +`GEO_AREA(geoJson, ellipsoid) → area` + +Return the area for a [Polygon](#polygon) or [MultiPolygon](#multipolygon) +on a sphere with the average Earth radius, or an ellipsoid. + +- **geoJson** (object): a GeoJSON object +- **ellipsoid** (string, *optional*): reference ellipsoid to use. + Supported are `"sphere"` (default) and `"wgs84"`. 
+- returns **area** (number): the area of the polygon in **square meters** + +```aql +LET polygon = { + type: "Polygon", + coordinates: [[[-11.5, 23.5], [-10.5, 26.1], [-11.2, 27.1], [-11.5, 23.5]]] +} +RETURN GEO_AREA(polygon, "wgs84") +``` + +### GEO_EQUALS() + +`GEO_EQUALS(geoJsonA, geoJsonB) → bool` + +Checks whether two [GeoJSON objects](#geojson) are equal or not. + +- **geoJsonA** (object): first GeoJSON object. +- **geoJsonB** (object): second GeoJSON object. +- returns **bool** (bool): `true` if they are equal, otherwise `false`. + +```aql +LET polygonA = GEO_POLYGON([ + [-11.5, 23.5], [-10.5, 26.1], [-11.2, 27.1], [-11.5, 23.5] +]) +LET polygonB = GEO_POLYGON([ + [-11.5, 23.5], [-10.5, 26.1], [-11.2, 27.1], [-11.5, 23.5] +]) +RETURN GEO_EQUALS(polygonA, polygonB) // true +``` + +```aql +LET polygonA = GEO_POLYGON([ + [-11.1, 24.0], [-10.5, 26.1], [-11.2, 27.1], [-11.1, 24.0] +]) +LET polygonB = GEO_POLYGON([ + [-11.5, 23.5], [-10.5, 26.1], [-11.2, 27.1], [-11.5, 23.5] +]) +RETURN GEO_EQUALS(polygonA, polygonB) // false +``` + +### GEO_INTERSECTS() + +`GEO_INTERSECTS(geoJsonA, geoJsonB) → bool` + +Checks whether the [GeoJSON object](#geojson) `geoJsonA` +intersects with `geoJsonB` (i.e. at least one point in B is also in A or vice-versa). + +- **geoJsonA** (object): first GeoJSON object +- **geoJsonB** (object): second GeoJSON object, or a coordinate array in + `[longitude, latitude]` order +- returns **bool** (bool): true if B intersects A, false otherwise + +You can optimize queries that contain a `FILTER` expression of the following +form with an S2-based [geospatial index](../../index-and-search/indexing/working-with-indexes/geo-spatial-indexes.md): + +```aql +FOR doc IN coll + FILTER GEO_INTERSECTS(geoJson, doc.geo) + ... +``` + +In this example, you would create the index for the collection `coll`, on the +attribute `geo`. You need to set the `geoJson` index option to `true`. +`geoJson` needs to evaluate to a valid GeoJSON object. Also note +the argument order: the stored document attribute `doc.geo` is passed as the +second argument. Passing it as the first argument, like +`FILTER GEO_INTERSECTS(doc.geo, geoJson)` to test whether `doc.geo` intersects +`geoJson`, cannot utilize the index. + +### GEO_IN_RANGE() + +Introduced in: v3.8.0 + +`GEO_IN_RANGE(geoJsonA, geoJsonB, low, high, includeLow, includeHigh) → bool` + +Checks whether the distance between two [GeoJSON objects](#geojson) +lies within a given interval. The distance is measured from the **centroid** of +each shape. + +- **geoJsonA** (object\|array): first GeoJSON object, or a coordinate array + in `[longitude, latitude]` order +- **geoJsonB** (object\|array): second GeoJSON object, or a coordinate array + in `[longitude, latitude]` order +- **low** (number): minimum value of the desired range +- **high** (number): maximum value of the desired range +- **includeLow** (bool, optional): whether the minimum value shall be included + in the range (left-closed interval) or not (left-open interval). The default + value is `true` +- **includeHigh** (bool): whether the maximum value shall be included in the + range (right-closed interval) or not (right-open interval). The default value + is `true` +- returns **bool** (bool): whether the evaluated distance lies within the range + +### IS_IN_POLYGON() + +Determine whether a point is inside a polygon. 
+
+{{< warning >}}
+The `IS_IN_POLYGON()` AQL function is **deprecated** as of ArangoDB 3.4.0 in
+favor of the new [`GEO_CONTAINS()` AQL function](#geo_contains), which works with
+[GeoJSON](https://tools.ietf.org/html/rfc7946) Polygons and MultiPolygons.
+{{< /warning >}}
+
+`IS_IN_POLYGON(polygon, latitude, longitude) → bool`
+
+- **polygon** (array): an array of arrays with 2 elements each, representing the
+  points of the polygon in the format `[latitude, longitude]`
+- **latitude** (number): the latitude of the point to search
+- **longitude** (number): the longitude of the point to search
+- returns **bool** (bool): `true` if the point (`[latitude, longitude]`) is
+  inside the `polygon` or `false` if it's not. The result is undefined (can be
+  `true` or `false`) if the specified point is exactly on a boundary of the
+  polygon.
+
+```aql
+// checks if the point (latitude 4, longitude 7) is contained inside the polygon
+IS_IN_POLYGON( [ [ 0, 0 ], [ 0, 10 ], [ 10, 10 ], [ 10, 0 ] ], 4, 7 )
+```
+
+---
+
+`IS_IN_POLYGON(polygon, coord, useLonLat) → bool`
+
+The 2nd parameter can alternatively be specified as an array with two values.
+
+By default, each array element in `polygon` is expected to be in the format
+`[latitude, longitude]`. This can be changed by setting the 3rd parameter to `true` to
+interpret the points as `[longitude, latitude]`. `coord` is then also interpreted in
+the same way.
+
+- **polygon** (array): an array of arrays with 2 elements each, representing the
+  points of the polygon
+- **coord** (array): the point to search as a numeric array with two elements
+- **useLonLat** (bool, *optional*): if set to `true`, the coordinates in
+  `polygon` and the coordinate pair `coord` are interpreted as
+  `[longitude, latitude]` (like in GeoJSON). The default is `false` and the
+  format `[latitude, longitude]` is expected.
+- returns **bool** (bool): `true` if the point `coord` is inside the `polygon`
+  or `false` if it's not. The result is undefined (can be `true` or `false`) if
+  the specified point is exactly on a boundary of the polygon.
+
+```aql
+// checks if the point (lat 4, lon 7) is contained inside the polygon
+IS_IN_POLYGON( [ [ 0, 0 ], [ 0, 10 ], [ 10, 10 ], [ 10, 0 ] ], [ 4, 7 ] )
+
+// checks if the point (lat 4, lon 7) is contained inside the polygon
+IS_IN_POLYGON( [ [ 0, 0 ], [ 10, 0 ], [ 10, 10 ], [ 0, 10 ] ], [ 7, 4 ], true )
+```
+
+## GeoJSON Constructors
+
+The following helper functions are available to easily create valid GeoJSON
+output. In all cases you can write equivalent JSON yourself, but these functions
+will help you to make all your AQL queries shorter and easier to read.
+
+### GEO_LINESTRING()
+
+`GEO_LINESTRING(points) → geoJson`
+
+Construct a GeoJSON LineString.
+Needs at least two longitude/latitude pairs.
+
+- **points** (array): an array of `[longitude, latitude]` pairs
+- returns **geoJson** (object): a valid GeoJSON LineString
+
+```aql
+---
+name: aqlGeoLineString_1
+description: ''
+---
+RETURN GEO_LINESTRING([
+  [35, 10], [45, 45]
+])
+```
+
+### GEO_MULTILINESTRING()
+
+`GEO_MULTILINESTRING(points) → geoJson`
+
+Construct a GeoJSON MultiLineString.
+Needs at least two elements, each of which is a valid LineString coordinate array.
+
+- **points** (array): array of LineStrings
+- returns **geoJson** (object): a valid GeoJSON MultiLineString
+
+```aql
+---
+name: aqlGeoMultiLineString_1
+description: ''
+---
+RETURN GEO_MULTILINESTRING([
+  [[100.0, 0.0], [101.0, 1.0]],
+  [[102.0, 2.0], [101.0, 2.3]]
+])
+```
+
+### GEO_MULTIPOINT()
+
+`GEO_MULTIPOINT(points) → geoJson`
+
+Construct a GeoJSON MultiPoint. Needs at least two longitude/latitude pairs.
+
+- **points** (array): an array of `[longitude, latitude]` pairs
+- returns **geoJson** (object): a valid GeoJSON MultiPoint
+
+```aql
+---
+name: aqlGeoMultiPoint_1
+description: ''
+---
+RETURN GEO_MULTIPOINT([
+  [35, 10], [45, 45]
+])
+```
+
+### GEO_POINT()
+
+`GEO_POINT(longitude, latitude) → geoJson`
+
+Construct a valid GeoJSON Point.
+
+- **longitude** (number): the longitude portion of the point
+- **latitude** (number): the latitude portion of the point
+- returns **geoJson** (object): a GeoJSON Point
+
+```aql
+---
+name: aqlGeoPoint_1
+description: ''
+---
+RETURN GEO_POINT(1.0, 2.0)
+```
+
+### GEO_POLYGON()
+
+`GEO_POLYGON(points) → geoJson`
+
+Construct a GeoJSON Polygon. Needs at least one array representing
+a linear ring. Each linear ring consists of an array with at least four
+longitude/latitude pairs. The first linear ring must be the outermost, while
+any subsequent linear ring will be interpreted as holes.
+
+For details about the rules, see [GeoJSON polygons](#polygon).
+
+- **points** (array): an array of (arrays of) `[longitude, latitude]` pairs
+- returns **geoJson** (object\|null): a valid GeoJSON Polygon
+
+A validation step is performed using the S2 geometry library. If the
+validation is not successful, an AQL warning is issued and `null` is
+returned.
+
+Simple Polygon:
+
+```aql
+---
+name: aqlGeoPolygon_1
+description: ''
+---
+RETURN GEO_POLYGON([
+  [0.0, 0.0], [7.5, 2.5], [0.0, 5.0], [0.0, 0.0]
+])
+```
+
+Advanced Polygon with a hole inside:
+
+```aql
+---
+name: aqlGeoPolygon_2
+description: ''
+---
+RETURN GEO_POLYGON([
+  [[35, 10], [45, 45], [15, 40], [10, 20], [35, 10]],
+  [[20, 30], [30, 20], [35, 35], [20, 30]]
+])
+```
+
+### GEO_MULTIPOLYGON()
+
+`GEO_MULTIPOLYGON(polygons) → geoJson`
+
+Construct a GeoJSON MultiPolygon. Needs at least two Polygons inside.
+See [`GEO_POLYGON()`](#geo_polygon) and [GeoJSON MultiPolygon](#multipolygon)
+for the rules of Polygon and MultiPolygon construction.
+
+- **polygons** (array): an array of arrays of arrays of `[longitude, latitude]` pairs
+- returns **geoJson** (object\|null): a valid GeoJSON MultiPolygon
+
+A validation step is performed using the S2 geometry library. If the
+validation is not successful, an AQL warning is issued and `null` is
+returned.
+
+MultiPolygon comprised of a simple Polygon and a Polygon with a hole:
+
+```aql
+---
+name: aqlGeoMultiPolygon_1
+description: ''
+---
+RETURN GEO_MULTIPOLYGON([
+  [
+    [[40, 40], [20, 45], [45, 30], [40, 40]]
+  ],
+  [
+    [[20, 35], [10, 30], [10, 10], [30, 5], [45, 20], [20, 35]],
+    [[30, 20], [20, 15], [20, 25], [30, 20]]
+  ]
+])
+```
+
+## Geo Index Functions
+
+{{< warning >}}
+The AQL functions `NEAR()`, `WITHIN()` and `WITHIN_RECTANGLE()` are
+deprecated starting from version 3.4.0.
+Please use the [Geo utility functions](#geo-utility-functions) instead.
+{{< /warning >}}
+
+AQL offers the following functions to filter data based on
+[geo indexes](../../index-and-search/indexing/working-with-indexes/geo-spatial-indexes.md). These functions require the collection
+to have at least one geo index.
If no geo index can be found, calling this +function will fail with an error at runtime. There is no error when explaining +the query however. + +### NEAR() + +{{< warning >}} +`NEAR()` is a deprecated AQL function from version 3.4.0 on. +Use [`DISTANCE()`](#distance) in a query like this instead: + +```aql +FOR doc IN coll + SORT DISTANCE(doc.latitude, doc.longitude, paramLatitude, paramLongitude) ASC + RETURN doc +``` +Assuming there exists a geo-type index on `latitude` and `longitude`, the +optimizer will recognize it and accelerate the query. +{{< /warning >}} + +`NEAR(coll, latitude, longitude, limit, distanceName) → docArray` + +Return at most *limit* documents from collection *coll* that are near +*latitude* and *longitude*. The result contains at most *limit* documents, +returned sorted by distance, with closest distances being returned first. +Optionally, the distances in meters between the specified coordinate pair +(*latitude* and *longitude*) and the stored coordinate pairs can be returned as +well. To make use of that, the desired attribute name for the distance result +has to be specified in the *distanceName* argument. The result documents will +contain the distance value in an attribute of that name. + +- **coll** (collection): a collection +- **latitude** (number): the latitude of the point to search +- **longitude** (number): the longitude of the point to search +- **limit** (number, *optional*): cap the result to at most this number of + documents. The default is 100. If more documents than *limit* are found, + it is undefined which ones will be returned. +- **distanceName** (string, *optional*): include the distance (in meters) + between the reference point and the stored point in the result, using the + attribute name *distanceName* +- returns **docArray** (array): an array of documents, sorted by distance + (shortest distance first) + +### WITHIN() + +{{< warning >}} +`WITHIN()` is a deprecated AQL function from version 3.4.0 on. +Use [`DISTANCE()`](#distance) in a query like this instead: + +```aql +FOR doc IN coll + LET d = DISTANCE(doc.latitude, doc.longitude, paramLatitude, paramLongitude) + FILTER d <= radius + SORT d ASC + RETURN doc +``` + +Assuming there exists a geo-type index on `latitude` and `longitude`, the +optimizer will recognize it and accelerate the query. +{{< /warning >}} + +`WITHIN(coll, latitude, longitude, radius, distanceName) → docArray` + +Return all documents from collection *coll* that are within a radius of *radius* +around the specified coordinate pair (*latitude* and *longitude*). The documents +returned are sorted by distance to the reference point, with the closest +distances being returned first. Optionally, the distance (in meters) between the +reference point and the stored point can be returned as well. To make +use of that, an attribute name for the distance result has to be specified in +the *distanceName* argument. The result documents will contain the distance +value in an attribute of that name. 
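+
+For instance (a sketch with a made-up collection name; the parameters are
+described below):
+
+```aql
+// Return the 10 closest documents and expose the distance in meters
+FOR doc IN NEAR(restaurants, 52.5163, 13.3777, 10, "distance")
+  RETURN { name: doc.name, distance: doc.distance }
+```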
+ +- **coll** (collection): a collection +- **latitude** (number): the latitude of the point to search +- **longitude** (number): the longitude of the point to search +- **radius** (number): radius in meters +- **distanceName** (string, *optional*): include the distance (in meters) + between the reference point and stored point in the result, using the + attribute name *distanceName* +- returns **docArray** (array): an array of documents, sorted by distance + (shortest distance first) + +### WITHIN_RECTANGLE() + +{{< warning >}} +`WITHIN_RECTANGLE()` is a deprecated AQL function from version 3.4.0 on. Use +[`GEO_CONTAINS()`](#geo_contains) and a GeoJSON polygon instead - but note that +this uses geodesic lines from version 3.10.0 onward +(see [GeoJSON interpretation](#geojson-interpretation)): + +```aql +LET rect = GEO_POLYGON([ [ + [longitude1, latitude1], // bottom-left + [longitude2, latitude1], // bottom-right + [longitude2, latitude2], // top-right + [longitude1, latitude2], // top-left + [longitude1, latitude1], // bottom-left +] ]) +FOR doc IN coll + FILTER GEO_CONTAINS(rect, [doc.longitude, doc.latitude]) + RETURN doc +``` + +Assuming there exists a geo-type index on `latitude` and `longitude`, the +optimizer will recognize it and accelerate the query. +{{< /warning >}} + +`WITHIN_RECTANGLE(coll, latitude1, longitude1, latitude2, longitude2) → docArray` + +Return all documents from collection *coll* that are positioned inside the +bounding rectangle with the points (*latitude1*, *longitude1*) and (*latitude2*, +*longitude2*). There is no guaranteed order in which the documents are returned. + +- **coll** (collection): a collection +- **latitude1** (number): the latitude of the bottom-left point to search +- **longitude1** (number): the longitude of the bottom-left point to search +- **latitude2** (number): the latitude of the top-right point to search +- **longitude2** (number): the longitude of the top-right point to search +- returns **docArray** (array): an array of documents, in random order diff --git a/site/content/arangodb/oem/aql/functions/miscellaneous.md b/site/content/arangodb/oem/aql/functions/miscellaneous.md new file mode 100644 index 0000000000..fba18e52bf --- /dev/null +++ b/site/content/arangodb/oem/aql/functions/miscellaneous.md @@ -0,0 +1,803 @@ +--- +title: Miscellaneous functions in AQL +menuTitle: Miscellaneous +weight: 40 +description: >- + AQL functions that do not fall into other categories are listed here +--- +## Control flow functions + +### FIRST_DOCUMENT() + +`FIRST_DOCUMENT(alternative, ...) → doc` + +Return the first alternative that is a document, and *null* if none of the +alternatives is a document. + +- **alternative** (any, *repeatable*): input of arbitrary type +- returns **doc** (object\|null): document / object or null + +### FIRST_LIST() + +`FIRST_LIST(alternative, ...) → list` + +Return the first alternative that is an array, and *null* if none of the +alternatives is an array. + +- **alternative** (any, *repeatable*): input of arbitrary type +- returns **list** (array\|null): array / list or null + +### MIN_MATCH() + +`MIN_MATCH(expr1, ... exprN, minMatchCount) → fulfilled` + +Match documents where at least **minMatchCount** of the specified +AQL expressions are satisfied. + +There is a corresponding [`MIN_MATCH()` ArangoSearch function](arangosearch.md#min_match) +that can utilize View indexes. 
+
+- **expr** (expression, _repeatable_): any valid AQL expression
+- **minMatchCount** (number): minimum number of expressions that should
+  be satisfied
+- returns **fulfilled** (bool): whether at least **minMatchCount** of the
+  specified expressions are `true`
+
+You can use `MIN_MATCH()` to filter if two out of three conditions evaluate to
+`true`, for instance:
+
+```aql
+LET members = [
+  { name: "Carol", age: 41, active: true },
+  { name: "Doug", age: 56, active: true }
+]
+FOR doc IN members
+  FILTER MIN_MATCH(LENGTH(doc.name) == 5, doc.age >= 50, doc.active, 2)
+  RETURN doc
+```
+
+An equivalent filter expression without `MIN_MATCH()` would be more cumbersome:
+
+```aql
+  FILTER (LENGTH(doc.name) == 5 AND doc.age >= 50)
+    OR (doc.age >= 50 AND doc.active)
+    OR (doc.active AND LENGTH(doc.name) == 5)
+```
+
+### NOT_NULL()
+
+`NOT_NULL(alternative, ...) → value`
+
+Return the first element that is not *null*, and *null* if all alternatives
+are *null* themselves. It is also known as `COALESCE()` in SQL.
+
+- **alternative** (any, *repeatable*): input of arbitrary type
+- returns **value** (any): first non-null parameter, or *null* if all arguments
+  are *null*
+
+### Ternary operator
+
+For conditional evaluation, check out the
+[ternary operator](../operators.md#ternary-operator).
+
+## Database functions
+
+### CHECK_DOCUMENT()
+
+`CHECK_DOCUMENT(document) → checkResult`
+
+Returns *true* if *document* is a valid document object, i.e. a document
+without any duplicate attribute names. Will return *false* for any
+non-objects/non-documents or documents with duplicate attribute names.
+
+{{< warning >}}
+This is an internal function for validating database objects and
+is not supposed to be useful for anything else.
+{{< /warning >}}
+
+The primary use case for this function is to apply it on all
+documents in a given collection as follows:
+
+```aql
+FOR doc IN collection
+  FILTER !CHECK_DOCUMENT(doc)
+  RETURN JSON_STRINGIFY(doc)
+```
+
+This query will return all documents in the given collection with redundant
+attribute names and export them. This output can be used for subsequent
+cleanup operations.
+
+{{< info >}}
+When using object literals in AQL, there will be an automatic
+removal/cleanup of duplicate attribute names, so the function will be effective
+only for **already stored** database documents. Therefore,
+`RETURN CHECK_DOCUMENT( { a: 1, a: 2 } )` is expected to return `true`.
+{{< /info >}}
+
+- **document** (object): an arbitrary document / object
+- returns **checkResult** (bool): *true* for any valid objects/documents without
+  duplicate attribute names, and *false* for any non-objects/non-documents or
+  objects/documents with duplicate attribute names
+
+### COLLECTION_COUNT()
+
+`COLLECTION_COUNT(coll) → count`
+
+Determine the number of documents in a collection. [`LENGTH()`](#length)
+is preferred.
+
+- **coll** (collection): a collection
+- returns **count** (number): the total number of documents in *coll*
+
+### COLLECTIONS()
+
+`COLLECTIONS() → docArray`
+
+Return an array of collections.
+
+- returns **docArray** (array): each collection as a document with attributes
+  *name* and *_id* in an array
+
+### COUNT()
+
+This is an alias for [`LENGTH()`](#length).
+
+### CURRENT_DATABASE()
+
+`CURRENT_DATABASE() → databaseName`
+
+Returns the name of the current database.
+
+The current database is the database name that was specified in the URL path
+of the request, or the `_system` database if none was specified.
+
+- returns **databaseName** (string): the current database name
+
+### CURRENT_USER()
+
+`CURRENT_USER() → userName`
+
+Return the name of the current user. 
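+
+For example (a minimal sketch; the returned name depends on the authenticated
+user and is *null* if authentication is turned off):
+
+```aql
+RETURN CURRENT_USER()
+```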
+
+The current user is the user account name that was specified in the
+*Authorization* HTTP header of the request. It will only be populated if
+authentication on the server is turned on, and if the query was executed inside
+a request context. Otherwise, the return value of this function will be *null*.
+
+- returns **userName** (string\|null): the current user name, or *null* if
+  authentication is disabled
+
+### DECODE_REV()
+
+`DECODE_REV(revision) → details`
+
+Decompose the specified `revision` string into its components.
+The resulting object has a `date` and a `count` attribute.
+This function is supposed to be called with the `_rev` attribute value
+of a database document as argument.
+
+- **revision** (string): revision ID string
+- returns **details** (object\|null): object with two attributes
+  *date* (string in ISO 8601 format) and *count* (integer number),
+  or *null*
+
+If the input revision ID is not a string or cannot be processed, the function
+issues a warning and returns *null*.
+
+Please note that the result structure may change in future versions of
+ArangoDB in case the internal format of revision strings is modified. Please
+also note that the *date* value in the current result provides the date and
+time of when the document record was put together on the server, but not
+necessarily the time of insertion into the underlying storage engine. Therefore,
+in case of concurrent document operations, the exact document storage order
+cannot be derived unambiguously from the revision value. It should thus be
+treated as a rough estimate of when a document was created or last updated.
+
+```aql
+DECODE_REV( "_YU0HOEG---" )
+// { "date" : "2019-03-11T16:15:05.314Z", "count" : 0 }
+```
+
+### DOCUMENT()
+
+Dynamically look up one or multiple documents from any collections, either using
+a collection name and one or more document keys, or one or more document
+identifiers. The collections do not need to be known at query compile time;
+they can be computed at runtime.
+
+{{< info >}}
+It is recommended to use subqueries with the [`FOR` operation](../high-level-operations/for.md)
+and filters over `DOCUMENT()` whenever the collections are known in advance,
+especially for [joins](../examples-and-query-patterns/joins.md), because they perform better, you
+can add additional filters, and combine them with sorting to get an array of
+documents in a guaranteed order.
+
+Queries that use the `DOCUMENT()` function cannot be
+[**cached**](../execution-and-performance/caching-query-results.md). Each lookup is executed as
+a single operation, the lookups need to be executed on Coordinators for
+sharded collections in cluster deployments, and only primary indexes and no
+projections can be utilized.
+{{< /info >}}
+
+`DOCUMENT(collection, id) → doc`
+
+Return the document identified by `id` (document key or identifier) from the
+specified `collection`.
+
+If the document cannot be found, `null` will be returned.
+If there is a mismatch between the `collection` passed and the collection in
+the document identifier, then `null` will be returned, too.
+
+The `id` parameter can also be an array of document keys or identifiers. In this
+case, the function will return an array of all documents that could be found.
+The results are not guaranteed to be in the requested order. Documents that
+could not be found are not indicated in the result (no `null` values) and do
+not raise warnings either. 
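+
+Where the collections are known in advance, the recommended `FOR` loop pattern
+can replace such a multi-document lookup (a sketch that uses the `persons`
+collection from the examples further below):
+
+```aql
+FOR doc IN persons
+  FILTER doc._key IN ["alice", "bob"]
+  RETURN doc
+```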
+ +- **collection** (string): name of a collection +- **id** (string\|array): a document key, a document identifier, or an array of + document keys, identifiers, or both +- returns **doc** (document\|array\|null): the found document (or `null` if it + was not found), or an array of all found documents **in any order** + +**Examples** + +```aql +--- +name: FUNCTION_DOCUMENT_1 +description: '' +dataset: knows_graph +--- +RETURN DOCUMENT( persons, "persons/alice" ) +``` + +```aql +--- +name: FUNCTION_DOCUMENT_2 +description: '' +dataset: knows_graph +--- +RETURN DOCUMENT( persons, "alice" ) +``` + +```aql +--- +name: FUNCTION_DOCUMENT_3 +description: '' +dataset: knows_graph +--- +RETURN DOCUMENT( persons, [ "persons/alice", "persons/bob" ] ) +``` + +```aql +--- +name: FUNCTION_DOCUMENT_4 +description: '' +dataset: knows_graph +--- +RETURN DOCUMENT( persons, [ "alice", "bob" ] ) +``` + +```aql +--- +name: FUNCTION_DOCUMENT_5 +description: '' +dataset: knows_graph +bindVars: + { + "@coll": "persons", + "key": "alice" + } +--- +RETURN DOCUMENT( @@coll, @key ) +``` + +```aql +--- +name: FUNCTION_DOCUMENT_6 +description: '' +dataset: knows_graph +bindVars: + { + "@coll": "persons", + "keys": ["alice", "bob"] + } +--- +RETURN DOCUMENT( @@coll, @keys ) +``` + +--- + +`DOCUMENT(id) → doc` + +The function can also be used with a single `id` parameter as follows: + +- **id** (string\|array): a document identifier, or an array of identifiers +- returns **doc** (document\|array\|null): the found document (or `null` if it + was not found), or an array of the found documents **in any order** + +**Examples** + +```aql +--- +name: FUNCTION_DOCUMENT_7 +description: '' +dataset: knows_graph +--- +RETURN DOCUMENT("persons/alice") +``` + +```aql +--- +name: FUNCTION_DOCUMENT_8 +description: '' +dataset: knows_graph +--- +RETURN DOCUMENT( [ "persons/alice", "persons/bob" ] ) +``` + +```aql +--- +name: FUNCTION_DOCUMENT_9 +description: '' +dataset: knows_graph +bindVars: + { + "key": "persons/alice" + } +--- +RETURN DOCUMENT( @key ) +``` + +```aql +--- +name: FUNCTION_DOCUMENT_10 +description: '' +dataset: knows_graph +bindVars: + { + "keys": ["persons/alice", "persons/bob"] + } +--- +RETURN DOCUMENT( @keys ) +``` + +```aql +--- +name: FUNCTION_DOCUMENT_11 +description: '' +dataset: knows_graph +bindVars: + { + "key": "bob" + } +--- +RETURN DOCUMENT( CONCAT("persons/", @key) ) +``` + +### LENGTH() + +`LENGTH(coll) → documentCount` + +Determine the amount of documents in a collection. + +It calls [`COLLECTION_COUNT()`](#collection_count) internally. + +- **coll** (collection): a collection (not string) +- returns **documentCount** (number): the total amount of documents in *coll* + +`LENGTH()` can also determine the [number of elements](array.md#length) in an array, +the [number of attribute keys](document-object.md#length) of an object / document and +the [character length](string.md#length) of a string. + +### SHARD_ID() + +`SHARD_ID(collection, shardKeys) → shardId` + +Return the shard in a collection that contains the specified shard keys. + +- **collection** (string): a collection name +- **shardKeys** (object): a set of shard keys and values. Any missing shard key + is substituted with the `null` value. +- returns **shardId** (string): the responsible shard for the specified + shard keys in the given collection. On deployments other than clusters, + the collection name itself is returned. 
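+
+Because missing shard keys are substituted with `null`, passing a partial set
+of shard keys is possible as well (a sketch, assuming the same `observations`
+collection as in the rendered example below):
+
+```aql
+RETURN SHARD_ID("observations", { "time": "2021-05-25 07:15:00" })
+```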
+
+```aql
+---
+name: shard_id1
+description: ''
+type: cluster
+dataset: observationsSampleDataset
+---
+RETURN SHARD_ID("observations", { "time": "2021-05-25 07:15:00", "subject": "xh458", "val": 10 })
+```
+
+## Hash functions
+
+### HASH()
+
+`HASH(value) → hashNumber`
+
+Calculate a hash value for *value*.
+
+- **value** (any): an element of arbitrary type
+- returns **hashNumber** (number): a hash value of *value*
+
+*value* is not required to be a string, but can have any data type. The calculated
+hash value will take the data type of *value* into account, so for example the
+number *1* and the string *"1"* will have different hash values. For arrays, the
+hash values will be equal if the arrays contain exactly the same values
+(including value types) in the same order. For objects, the same hash values will
+be created if the objects have exactly the same attribute names and values
+(including value types). The order in which attributes appear inside objects
+is not important for hashing.
+
+The hash value returned by this function is a number. The hash algorithm is not
+guaranteed to remain the same in future versions of ArangoDB. The hash values
+should therefore be used only for temporary calculations, e.g. to compare if two
+documents are the same, or for grouping values in queries.
+
+### MINHASH()
+
+`MINHASH(values, numHashes) → hashes`
+
+Calculate MinHash signatures for the *values* using locality-sensitive hashing.
+The result can be used to approximate the Jaccard similarity of sets.
+
+- **values** (array): an array with elements of arbitrary type to hash
+- **numHashes** (number): the size of the MinHash signature. Must be
+  greater than or equal to `1`. The signature size defines the probabilistic error
+  (`err = rsqrt(numHashes)`). For an error amount that does not exceed 5%
+  (`0.05`), use a size of `1 / (0.05 * 0.05) = 400`.
+- returns **hashes** (array): an array of strings with the encoded hash values
+
+**Examples**
+
+```aql
+---
+name: aqlMinHash
+description: ''
+---
+RETURN MINHASH(["foo", "bar", "baz"], 5)
+```
+
+### MINHASH_COUNT()
+
+`MINHASH_COUNT(error) → numHashes`
+
+Calculate the number of hashes (MinHash signature size) needed to not exceed the
+specified error amount.
+
+- **error** (number): the probabilistic error you can tolerate in the range `[0, 1)`
+- returns **numHashes** (number): the required number of hashes to not exceed
+  the specified error amount
+
+**Examples**
+
+```aql
+---
+name: aqlMinHashCount
+description: ''
+---
+RETURN MINHASH_COUNT(0.05)
+```
+
+### MINHASH_ERROR()
+
+`MINHASH_ERROR(numHashes) → error`
+
+Calculate the error amount based on the number of hashes (MinHash signature size).
+
+- **numHashes** (number): the number of hashes you want to check
+- returns **error** (number): the probabilistic error to expect with the specified
+  number of hashes
+
+**Examples**
+
+```aql
+---
+name: aqlMinHashError
+description: ''
+---
+RETURN MINHASH_ERROR(400)
+```
+
+### String-based hashing
+
+See the following string functions:
+
+- [`CRC32()`](string.md#crc32)
+- [`FNV64()`](string.md#fnv64)
+- [`MD5()`](string.md#md5)
+- [`SHA1()`](string.md#sha1)
+- [`SHA512()`](string.md#sha512)
+
+## Function calling
+
+### APPLY()
+
+`APPLY(funcName, arguments) → retVal`
+
+Dynamically call the function *funcName* with the arguments specified.
+Arguments are given as an array and are passed as separate parameters to
+the called function.
+
+Both built-in and user-defined functions can be called. 
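+
+A user-defined function is called via its fully qualified name. The following
+is a sketch that assumes a hypothetical UDF `MYFUNCTIONS::DOUBLE`, which
+doubles its numeric argument, has been registered beforehand:
+
+```aql
+APPLY( "MYFUNCTIONS::DOUBLE", [ 21 ] )
+// 42, assuming the hypothetical UDF described above
+```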
+
+- **funcName** (string): a function name
+- **arguments** (array, *optional*): an array with elements of arbitrary type
+- returns **retVal** (any): the return value of the called function
+
+```aql
+APPLY( "SUBSTRING", [ "this is a test", 0, 7 ] )
+// "this is"
+```
+
+### CALL()
+
+`CALL(funcName, arg1, arg2, ... argN) → retVal`
+
+Dynamically call the function *funcName* with the arguments specified.
+Arguments are given as multiple parameters and passed as separate
+parameters to the called function.
+
+Both built-in and user-defined functions can be called.
+
+- **funcName** (string): a function name
+- **args** (any, *repeatable*): an arbitrary number of elements as
+  multiple arguments, can be omitted
+- returns **retVal** (any): the return value of the called function
+
+```aql
+CALL( "SUBSTRING", "this is a test", 0, 4 )
+// "this"
+```
+
+## Other functions
+
+### ASSERT() / WARN()
+
+`ASSERT(expr, message) → retVal`\
+`WARN(expr, message) → retVal`
+
+The two functions evaluate an expression. If the expression evaluates to
+*true*, both functions return *true*. If the expression evaluates to *false*,
+*ASSERT* throws an error while *WARN* issues a warning and returns *false*.
+This behavior allows the use of *ASSERT* and *WARN* in `FILTER` conditions.
+
+- **expr** (expression): AQL expression to be evaluated
+- **message** (string): the message used in the exception or warning if the
+  expression evaluates to *false*
+- returns **retVal** (bool): *true* if the expression evaluates to *true*
+
+```aql
+FOR i IN 1..3 FILTER ASSERT(i > 0, "i is not greater 0") RETURN i
+FOR i IN 1..3 FILTER WARN(i < 2, "i is not smaller 2") RETURN i
+```
+
+### IN_RANGE()
+
+`IN_RANGE(value, low, high, includeLow, includeHigh) → included`
+
+Returns true if *value* is greater than (or equal to) *low* and less than
+(or equal to) *high*. The values can be of different types. They are compared
+as described in [Type and value order](../fundamentals/type-and-value-order.md),
+which makes the behavior identical to the comparison operators `<`, `<=`, `>`,
+and `>=`.
+
+- **value** (any): an element of arbitrary type
+- **low** (any): minimum value of the desired range
+- **high** (any): maximum value of the desired range
+- **includeLow** (bool): whether the minimum value shall be included in
+  the range (left-closed interval) or not (left-open interval)
+- **includeHigh** (bool): whether the maximum value shall be included in
+  the range (right-closed interval) or not (right-open interval)
+- returns **included** (bool): whether *value* is in the range
+
+If *low* and *high* are the same, but *includeLow* and/or *includeHigh* is set
+to `false`, then nothing will match. If *low* is greater than *high*, nothing
+will match either.
+
+{{< info >}}
+The regular `IN_RANGE()` function cannot utilize indexes, unlike its
+ArangoSearch counterpart which can use the View index. 
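+
+A sketch of the ArangoSearch variant, assuming a View named `viewName` that
+indexes the queried attribute:
+
+```aql
+FOR doc IN viewName
+  SEARCH IN_RANGE(doc.value, 3, 5, true, true)
+  RETURN doc
+```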
+{{< /info >}} + +```aql +--- +name: aqlMiscInRange_1 +description: '' +--- +LET value = 4 +RETURN IN_RANGE(value, 3, 5, true, true) +/* same as: + RETURN value >= 3 AND value <= 5 +*/ +``` + + + +```aql +--- +name: aqlMiscInRange_2 +description: '' +--- +FOR value IN 2..6 + RETURN { value, in_range: IN_RANGE(value, 3, 5, false, true) } + /* same as: + RETURN { value, in_range: value > 3 AND value <= 5 } + */ +``` + + + +```aql +--- +name: aqlMiscInRange_3 +description: '' +--- +LET coll = [ + { text: "fennel" }, + { text: "fox grape" }, + { text: "forest strawberry" }, + { text: "fungus" } +] +FOR doc IN coll + FILTER IN_RANGE(doc.text,"fo", "fp", true, false) // values with prefix "fo" + /* same as: + FILTER doc.text >= "fo" AND doc.text < "fp" + */ + RETURN doc +``` + +### PREGEL_RESULT() + +`PREGEL_RESULT(jobId, withId) → results` + +Allows to access results of a Pregel job that are only held in memory. +See [Pregel AQL integration](../../data-science/pregel/_index.md#aql-integration). + +- **jobId** (string): the `id` of a Pregel job +- **withId** (bool): if enabled, then the document `_id` is returned in + addition to the `_key` for each vertex +- returns **results** (array): an array of objects, one element per vertex, with + the attributes computed by the Pregel algorithm and the document key (and + optionally identifier) + +## Internal functions + +The following functions are used during development of ArangoDB as a database +system, primarily for unit testing. They are not intended to be used by end +users, especially not in production environments. + +### FAIL() + +`FAIL(reason)` + +Let a query fail on purpose. Can be used in a conditional branch, or to verify +if lazy evaluation / short circuiting is used for instance. + +- **reason** (string): an error message +- returns nothing, because the query is aborted + +```aql +RETURN 1 == 1 ? "okay" : FAIL("error") // "okay" +RETURN 1 == 1 || FAIL("error") ? true : false // true +RETURN 1 == 2 && FAIL("error") ? true : false // false +RETURN 1 == 1 && FAIL("error") ? true : false // aborted with error +``` + +### NOOPT() / NOEVAL() + +`NOOPT(value) → retVal` + +No-operation that prevents certain query compile-time and run-time optimizations. +Constant expressions can be forced to be evaluated at runtime with this. +This function is marked as non-deterministic so its argument withstands +query optimization. + +`NOEVAL(value) → retVal` + +Same as `NOOPT()`, except that it is marked as deterministic. + +There is no need to call these functions explicitly, they are mainly used for +internal testing. + +- **value** (any): a value of arbitrary type +- returns **retVal** (any): *value* + +```aql +// differences in execution plan (explain) +FOR i IN 1..3 RETURN (1 + 1) // const assignment +FOR i IN 1..3 RETURN NOOPT(1 + 1) // simple expression +FOR i IN 1..3 RETURN NOEVAL(1 + 1) // simple expression + +RETURN NOOPT( 123 ) // evaluates 123 at runtime +RETURN NOOPT( CONCAT("a", "b") ) // evaluates concatenation at runtime +``` + +### PASSTHRU() + +`PASSTHRU(value) → retVal` + +Simply returns its call argument unmodified. There is no need to call this function +explicitly, it is mainly used for internal testing. + +- **value** (any): a value of arbitrary type +- returns **retVal** (any): *value* + +### SCHEMA_GET() + +`SCHEMA_GET(collection) → schema` + +Return the schema definition as defined in the properties of the +specified collection. 
+ +- **collection** (string): name of a collection +- returns **schema** (object): schema definition object + +```aql +RETURN SCHEMA_GET("myColl") +``` + +### SCHEMA_VALIDATE() + +`SCHEMA_VALIDATE(doc, schema) → result` + +Test if the given document is valid according to the schema definition. + +- **doc** (doc): document +- **schema** (object): schema definition object +- returns **result** (object): an object with the following attributes: + - **valid** (bool): `true` if the document fulfills the schema's requirements, + otherwise it will be `false` and *errorMessage* will be set + - **errorMessage** (string): details about the validation failure + +If the input document **doc** is not an object, the function will return +a *null* value and register a warning in the query. + +Using an empty **schema** object is equivalent to specifying a **schema** +value of *null*, which will make all input objects successfully pass the +validation. + +### SLEEP() + +`SLEEP(seconds) → null` + +Wait for a certain amount of time before continuing the query. + +- **seconds** (number): amount of time to wait +- returns a *null* value + +```aql +SLEEP(1) // wait 1 second +SLEEP(0.02) // wait 20 milliseconds +``` + +### V8() + +`V8(expression) → retVal` + +No-operation that enforces the usage of the V8 JavaScript engine. There is +no need to call this function explicitly, it is mainly used for internal +testing. + +- **expression** (any): arbitrary expression +- returns **retVal** (any): the return value of the *expression* + +```aql +// differences in execution plan (explain) +FOR i IN 1..3 RETURN (1 + 1) // const assignment +FOR i IN 1..3 RETURN V8(1 + 1) // simple expression +``` + +### VERSION() + +`VERSION() → serverVersion` + +Returns the server version as a string. In a cluster, returns the version +of the Coordinator. + +- returns **serverVersion** (string): the server version string + +```aql +RETURN VERSION() // e.g. "3.10.0" +``` diff --git a/site/content/arangodb/oem/aql/functions/numeric.md b/site/content/arangodb/oem/aql/functions/numeric.md new file mode 100644 index 0000000000..401bae6b71 --- /dev/null +++ b/site/content/arangodb/oem/aql/functions/numeric.md @@ -0,0 +1,770 @@ +--- +title: Numeric functions in AQL +menuTitle: Numeric +weight: 45 +description: >- + AQL offers functions for numeric calculations +--- +## ABS() + +`ABS(value) → unsignedValue` + +Return the absolute part of *value*. + +- **value** (number): any number, positive or negative +- returns **unsignedValue** (number): the number without + or - sign + +```aql +ABS(-5) // 5 +ABS(+5) // 5 +ABS(3.5) // 3.5 +``` + +## ACOS() + +`ACOS(value) → num` + +Return the arccosine of *value*. + +- **value** (number): the input value +- returns **num** (number\|null): the arccosine of *value*, or *null* if *value* is + outside the valid range -1 and 1 (inclusive) + +```aql +ACOS(-1) // 3.141592653589793 +ACOS(0) // 1.5707963267948966 +ACOS(1) // 0 +ACOS(2) // null +``` + +## ASIN() + +`ASIN(value) → num` + +Return the arcsine of *value*. + +- **value** (number): the input value +- returns **num** (number\|null): the arcsine of *value*, or *null* if *value* is + outside the valid range -1 and 1 (inclusive) + +```aql +ASIN(1) // 1.5707963267948966 +ASIN(0) // 0 +ASIN(-1) // -1.5707963267948966 +ASIN(2) // null +``` + +## ATAN() + +`ATAN(value) → num` + +Return the arctangent of *value*. 
+
+- **value** (number): the input value
+- returns **num** (number): the arctangent of *value*
+
+```aql
+ATAN(-1) // -0.7853981633974483
+ATAN(0) // 0
+ATAN(10) // 1.4711276743037347
+```
+
+## ATAN2()
+
+`ATAN2(y, x) → num`
+
+Return the arctangent of the quotient of *y* and *x*.
+
+- **y** (number): the dividend
+- **x** (number): the divisor
+- returns **num** (number): the arctangent of *y* divided by *x*
+
+```aql
+ATAN2(0, 0) // 0
+ATAN2(1, 0) // 1.5707963267948966
+ATAN2(1, 1) // 0.7853981633974483
+ATAN2(-10, 20) // -0.4636476090008061
+```
+
+## AVERAGE()
+
+`AVERAGE(numArray) → mean`
+
+Return the average (arithmetic mean) of the values in *numArray*.
+
+- **numArray** (array): an array of numbers, *null* values are ignored
+- returns **mean** (number\|null): the average value of *numArray*. If the array is
+  empty or contains *null* values only, *null* will be returned.
+
+```aql
+AVERAGE( [5, 2, 9, 2] ) // 4.5
+AVERAGE( [ -3, -5, 2 ] ) // -2
+AVERAGE( [ 999, 80, 4, 4, 4, 3, 3, 3 ] ) // 137.5
+```
+
+## AVG()
+
+This is an alias for [`AVERAGE()`](#average).
+
+## CEIL()
+
+`CEIL(value) → roundedValue`
+
+Return the integer closest but not less than *value*.
+
+To round downward, see [`FLOOR()`](#floor).\
+To round to the nearest integer value, see [`ROUND()`](#round).
+
+- **value** (number): any number
+- returns **roundedValue** (number): the value rounded to the ceiling
+
+```aql
+CEIL(2.49) // 3
+CEIL(2.50) // 3
+CEIL(-2.50) // -2
+CEIL(-2.51) // -2
+```
+
+## COS()
+
+`COS(value) → num`
+
+Return the cosine of *value*.
+
+- **value** (number): the input value
+- returns **num** (number): the cosine of *value*
+
+```aql
+COS(1) // 0.5403023058681398
+COS(0) // 1
+COS(-3.141592653589793) // -1
+COS(RADIANS(45)) // 0.7071067811865476
+```
+
+## COSINE_SIMILARITY()
+
+Introduced in: v3.9.0
+
+`COSINE_SIMILARITY(x, y) → num`
+
+Return the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity)
+between *x* and *y*.
+
+To calculate the distance, see [`L1_DISTANCE()`](#l1_distance) and
+[`L2_DISTANCE()`](#l2_distance).
+
+- **x** (array): first input array
+- **y** (array): second input array
+- returns **num** (number\|array): the cosine similarity value.
+  If one of the inputs is a nested (2D) array, then an array is returned.
+  The length of each 2D array row should be equal to the length of the second
+  input array in that case.
+
+In case of invalid input values, the function returns **null** and produces a warning.
+
+```aql
+COSINE_SIMILARITY([0,1], [1,0]) // 0
+COSINE_SIMILARITY([[0,1,0,1],[1,0,0,1],[1,1,1,0],[0,0,0,1]], [1,1,1,1]) // [0.707, 0.707, 0.866, 0.5]
+COSINE_SIMILARITY([-1,0], [1,0]) // -1
+```
+
+## DECAY_GAUSS()
+
+Introduced in: v3.9.0
+
+`DECAY_GAUSS(value, origin, scale, offset, decay) → score`
+
+Calculate the score for one or multiple values with a **Gaussian function** that
+decays depending on the distance of a numeric value from a user-given origin. 
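+
+The following is a sketch of the scoring behavior, not taken from the official
+specification but consistent with the examples below: distances up to `offset`
+from `origin` score `1`, and beyond that the score decays as
+
+$$
+\text{score} = \exp\left(-\frac{(|value - origin| - offset)^2}{2\sigma^2}\right),
+\qquad
+\sigma^2 = -\frac{scale^2}{2\ln(decay)}
+$$
+
+so that the score equals `decay` at a distance of `offset + scale` from `origin`.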
+
+- **value** (number\|array): the input value or an array with input values
+- **origin** (number): the point of origin used for calculating the distance
+- **scale** (number): defines the distance from `origin` + `offset` at which
+  the computed score will equal the `decay` parameter
+- **offset** (number): the decay function will be evaluated for distance values
+  greater than the defined offset
+- **decay** (number): the decay parameter defines how input values are scored
+  at the distance given by the `scale` parameter
+- returns **score** (number\|array): a single score or an array of scores
+  depending on the type of the input `value`
+
+```aql
+DECAY_GAUSS(41, 40, 5, 5, 0.5) // 1
+DECAY_GAUSS([20, 41], 40, 5, 5, 0.5) // [0.0019531250000000017, 1.0]
+DECAY_GAUSS(49.9889, 49.987, 0.001, 0.001, 0.2) // 0.2715403018822964
+```
+
+## DECAY_EXP()
+
+Introduced in: v3.9.0
+
+`DECAY_EXP(value, origin, scale, offset, decay) → score`
+
+Calculate the score for one or multiple values with an **exponential function**
+that decays depending on the distance of a numeric value from a user-given origin.
+
+- **value** (number\|array): the input value or an array with input values
+- **origin** (number): the point of origin used for calculating the distance
+- **scale** (number): defines the distance from `origin` + `offset` at which
+  the computed score will equal the `decay` parameter
+- **offset** (number): the decay function will be evaluated for distance values
+  greater than the defined offset
+- **decay** (number): the decay parameter defines how input values are scored
+  at the distance given by the `scale` parameter
+- returns **score** (number\|array): a single score or an array of scores
+  depending on the type of the input `value`
+
+```aql
+DECAY_EXP(41, 40, 5, 5, 0.7) // 1
+DECAY_EXP(2, 0, 10, 0, 0.2) // 0.7247796636776955
+DECAY_EXP(49.9889, 50, 0.001, 0.001, 0.2) // 8.717720806626885e-08
+```
+
+## DECAY_LINEAR()
+
+Introduced in: v3.9.0
+
+`DECAY_LINEAR(value, origin, scale, offset, decay) → score`
+
+Calculate the score for one or multiple values with a **linear function** that
+decays depending on the distance of a numeric value from a user-given origin.
+
+- **value** (number\|array): the input value or an array with input values
+- **origin** (number): the point of origin used for calculating the distance
+- **scale** (number): defines the distance from `origin` + `offset` at which
+  the computed score will equal the `decay` parameter
+- **offset** (number): the decay function will be evaluated for distance values
+  greater than the defined offset
+- **decay** (number): the decay parameter defines how input values are scored
+  at the distance given by the `scale` parameter
+- returns **score** (number\|array): a single score or an array of scores
+  depending on the type of the input `value`
+
+```aql
+DECAY_LINEAR(41, 40, 5, 5, 0.5) // 1
+DECAY_LINEAR(9.8, 0, 10, 0, 0.2) // 0.21599999999999994
+DECAY_LINEAR(5..7, 0, 10, 0, 0.2) // [0.6, 0.52, 0.44]
+```
+
+## DEGREES()
+
+`DEGREES(rad) → num`
+
+Return the angle converted from radians to degrees.
+
+- **rad** (number): the input value
+- returns **num** (number): the angle in degrees
+
+```aql
+DEGREES(0.7853981633974483) // 45
+DEGREES(0) // 0
+DEGREES(3.141592653589793) // 180
+```
+
+## EXP()
+
+`EXP(value) → num`
+
+Return Euler's constant (2.71828...) raised to the power of *value*. 
+
+- **value** (number): the input value
+- returns **num** (number): Euler's constant raised to the power of *value*
+
+```aql
+EXP(1) // 2.718281828459045
+EXP(10) // 22026.46579480671
+EXP(0) // 1
+```
+
+## EXP2()
+
+`EXP2(value) → num`
+
+Return 2 raised to the power of *value*.
+
+- **value** (number): the input value
+- returns **num** (number): 2 raised to the power of *value*
+
+```aql
+EXP2(16) // 65536
+EXP2(1) // 2
+EXP2(0) // 1
+```
+
+## FLOOR()
+
+`FLOOR(value) → roundedValue`
+
+Return the integer closest but not greater than *value*.
+
+To round upward, see [`CEIL()`](#ceil).\
+To round to the nearest integer value, see [`ROUND()`](#round).
+
+- **value** (number): any number
+- returns **roundedValue** (number): the value rounded downward
+
+```aql
+FLOOR(2.49) // 2
+FLOOR(2.50) // 2
+FLOOR(-2.50) // -3
+FLOOR(-2.51) // -3
+```
+
+## LOG()
+
+`LOG(value) → num`
+
+Return the natural logarithm of *value*. The base is Euler's
+constant (2.71828...).
+
+- **value** (number): the input value
+- returns **num** (number\|null): the natural logarithm of *value*, or *null* if *value* is
+  less than or equal to 0
+
+```aql
+LOG(2.718281828459045) // 1
+LOG(10) // 2.302585092994046
+LOG(0) // null
+```
+
+## LOG2()
+
+`LOG2(value) → num`
+
+Return the base 2 logarithm of *value*.
+
+- **value** (number): the input value
+- returns **num** (number\|null): the base 2 logarithm of *value*, or *null* if *value* is
+  less than or equal to 0
+
+```aql
+LOG2(1024) // 10
+LOG2(8) // 3
+LOG2(0) // null
+```
+
+## LOG10()
+
+`LOG10(value) → num`
+
+Return the base 10 logarithm of *value*.
+
+- **value** (number): the input value
+- returns **num** (number\|null): the base 10 logarithm of *value*, or *null* if *value* is
+  less than or equal to 0
+
+```aql
+LOG10(10000) // 4
+LOG10(10) // 1
+LOG10(0) // null
+```
+
+## L1_DISTANCE()
+
+Introduced in: v3.9.0
+
+`L1_DISTANCE(x, y) → num`
+
+Return the [Manhattan distance](https://en.wikipedia.org/wiki/Taxicab_geometry)
+between *x* and *y*.
+
+To calculate the similarity, see [`COSINE_SIMILARITY()`](#cosine_similarity).
+
+- **x** (array): first input array
+- **y** (array): second input array
+- returns **num** (number\|array): the L1 distance value.
+  If one of the inputs is a nested (2D) array, then an array is returned.
+  The length of each inner array should be equal to the length of the second
+  input array in that case.
+
+In case of invalid input values, the function returns **null** and produces a warning.
+
+```aql
+L1_DISTANCE([-1,-1], [2,2]) // 6
+L1_DISTANCE([[1,2,3],[-1,-2,-3],[3,4,5],[-5,2,1]], [1,1,1]) // [3,9,9,7]
+L1_DISTANCE([1.5], [3]) // 1.5
+```
+
+## L2_DISTANCE()
+
+Introduced in: v3.9.0
+
+`L2_DISTANCE(x, y) → num`
+
+Return the [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance)
+between *x* and *y*.
+
+To calculate the similarity, see [`COSINE_SIMILARITY()`](#cosine_similarity).
+
+- **x** (array): first input array
+- **y** (array): second input array
+- returns **num** (number\|array): the L2 distance value.
+  If one of the inputs is a nested (2D) array, then an array is returned.
+  The length of each inner array should be equal to the length of the second
+  input array in that case.
+
+In case of invalid input values, the function returns **null** and produces a warning. 
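+
+For reference, the value computed for two equal-length arrays is the standard
+Euclidean distance (consistent with the examples below):
+
+$$
+d(x, y) = \sqrt{\sum_{i=1}^{n} (x_i - y_i)^2}
+$$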
+ +```aql +L2_DISTANCE([1,1], [5,2]) // 4.1231056256176606 +L2_DISTANCE([[1,2,3], [4,5,6], [7,8,9]], [3,2,1]) // [2.8284271247461903, 5.916079783099616, 10.770329614269007] +L2_DISTANCE([0,1], [1,0]) // 1.4142135623730951 +``` + +## MAX() + +`MAX(anyArray) → max` + +Return the greatest element of *anyArray*. The array is not limited to numbers. +Also see [type and value order](../fundamentals/type-and-value-order.md). + +- **anyArray** (array): an array of numbers, *null* values are ignored +- returns **max** (any\|null): the element with the greatest value. If the array is + empty or contains *null* values only, the function will return *null*. + +```aql +MAX( [5, 9, -2, null, 1] ) // 9 +MAX( [ null, null ] ) // null +``` + +## MEDIAN() + +`MEDIAN(numArray) → median` + +Return the median value of the values in *array*. + +The array is sorted and the element in the middle is returned. If the array has an +even length of elements, the two center-most elements are interpolated by calculating +the average value (arithmetic mean). + +- **numArray** (array): an array of numbers, *null* values are ignored +- returns **median** (number\|null): the median of *numArray*. If the array is + empty or contains *null* values only, the function will return *null*. + +```aql +MEDIAN( [ 1, 2, 3] ) // 2 +MEDIAN( [ 1, 2, 3, 4 ] ) // 2.5 +MEDIAN( [ 4, 2, 3, 1 ] ) // 2.5 +MEDIAN( [ 999, 80, 4, 4, 4, 3, 3, 3 ] ) // 4 +``` + +## MIN() + +`MIN(anyArray) → min` + +Return the smallest element of *anyArray*. The array is not limited to numbers. +Also see [type and value order](../fundamentals/type-and-value-order.md). + +- **anyArray** (array): an array of numbers, *null* values are ignored +- returns **min** (any\|null): the element with the smallest value. If the array is + empty or contains *null* values only, the function will return *null*. + +```aql +MIN( [5, 9, -2, null, 1] ) // -2 +MIN( [ null, null ] ) // null +``` + +## PERCENTILE() + +`PERCENTILE(numArray, n, method) → percentile` + +Return the *n*th percentile of the values in *numArray*. + +- **numArray** (array): an array of numbers, *null* values are ignored +- **n** (number): must be between 0 (excluded) and 100 (included) +- **method** (string, *optional*): "rank" (default) or "interpolation" +- returns **percentile** (number\|null): the *n*th percentile, or *null* if the + array is empty or only *null* values are contained in it or the percentile + cannot be calculated + +```aql +PERCENTILE( [1, 2, 3, 4], 50 ) // 2 +PERCENTILE( [1, 2, 3, 4], 50, "rank" ) // 2 +PERCENTILE( [1, 2, 3, 4], 50, "interpolation" ) // 2.5 +``` + +## PI() + +`PI() → pi` + +Return pi. + +- returns **pi** (number): the first few significant digits of pi (3.141592653589793) + +```aql +PI() // 3.141592653589793 +``` + +## POW() + +`POW(base, exp) → num` + +Return the *base* to the exponent *exp*. + +- **base** (number): the base value +- **exp** (number): the exponent value +- returns **num** (number): the exponentiated value + +```aql +POW( 2, 4 ) // 16 +POW( 5, -1 ) // 0.2 +POW( 5, 0 ) // 1 +``` + +## PRODUCT() + +`PRODUCT(numArray) → product` + +Return the product of the values in *array*. + +- **numArray** (array): an array of numbers, *null* values are ignored +- returns **product** (number): the product of all values in *numArray*. If the array + is empty or only *null* values are contained in the array, *1* will be returned. 
+ +```aql +PRODUCT( [1, 2, 3, 4] ) // 24 +PRODUCT( [null, -5, 6] ) // -30 +PRODUCT( [ ] ) // 1 +``` + +## RADIANS() + +`RADIANS(deg) → num` + +Return the angle converted from degrees to radians. + +- **deg** (number): the input value +- returns **num** (number): the angle in radians + +```aql +RADIANS(180) // 3.141592653589793 +RADIANS(90) // 1.5707963267948966 +RADIANS(0) // 0 +``` + +## RAND() + +`RAND() → randomNumber` + +Return a pseudo-random number between 0 and 1. + +- returns **randomNumber** (number): a number greater than 0 and less than 1 + +```aql +RAND() // 0.3503170117504508 +RAND() // 0.6138226173882478 +``` + +Complex example: + +```aql +LET coinFlips = ( + FOR i IN 1..100000 + RETURN RAND() > 0.5 ? "heads" : "tails" +) +RETURN MERGE( + FOR flip IN coinFlips + COLLECT f = flip WITH COUNT INTO count + RETURN { [f]: count } +) +``` + +Result: + +```json +[ + { + "heads": 49902, + "tails": 50098 + } +] +``` + +## RANGE() + +`RANGE(start, stop, step) → numArray` + +Return an array of numbers in the specified range, optionally with increments +other than 1. The *start* and *stop* arguments are truncated to integers +unless a *step* argument is provided. + +Also see the [range operator](../operators.md#range-operator) for ranges +with integer bounds and a step size of 1. + +- **start** (number): the value to start the range at (inclusive) +- **stop** (number): the value to end the range with (inclusive) +- **step** (number, *optional*): how much to increment in every step, + the default is *1.0* +- returns **numArray** (array): all numbers in the range as array + +```aql +RANGE(1, 4) // [ 1, 2, 3, 4 ] +RANGE(1, 4, 2) // [ 1, 3 ] +RANGE(1, 4, 3) // [ 1, 4 ] +RANGE(1.5, 2.5) // [ 1, 2 ] +RANGE(1.5, 2.5, 1) // [ 1.5, 2.5 ] +RANGE(1.5, 2.5, 0.5) // [ 1.5, 2, 2.5 ] +RANGE(-0.75, 1.1, 0.5) // [ -0.75, -0.25, 0.25, 0.75 ] +``` + +## ROUND() + +`ROUND(value) → roundedValue` + +Return the integer closest to *value*. + +- **value** (number): any number +- returns **roundedValue** (number): the value rounded to the closest integer + +```aql +ROUND(2.49) // 2 +ROUND(2.50) // 3 +ROUND(-2.50) // -2 +ROUND(-2.51) // -3 +``` + +Rounding towards zero, also known as `trunc()` in C/C++, can be achieved with +a combination of the [ternary operator](../operators.md#ternary-operator), +[`CEIL()`](#ceil) and [`FLOOR()`](#floor): + +```aql +value >= 0 ? FLOOR(value) : CEIL(value) +``` + +## SIN() + +`SIN(value) → num` + +Return the sine of *value*. + +- **value** (number): the input value +- returns **num** (number): the sine of *value* + +```aql +SIN(3.141592653589783 / 2) // 1 +SIN(0) // 0 +SIN(-3.141592653589783 / 2) // -1 +SIN(RADIANS(270)) // -1 +``` + +## SQRT() + +`SQRT(value) → squareRoot` + +Return the square root of *value*. + +- **value** (number): a number +- returns **squareRoot** (number): the square root of *value* + +```aql +SQRT(9) // 3 +SQRT(2) // 1.4142135623730951 +``` + +Other roots can be calculated with [`POW()`](#pow) like `POW(value, 1/n)`: + +```aql +// 4th root of 8*8*8*8 = 4096 +POW(4096, 1/4) // 8 + +// cube root of 3*3*3 = 27 +POW(27, 1/3) // 3 + +// square root of 3*3 = 9 +POW(9, 1/2) // 3 +``` + +## STDDEV_POPULATION() + +`STDDEV_POPULATION(numArray) → num` + +Return the population standard deviation of the values in *array*. + +- **numArray** (array): an array of numbers, *null* values are ignored +- returns **num** (number\|null): the population standard deviation of *numArray*. 
If the array is empty or only *null* values are contained in the array,
+  *null* will be returned.
+
+```aql
+STDDEV_POPULATION( [ 1, 3, 6, 5, 2 ] ) // 1.854723699099141
+```
+
+## STDDEV_SAMPLE()
+
+`STDDEV_SAMPLE(numArray) → num`
+
+Return the sample standard deviation of the values in *numArray*.
+
+- **numArray** (array): an array of numbers, *null* values are ignored
+- returns **num** (number\|null): the sample standard deviation of *numArray*.
+  If the array is empty or only *null* values are contained in the array,
+  *null* will be returned.
+
+```aql
+STDDEV_SAMPLE( [ 1, 3, 6, 5, 2 ] ) // 2.0736441353327724
+```
+
+## STDDEV()
+
+This is an alias for [`STDDEV_POPULATION()`](#stddev_population).
+
+## SUM()
+
+`SUM(numArray) → sum`
+
+Return the sum of the values in *numArray*.
+
+- **numArray** (array): an array of numbers, *null* values are ignored
+- returns **sum** (number): the total of all values in *numArray*. If the array
+  is empty or only *null* values are contained in the array, *0* will be returned.
+
+```aql
+SUM( [1, 2, 3, 4] ) // 10
+SUM( [null, -5, 6] ) // 1
+SUM( [ ] ) // 0
+```
+
+## TAN()
+
+`TAN(value) → num`
+
+Return the tangent of *value*.
+
+- **value** (number): the input value
+- returns **num** (number): the tangent of *value*
+
+```aql
+TAN(10) // 0.6483608274590866
+TAN(5) // -3.380515006246586
+TAN(0) // 0
+```
+
+## VARIANCE_POPULATION()
+
+`VARIANCE_POPULATION(numArray) → num`
+
+Return the population variance of the values in *numArray*.
+
+- **numArray** (array): an array of numbers, *null* values are ignored
+- returns **num** (number\|null): the population variance of *numArray*.
+  If the array is empty or only *null* values are contained in the array,
+  *null* will be returned.
+
+```aql
+VARIANCE_POPULATION( [ 1, 3, 6, 5, 2 ] ) // 3.4400000000000004
+```
+
+## VARIANCE_SAMPLE()
+
+`VARIANCE_SAMPLE(numArray) → num`
+
+Return the sample variance of the values in *numArray*.
+
+- **numArray** (array): an array of numbers, *null* values are ignored
+- returns **num** (number\|null): the sample variance of *numArray*.
+  If the array is empty or only *null* values are contained in the array,
+  *null* will be returned.
+
+```aql
+VARIANCE_SAMPLE( [ 1, 3, 6, 5, 2 ] ) // 4.300000000000001
+```
+
+## VARIANCE()
+
+This is an alias for [`VARIANCE_POPULATION()`](#variance_population).
diff --git a/site/content/arangodb/oem/aql/functions/string.md b/site/content/arangodb/oem/aql/functions/string.md
new file mode 100644
index 0000000000..772f3c663e
--- /dev/null
+++ b/site/content/arangodb/oem/aql/functions/string.md
@@ -0,0 +1,2070 @@
+---
+title: String functions in AQL
+menuTitle: String
+weight: 50
+description: >-
+  AQL offers functions for string processing
+---
+## CHAR_LENGTH()
+
+`CHAR_LENGTH(str) → length`
+
+Return the number of characters in `str` (not byte length).
+
+| Input  | Length |
+|--------|--------|
+| String | Number of Unicode characters |
+| Number | Number of Unicode characters that represent the number |
+| Array  | Number of Unicode characters from the resulting stringification |
+| Object | Number of Unicode characters from the resulting stringification |
+| true   | 4 |
+| false  | 5 |
+| null   | 0 |
+
+- **str** (string): a string. If a number is passed, it is cast to a string first. 
+- returns **length** (number): the character length of `str` (not byte length) + +**Examples** + +```aql +--- +name: aqlCharLength_1 +description: '' +--- +RETURN CHAR_LENGTH("foo") +``` + +```aql +--- +name: aqlCharLength_2 +description: '' +--- +LET value = {foo: "bar"} +RETURN { + str: JSON_STRINGIFY(value), + len: CHAR_LENGTH(value) +} +``` + +## CONCAT() + +`CONCAT(value1, value2, ... valueN) → str` + +Concatenate the values passed as `value1` to `valueN`. + +- **values** (any, *repeatable*): elements of arbitrary type (at least 1) +- returns **str** (string): a concatenation of the elements. `null` values + are ignored. Array and object values are JSON-encoded in their entirety. + +**Examples** + +```aql +--- +name: aqlConcatStrings_1 +description: '' +--- +RETURN CONCAT("foo", "bar", "baz") +``` + +```aql +--- +name: aqlConcatNumbers_1 +description: '' +--- +RETURN CONCAT(1, 2, 3) +``` + +```aql +--- +name: aqlConcatPrimitiveTypes_1 +description: '' +--- +RETURN CONCAT(null, false, 0, true, "") +``` + +```aql +--- +name: aqlConcatCompoundTypes_1 +description: '' +--- +RETURN CONCAT([5, 6], {foo: "bar"}) +``` + +--- + +`CONCAT(anyArray) → str` + +If a single array is passed to `CONCAT()`, its members are concatenated. + +- **anyArray** (array): array with elements of arbitrary type +- returns **str** (string): a concatenation of the array elements. `null` values + are ignored. Array and object values are JSON-encoded in their entirety. + +```aql +--- +name: aqlConcatStrings_2 +description: '' +--- +RETURN CONCAT( [ "foo", "bar", "baz" ] ) +``` + +```aql +--- +name: aqlConcatNumbers_2 +description: '' +--- +RETURN CONCAT( [1, 2, 3] ) +``` + +```aql +--- +name: aqlConcatPrimitiveTypes_2 +description: '' +--- +RETURN CONCAT( [null, false, 0, true, ""] ) +``` + +```aql +--- +name: aqlConcatCompoundTypes_2 +description: '' +--- +RETURN CONCAT( [[5, 6], {foo: "bar"}] ) +``` + +## CONCAT_SEPARATOR() + +`CONCAT_SEPARATOR(separator, value1, value2, ... valueN) → joinedString` + +Concatenate the strings passed as arguments `value1` to `valueN` using the +*separator* string. + +- **separator** (string): an arbitrary separator string +- **values** (string\|array, *repeatable*): strings or arrays of strings as multiple + arguments (at least 1) +- returns **joinedString** (string): a concatenated string of the elements, using + `separator` as separator string. `null` values are ignored. Array and object + values are JSON-encoded in their entirety. + +**Examples** + +```aql +--- +name: aqlConcatSeparatorStrings_1 +description: '' +--- +RETURN CONCAT_SEPARATOR(", ", "foo", "bar", "baz") +``` + +```aql +--- +name: aqlConcatSeparatorNumbers_1 +description: '' +--- +RETURN CONCAT_SEPARATOR(", ", 1, 2, 3) +``` + +```aql +--- +name: aqlConcatSeparatorPrimitiveTypes_1 +description: '' +--- +RETURN CONCAT_SEPARATOR(", ", null, false, 0, true, "") +``` + +```aql +--- +name: aqlConcatSeparatorCompoundTypes_1 +description: '' +--- +RETURN CONCAT_SEPARATOR(", ", [5, 6], {foo: "bar"}) +``` + +--- + +`CONCAT_SEPARATOR(separator, anyArray) → joinedString` + +If a single array is passed as second argument to `CONCAT_SEPARATOR()`, its +members are concatenated. + +- **separator** (string): an arbitrary separator string +- **anyArray** (array): array with elements of arbitrary type +- returns **joinedString** (string): a concatenated string of the elements, using + `separator` as separator string. `null` values are ignored. Array and object + values are JSON-encoded in their entirety. 
+ +```aql +--- +name: aqlConcatSeparatorStrings_2 +description: '' +--- +RETURN CONCAT_SEPARATOR(", ", ["foo", "bar", "baz"]) +``` + +```aql +--- +name: aqlConcatSeparatorNumbers_2 +description: '' +--- +RETURN CONCAT_SEPARATOR(", ", [1, 2, 3]) +``` + +```aql +--- +name: aqlConcatSeparatorPrimitiveTypes_2 +description: '' +--- +RETURN CONCAT_SEPARATOR(", ", [null, false, 0, true, ""]) +``` + +```aql +--- +name: aqlConcatSeparatorCompoundTypes_2 +description: '' +--- +RETURN CONCAT_SEPARATOR(", ", [[5, 6], {foo: "bar"}]) +``` + +## CONTAINS() + +`CONTAINS(text, search, returnIndex) → match` + +Check whether the string `search` is contained in the string `text`. +The string matching performed by `CONTAINS()` is case-sensitive. + +To determine if or at which position a value is included in an **array**, see the +[`POSITION()` array function](array.md#position). + +- **text** (string): the haystack +- **search** (string): the needle +- **returnIndex** (bool, *optional*): if set to `true`, the character position + of the match is returned instead of a boolean. The default is `false`. +- returns **match** (bool\|number): by default, `true` is returned if `search` + is contained in `text`, and `false` otherwise. With `returnIndex` set to `true`, + the position of the first occurrence of `search` within `text` is returned + (starting at offset 0), or `-1` if it is not contained. + +**Examples** + +```aql +--- +name: aqlContainsMatch +description: '' +--- +RETURN CONTAINS("foobarbaz", "bar") +``` + +```aql +--- +name: aqlContains +description: '' +--- +RETURN CONTAINS("foobarbaz", "horse") +``` + +```aql +--- +name: aqlContainsMatchIndex +description: '' +--- +RETURN CONTAINS("foobarbaz", "bar", true) +``` + +```aql +--- +name: aqlContainsNoMatchIndex +description: '' +--- +RETURN CONTAINS("foobarbaz", "horse", true) +``` + +## COUNT() + +This is an alias for [`LENGTH()`](#length). + +## CRC32() + +`CRC32(text) → hash` + +Calculate the CRC32 checksum for `text` and return it in a hexadecimal +string representation. The polynomial used is `0x1EDC6F41`. The initial +value used is `0xFFFFFFFF`, and the final XOR value is also `0xFFFFFFFF`. + +- **text** (string): a string +- returns **hash** (string): CRC32 checksum as hex string + +**Examples** + +```aql +--- +name: aqlCrc32 +description: '' +--- +RETURN CRC32("foobar") +``` + +## ENCODE_URI_COMPONENT() + +`ENCODE_URI_COMPONENT(value) → encodedString` + +Return the URI component-encoded string of `value`. + +- **value** (string): a string +- returns **encodedString** (string): the URI component-encoded `value` + +**Examples** + +```aql +--- +name: aqlEncodeUriComponent +description: '' +--- +RETURN ENCODE_URI_COMPONENT("fünf %") +``` + +## FIND_FIRST() + +`FIND_FIRST(text, search, start, end) → position` + +Return the position of the first occurrence of the string `search` inside the +string `text`. Positions start at 0. + +- **text** (string): the haystack +- **search** (string): the needle +- **start** (number, *optional*): limit the search to a subset of the text, + beginning at `start` +- **end** (number, *optional*): limit the search to a subset of the text, + ending at `end` +- returns **position** (number): the character position of the match. If `search` + is not contained in `text`, -1 is returned. If `search` is empty, `start` is returned. 
+ +**Examples** + +```aql +--- +name: aqlFindFirst_1 +description: '' +--- +RETURN FIND_FIRST("foobarbaz", "ba") +``` + +```aql +--- +name: aqlFindFirst_2 +description: '' +--- +RETURN FIND_FIRST("foobarbaz", "ba", 4) +``` + +```aql +--- +name: aqlFindFirst_3 +description: '' +--- +RETURN FIND_FIRST("foobarbaz", "ba", 0, 3) +``` + +## FIND_LAST() + +`FIND_LAST(text, search, start, end) → position` + +Return the position of the last occurrence of the string `search` inside the +string `text`. Positions start at 0. + +- **text** (string): the haystack +- **search** (string): the needle +- **start** (number, *optional*): limit the search to a subset of the text, + beginning at *start* +- **end** (number, *optional*): limit the search to a subset of the text, + ending at *end* +- returns **position** (number): the character position of the match. If `search` + is not contained in `text`, -1 is returned. + If `search` is empty, the string length is returned, or `end` + 1. + +**Examples** + +```aql +--- +name: aqlFindLast_1 +description: '' +--- +RETURN FIND_LAST("foobarbaz", "ba") +``` + +```aql +--- +name: aqlFindLast_2 +description: '' +--- +RETURN FIND_LAST("foobarbaz", "ba", 7) +``` + +```aql +--- +name: aqlFindLast_3 +description: '' +--- +RETURN FIND_LAST("foobarbaz", "ba", 0, 4) +``` + +## FNV64() + +`FNV64(text) → hash` + +Calculate the FNV-1A 64 bit hash for `text` and return it in a hexadecimal +string representation. + +- **text** (string): a string +- returns **hash** (string): FNV-1A hash as hex string + +**Examples** + +```aql +--- +name: aqlFnv64 +description: '' +--- +RETURN FNV64("foobar") +``` + +## IPV4_FROM_NUMBER() + +`IPV4_FROM_NUMBER(numericAddress) → stringAddress` + +Converts a numeric IPv4 address value into its string representation. + +- **numericAddress** (number): a numeric representation of an IPv4 address, for + example produced by [`IPV4_TO_NUMBER()`](#ipv4_to_number). The number must be + an unsigned integer between 0 and 4294967295 (both inclusive). +- returns **stringAddress** (string): the string representation of the IPv4 + address. If the input `numberAddress` is not a valid representation of an + IPv4 address, the function returns `null` and produces a warning. + +**Examples** + +```aql +--- +name: aqlIPv4FromNumber_1 +description: '' +--- +RETURN IPV4_FROM_NUMBER(0) +``` + +```aql +--- +name: aqlIPv4FromNumber_2 +description: '' +--- +RETURN IPV4_FROM_NUMBER(134744072) +``` + +```aql +--- +name: aqlIPv4FromNumber_3 +description: '' +--- +RETURN IPV4_FROM_NUMBER(2130706433) +``` + +```aql +--- +name: aqlIPv4FromNumber_4 +description: '' +--- +RETURN IPV4_FROM_NUMBER(3232235521) +``` + +```aql +--- +name: aqlIPv4FromNumber_5 +description: '' +--- +RETURN IPV4_FROM_NUMBER(-23) // invalid, produces a warning +``` + +## IPV4_TO_NUMBER() + +`IPV4_TO_NUMBER(stringAddress) → numericAddress` + +Converts an IPv4 address string into its numeric representation. + +- **stringAddress** (string): a string representing an IPv4 address +- returns **numericAddress** (number): the numeric representation of the IPv4 + address, as an unsigned integer. If the input `stringAddress` is not a valid + representation of an IPv4 address, the function returns `null` and produces + a warning. 
+
+**Examples**
+
+```aql
+---
+name: aqlIPv4ToNumber_1
+description: ''
+---
+RETURN IPV4_TO_NUMBER("0.0.0.0")
+```
+
+```aql
+---
+name: aqlIPv4ToNumber_2
+description: ''
+---
+RETURN IPV4_TO_NUMBER("8.8.8.8")
+```
+
+```aql
+---
+name: aqlIPv4ToNumber_3
+description: ''
+---
+RETURN IPV4_TO_NUMBER("127.0.0.1")
+```
+
+```aql
+---
+name: aqlIPv4ToNumber_4
+description: ''
+---
+RETURN IPV4_TO_NUMBER("192.168.0.1")
+```
+
+```aql
+---
+name: aqlIPv4ToNumber_5
+description: ''
+---
+RETURN IPV4_TO_NUMBER("milk") // invalid, produces a warning
+```
+
+## IS_IPV4()
+
+`IS_IPV4(value) → bool`
+
+Check if an arbitrary string is suitable for interpretation as an IPv4 address.
+
+- **value** (string): an arbitrary string
+- returns **bool** (bool): `true` if `value` is a string that can be interpreted
+  as an IPv4 address. To be considered valid, the string must consist of 4 octets
+  of decimal numbers, each 1 to 3 digits long, allowing the values 0 to 255.
+  The octets must be separated by periods and must not have leading zeroes.
+
+**Examples**
+
+```aql
+---
+name: aqlIsIPv4_1
+description: ''
+---
+RETURN IS_IPV4("127.0.0.1")
+```
+
+```aql
+---
+name: aqlIsIPv4_2
+description: ''
+---
+RETURN IS_IPV4("8.8.8.8")
+```
+
+```aql
+---
+name: aqlIsIPv4_3
+description: ''
+---
+RETURN IS_IPV4("008.008.008.008")
+```
+
+```aql
+---
+name: aqlIsIPv4_4
+description: ''
+---
+RETURN IS_IPV4("12345.2.3.4")
+```
+
+```aql
+---
+name: aqlIsIPv4_5
+description: ''
+---
+RETURN IS_IPV4("12.34")
+```
+
+```aql
+---
+name: aqlIsIPv4_6
+description: ''
+---
+RETURN IS_IPV4(8888)
+```
+
+## JSON_PARSE()
+
+`JSON_PARSE(text) → value`
+
+Return an AQL value described by the JSON-encoded input string.
+
+- **text** (string): the string to parse as JSON
+- returns **value** (any): the value corresponding to the given JSON text.
+  For input values that are not valid JSON strings, the function returns `null`.
+
+**Examples**
+
+```aql
+---
+name: aqlJsonParse_1
+description: ''
+---
+RETURN JSON_PARSE("123")
+```
+
+```aql
+---
+name: aqlJsonParse_2
+description: ''
+---
+RETURN JSON_PARSE("[ true, false, null, -0.5 ]")
+```
+
+```aql
+---
+name: aqlJsonParse_3
+description: ''
+---
+RETURN JSON_PARSE('{"a": 1}')
+```
+
+```aql
+---
+name: aqlJsonParse_4
+description: ''
+---
+RETURN JSON_PARSE('"abc"')
+```
+
+```aql
+---
+name: aqlJsonParse_5
+description: ''
+---
+RETURN JSON_PARSE("abc") // invalid JSON
+```
+
+## JSON_STRINGIFY()
+
+`JSON_STRINGIFY(value) → text`
+
+Return a JSON string representation of the input value.
+
+- **value** (any): the value to convert to a JSON string
+- returns **text** (string): the JSON string representing `value`.
+  For input values that cannot be converted to JSON, the function
+  returns `null`.
+
+**Examples**
+
+```aql
+---
+name: aqlJsonStringify_1
+description: ''
+---
+RETURN JSON_STRINGIFY(true)
+```
+
+```aql
+---
+name: aqlJsonStringify_2
+description: ''
+---
+RETURN JSON_STRINGIFY("abc")
+```
+
+```aql
+---
+name: aqlJsonStringify_3
+description: ''
+---
+RETURN JSON_STRINGIFY( [1, {'2': .5}] )
+```
+
+## LEFT()
+
+`LEFT(value, n) → substring`
+
+Return the `n` leftmost characters of the string `value`.
+
+To return the rightmost characters, see [`RIGHT()`](#right).\
+To take a substring from an arbitrary position in the string,
+see [`SUBSTRING()`](#substring).
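+
+Assuming a non-negative `n`, `LEFT()` behaves like `SUBSTRING()` with an
+offset of `0`. A quick sketch of this equivalence:
+
+```aql
+RETURN [
+  LEFT("foobar", 3),        // "foo"
+  SUBSTRING("foobar", 0, 3) // "foo"
+]
+```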
+
+- **value** (string): a string
+- **n** (number): how many characters to return
+- returns **substring** (string): at most `n` characters of `value`,
+  starting on the left-hand side of the string
+
+**Examples**
+
+```aql
+---
+name: aqlLeft_1
+description: ''
+---
+RETURN LEFT("foobar", 3)
+```
+
+```aql
+---
+name: aqlLeft_2
+description: ''
+---
+RETURN LEFT("foobar", 10)
+```
+
+## LENGTH()
+
+`LENGTH(str) → length`
+
+Determine the character length of a string.
+
+- **str** (string): a string. If a number is passed, it is cast to a string first.
+- returns **length** (number): the character length of `str` (not byte length)
+
+`LENGTH()` can also determine the [number of elements](array.md#length) in an array,
+the [number of attribute keys](document-object.md#length) of an object / document and
+the [number of documents](miscellaneous.md#length) in a collection.
+
+**Examples**
+
+```aql
+---
+name: aqlLengthString_1
+description: ''
+---
+RETURN LENGTH("foobar")
+```
+
+```aql
+---
+name: aqlLengthString_2
+description: ''
+---
+RETURN LENGTH("电脑坏了")
+```
+
+## LEVENSHTEIN_DISTANCE()
+
+`LEVENSHTEIN_DISTANCE(value1, value2) → distance`
+
+Calculate the [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
+between two strings.
+
+- **value1** (string): a string
+- **value2** (string): a string
+- returns **distance** (number): calculated Damerau-Levenshtein distance
+  between the input strings `value1` and `value2`
+
+**Examples**
+
+```aql
+---
+name: aqlLevenshteinDistance_1
+description: ''
+---
+RETURN LEVENSHTEIN_DISTANCE("foobar", "bar")
+```
+
+```aql
+---
+name: aqlLevenshteinDistance_2
+description: ''
+---
+RETURN LEVENSHTEIN_DISTANCE(" ", "")
+```
+
+```aql
+---
+name: aqlLevenshteinDistance_3
+description: ''
+---
+RETURN LEVENSHTEIN_DISTANCE("The quick brown fox jumps over the lazy dog", "The quick black dog jumps over the brown fox")
+```
+
+```aql
+---
+name: aqlLevenshteinDistance_4
+description: ''
+---
+RETURN LEVENSHTEIN_DISTANCE("der mötör trötet", "der trötet")
+```
+
+## LIKE()
+
+`LIKE(text, search, caseInsensitive) → bool`
+
+Check whether the pattern `search` is contained in the string `text`,
+using wildcard matching.
+
+- `_`: A single arbitrary character
+- `%`: Zero, one or many arbitrary characters
+- `\\_`: A literal underscore
+- `\\%`: A literal percent sign
+
+{{< info >}}
+Literal backslashes require different amounts of escaping depending on the
+context:
+- `\` in bind variables (_Table_ view mode) in the web interface (automatically
+  escaped to `\\` unless the value is wrapped in double quotes and already
+  escaped properly)
+- `\\` in bind variables (_JSON_ view mode) and queries in the web interface
+- `\\` in bind variables in arangosh
+- `\\\\` in queries in arangosh
+- Double the amount compared to arangosh in shells that use backslashes for
+escaping (`\\\\` in bind variables and `\\\\\\\\` in queries)
+{{< /info >}}
+
+The `LIKE()` function cannot be accelerated by any sort of index. However,
+the [ArangoSearch `LIKE()` function](arangosearch.md#like) that
+is used in the context of a `SEARCH` operation is backed by View indexes.
+
+- **text** (string): the string to search in
+- **search** (string): a search pattern that can contain the wildcard characters
+  `%` (meaning any sequence of characters, including none) and `_` (any single
+  character). Literal `%` and `_` must be escaped with backslashes.
+  `search` cannot be a variable or a document attribute.
The actual value must + be present at query parse time already. +- **caseInsensitive** (bool, *optional*): if set to `true`, the matching will be + case-insensitive. The default is `false`. +- returns **bool** (bool): `true` if the pattern is contained in `text`, + and `false` otherwise + +**Examples** + +```aql +--- +name: aqlLikeString_1 +description: '' +--- +RETURN [ + LIKE("cart", "ca_t"), + LIKE("carrot", "ca_t"), + LIKE("carrot", "ca%t") +] +``` + +```aql +--- +name: aqlLikeString_2 +description: '' +--- +RETURN [ + LIKE("foo bar baz", "bar"), + LIKE("foo bar baz", "%bar%"), + LIKE("bar", "%bar%") +] +``` + +```aql +--- +name: aqlLikeString_3 +description: '' +--- +RETURN [ + LIKE("FoO bAr BaZ", "fOo%bAz"), + LIKE("FoO bAr BaZ", "fOo%bAz", true) +] +``` + +## LOWER() + +`LOWER(value) → lowerCaseString` + +Convert upper-case letters in `value` to their lower-case counterparts. +All other characters are returned unchanged. + +- **value** (string): a string +- returns **lowerCaseString** (string): `value` with upper-case characters converted + to lower-case characters + +**Examples** + +```aql +--- +name: aqlLower +description: '' +--- +RETURN LOWER("AVOcado") +``` + +## LTRIM() + +`LTRIM(value, chars) → strippedString` + +Return the string `value` with whitespace stripped from the start only. + +To strip from the end only, see [`RTRIM()`](#rtrim).\ +To strip both sides, see [`TRIM()`](#trim). + +- **value** (string): a string +- **chars** (string, *optional*): override the characters that should + be removed from the string. It defaults to `\r\n \t` (i.e. `0x0d`, `0x0a`, + `0x20` and `0x09`). +- returns **strippedString** (string): `value` without `chars` at the + left-hand side + +```aql +--- +name: aqlLtrim_1 +description: '' +--- +RETURN LTRIM("foo bar") +``` + +```aql +--- +name: aqlLtrim_2 +description: '' +--- +RETURN LTRIM(" foo bar ") +``` + +```aql +--- +name: aqlLtrim_3 +description: '' +--- +RETURN LTRIM("--==[foo-bar]==--", "-=[]") +``` + +## MD5() + +`MD5(text) → hash` + +Calculate the MD5 checksum for `text` and return it in a hexadecimal +string representation. + +- **text** (string): a string +- returns **hash** (string): MD5 checksum as hex string + +**Examples** + +```aql +--- +name: aqlMd5 +description: '' +--- +RETURN MD5("foobar") +``` + +## NGRAM_POSITIONAL_SIMILARITY() + +`NGRAM_POSITIONAL_SIMILARITY(input, target, ngramSize) → similarity` + +Calculates the [_n_-gram similarity](https://webdocs.cs.ualberta.ca/~kondrak/papers/spire05.pdf) +between `input` and `target` using _n_-grams with minimum and maximum length of +`ngramSize`. + +The similarity is calculated by counting how long the longest sequence of +matching _n_-grams is, divided by the **longer argument's** total _n_-gram count. +Partially matching _n_-grams are counted, whereas +[`NGRAM_SIMILARITY()`](#ngram_similarity) counts only fully matching _n_-grams. + +The _n_-grams for both input and target are calculated on the fly, +not involving Analyzers. 
+ +- **input** (string): source text to be tokenized into _n_-grams +- **target** (string): target text to be tokenized into _n_-grams +- **ngramSize** (number): minimum as well as maximum _n_-gram length +- returns **similarity** (number): value between `0.0` and `1.0` + +**Examples** + +```aql +--- +name: aqlNgramPositionalSimilarity +description: '' +--- +RETURN [ + NGRAM_POSITIONAL_SIMILARITY("quick fox", "quick foxx", 2), + NGRAM_POSITIONAL_SIMILARITY("quick fox", "quick foxx", 3), + NGRAM_POSITIONAL_SIMILARITY("quick fox", "quirky fox", 2), + NGRAM_POSITIONAL_SIMILARITY("quick fox", "quirky fox", 3) +] +``` + +## NGRAM_SIMILARITY() + +`NGRAM_SIMILARITY(input, target, ngramSize) → similarity` + +Calculates [_n_-gram similarity](https://webdocs.cs.ualberta.ca/~kondrak/papers/spire05.pdf) +between `input` and `target` using _n_-grams with minimum and maximum length of +`ngramSize`. + +The similarity is calculated by counting how long the longest sequence of +matching _n_-grams is, divided by **target's** total _n_-gram count. +Only fully matching _n_-grams are counted, whereas +[`NGRAM_POSITIONAL_SIMILARITY()`](#ngram_positional_similarity) counts partially +matching _n_-grams too. This behavior matches the similarity measure used in +[`NGRAM_MATCH()`](arangosearch.md#ngram_match). + +The _n_-grams for both input and target are calculated on the fly, not involving +Analyzers. + +- **input** (string): source text to be tokenized into _n_-grams +- **target** (string): target text to be tokenized into _n_-grams +- **ngramSize** (number): minimum as well as maximum _n_-gram length +- returns **similarity** (number): value between `0.0` and `1.0` + +**Examples** + +```aql +--- +name: aqlNgramSimilarity +description: '' +--- +RETURN [ + NGRAM_SIMILARITY("quick fox", "quick foxx", 2), + NGRAM_SIMILARITY("quick fox", "quick foxx", 3), + NGRAM_SIMILARITY("quick fox", "quirky fox", 2), + NGRAM_SIMILARITY("quick fox", "quirky fox", 3) +] +``` + +## RANDOM_TOKEN() + +`RANDOM_TOKEN(length) → randomString` + +Generate a pseudo-random token string with the specified length. +The algorithm for token generation should be treated as opaque. + +- **length** (number): desired string length for the token. It must be greater + or equal to 0 and at most 65536. A `length` of 0 returns an empty string. +- returns **randomString** (string): a generated token consisting of lowercase + letters, uppercase letters and numbers + +**Examples** + +```aql +--- +name: aqlRandomToken +description: '' +--- +RETURN [ + RANDOM_TOKEN(8), + RANDOM_TOKEN(8) +] +``` + +## REGEX_MATCHES() + +`REGEX_MATCHES(text, regex, caseInsensitive) → stringArray` + +Return the matches in the given string `text`, using the `regex`. + +- **text** (string): the string to search in +- **regex** (string): a [regular expression](#regular-expression-syntax) + to use for matching the `text` +- **caseInsensitive** (bool, *optional*): if set to `true`, the matching will be + case-insensitive. The default is `false`. 
+- returns **stringArray** (array): an array of strings containing the matches, + or `null` and a warning if the expression is invalid + +**Examples** + +```aql +--- +name: aqlRegexMatches_1 +description: '' +--- +RETURN REGEX_MATCHES("My-us3r_n4m3", "^[a-z0-9_-]{3,16}$", true) +``` + +```aql +--- +name: aqlRegexMatches_2 +description: '' +--- +RETURN REGEX_MATCHES("#4d82h4", "^#?([a-f0-9]{6}|[a-f0-9]{3})$", true) +``` + +```aql +--- +name: aqlRegexMatches_3 +description: '' +--- +RETURN REGEX_MATCHES("john@doe.com", "^([a-z0-9_\\\\.-]+)@([\\\\da-z-]+)\\\\.([a-z\\\\.]{2,6})$", false) +``` + +## REGEX_SPLIT() + +`REGEX_SPLIT(text, splitExpression, caseInsensitive, limit) → stringArray` + +Split the given string `text` into a list of strings at positions where +`splitExpression` matches. + +- **text** (string): the string to split +- **splitExpression** (string): a [regular expression](#regular-expression-syntax) + to use for splitting the `text`. You can define a capturing group to keep matches +- **caseInsensitive** (bool, *optional*): if set to `true`, the matching will be + case-insensitive. The default is `false`. +- **limit** (number, *optional*): limit the number of split values in the result. + If no `limit` is given, the number of splits returned is not bounded. +- returns **stringArray** (array): an array of strings, or `null` and a warning + if the expression is invalid + +**Examples** + +```aql +--- +name: aqlRegexSplit_1 +description: '' +--- +RETURN REGEX_SPLIT("This is a line.\\n This is yet another line\\r\\n This again is a line.\\r Mac line ", "\\\\.?\\r\\n|\\r|\\n") +``` + +```aql +--- +name: aqlRegexSplit_2 +description: '' +--- +RETURN REGEX_SPLIT("hypertext language, programming", "[\\\\s, ]+") +``` + +```aql +--- +name: aqlRegexSplit_3 +description: '' +--- +RETURN [ + REGEX_SPLIT("Capture the article", "(the)"), + REGEX_SPLIT("Don't capture the article", "the") +] +``` + +```aql +--- +name: aqlRegexSplit_4 +description: '' +--- +RETURN REGEX_SPLIT("cA,Bc,A,BcA,BcA,Bc", "a,b", true, 3) +``` + +## REGEX_TEST() + +`REGEX_TEST(text, search, caseInsensitive) → bool` + +Check whether the pattern `search` is contained in the string `text`, +using regular expression matching. + +- **text** (string): the string to search in +- **search** (string): a [regular expression](#regular-expression-syntax) + search pattern +- **caseInsensitive** (bool, *optional*): if set to `true`, the matching will be + case-insensitive. The default is `false`. +- returns **bool** (bool): `true` if the pattern is contained in `text`, + and `false` otherwise, or `null` and a warning if the expression is invalid + +**Examples** + +```aql +--- +name: aqlRegexTest_1 +description: '' +--- +RETURN REGEX_TEST("the quick brown fox", "the.*fox") +``` + +```aql +--- +name: aqlRegexTest_2 +description: '' +--- +RETURN REGEX_TEST("the quick brown fox", "^(a|the)\\\\s+(quick|slow).*f.x$") +``` + +```aql +--- +name: aqlRegexTest_3 +description: '' +--- +RETURN REGEX_TEST("the\\nquick\\nbrown\\nfox", "^the(\\n[a-w]+)+\\nfox$") +``` + +## REGEX_REPLACE() + +`REGEX_REPLACE(text, search, replacement, caseInsensitive) → string` + +Replace the pattern `search` with the string `replacement` in the string +`text`, using regular expression matching. 
+
+- **text** (string): the string to search in
+- **search** (string): a [regular expression](#regular-expression-syntax)
+  search pattern
+- **replacement** (string): the string to replace the `search` pattern with
+- **caseInsensitive** (bool, *optional*): if set to `true`, the matching will be
+  case-insensitive. The default is `false`.
+- returns **string** (string): the string `text` with the `search` regex
+  pattern replaced with the `replacement` string wherever the pattern exists
+  in `text`, or `null` and a warning if the expression is invalid
+
+**Examples**
+
+```aql
+---
+name: aqlRegexReplace_1
+description: ''
+---
+RETURN REGEX_REPLACE("the quick brown fox", "the.*fox", "jumped over")
+```
+
+```aql
+---
+name: aqlRegexReplace_2
+description: ''
+---
+RETURN REGEX_REPLACE("An Avocado", "a", "_")
+```
+
+```aql
+---
+name: aqlRegexReplace_3
+description: ''
+---
+RETURN REGEX_REPLACE("An Avocado", "a", "_", true)
+```
+
+## REVERSE()
+
+`REVERSE(value) → reversedString`
+
+Return the reverse of the string `value`.
+
+- **value** (string): a string
+- returns **reversedString** (string): a new string with the characters in
+  reverse order
+
+**Examples**
+
+```aql
+---
+name: aqlReverse_1
+description: ''
+---
+RETURN REVERSE("foobar")
+```
+
+```aql
+---
+name: aqlReverse_2
+description: ''
+---
+RETURN REVERSE("电脑坏了")
+```
+
+## RIGHT()
+
+`RIGHT(value, length) → substring`
+
+Return the `length` rightmost characters of the string `value`.
+
+To return the leftmost characters, see [`LEFT()`](#left).\
+To take a substring from an arbitrary position in the string,
+see [`SUBSTRING()`](#substring).
+
+- **value** (string): a string
+- **length** (number): how many characters to return
+- returns **substring** (string): at most `length` characters of `value`,
+  starting on the right-hand side of the string
+
+**Examples**
+
+```aql
+---
+name: aqlRight_1
+description: ''
+---
+RETURN RIGHT("foobar", 3)
+```
+
+```aql
+---
+name: aqlRight_2
+description: ''
+---
+RETURN RIGHT("foobar", 10)
+```
+
+## RTRIM()
+
+`RTRIM(value, chars) → strippedString`
+
+Return the string `value` with whitespace stripped from the end only.
+
+To strip from the start only, see [`LTRIM()`](#ltrim).\
+To strip both sides, see [`TRIM()`](#trim).
+
+- **value** (string): a string
+- **chars** (string, *optional*): override the characters that should
+  be removed from the string. It defaults to `\r\n \t` (i.e. `0x0d`, `0x0a`,
+  `0x20` and `0x09`).
+- returns **strippedString** (string): `value` without `chars` at the
+  right-hand side
+
+**Examples**
+
+```aql
+---
+name: aqlRtrim_1
+description: ''
+---
+RETURN RTRIM("foo bar")
+```
+
+```aql
+---
+name: aqlRtrim_2
+description: ''
+---
+RETURN RTRIM(" foo bar ")
+```
+
+```aql
+---
+name: aqlRtrim_3
+description: ''
+---
+RETURN RTRIM("--==[foo-bar]==--", "-=[]")
+```
+
+## SHA1()
+
+`SHA1(text) → hash`
+
+Calculate the SHA1 checksum for `text` and return it in a hexadecimal
+string representation.
+
+- **text** (string): a string
+- returns **hash** (string): SHA1 checksum as hex string
+
+**Examples**
+
+```aql
+---
+name: aqlSha1
+description: ''
+---
+RETURN SHA1("foobar")
+```
+
+## SHA256()
+
+`SHA256(text) → hash`
+
+Calculate the SHA256 checksum for `text` and return it in a hexadecimal
+string representation.
+ +- **text** (string): a string +- returns **hash** (string): SHA256 checksum as hex string + +**Examples** + +```aql +--- +name: aqlSha256 +description: '' +--- +RETURN SHA256("foobar") +``` + +## SHA512() + +`SHA512(text) → hash` + +Calculate the SHA512 checksum for `text` and return it in a hexadecimal +string representation. + +- **text** (string): a string +- returns **hash** (string): SHA512 checksum as hex string + +**Examples** + +```aql +--- +name: aqlSha512 +description: '' +--- +RETURN SHA512("foobar") +``` + +## SOUNDEX() + +`SOUNDEX(value) → soundexString` + +Return the [Soundex](https://en.wikipedia.org/wiki/Soundex) +fingerprint of `value`. + +- **value** (string): a string +- returns **soundexString** (string): a Soundex fingerprint of `value` + +**Examples** + +```aql +--- +name: aqlSoundex +description: '' +--- +RETURN [ + SOUNDEX("example"), + SOUNDEX("ekzampul"), + SOUNDEX("soundex"), + SOUNDEX("sounteks") +] +``` + +## SPLIT() + +`SPLIT(value, separator, limit) → strArray` + +Split the given string `value` into a list of strings, using the `separator`. + +To split a document identifier (`_id`) into the collection name and document key +(`_key`), you should use the more optimized +[`PARSE_IDENTIFIER()` function](document-object.md#parse_identifier). + +- **value** (string): a string +- **separator** (string): either a string or a list of strings. If `separator` is + an empty string, `value` will be split into a list of characters. If no `separator` + is specified, `value` will be returned as array. +- **limit** (number, *optional*): limit the number of split values in the result. + If no `limit` is given, the number of splits returned is not bounded. +- returns **strArray** (array): an array of strings + +**Examples** + +```aql +--- +name: aqlSplit_1 +description: '' +--- +RETURN SPLIT( "foo-bar-baz", "-" ) +``` + +```aql +--- +name: aqlSplit_2 +description: '' +--- +RETURN SPLIT( "foo-bar-baz", "-", 1 ) +``` + +```aql +--- +name: aqlSplit_3 +description: '' +--- +RETURN SPLIT( "foo, bar & baz", [ ", ", " & " ] ) +``` + +## STARTS_WITH() + +`STARTS_WITH(text, prefix) → startsWith` + +Check whether the given string starts with `prefix`. + +There is a corresponding [`STARTS_WITH()` ArangoSearch function](arangosearch.md#starts_with) +that can utilize View indexes. + +- **text** (string): a string to compare against +- **prefix** (string): a string to test for at the start of the text +- returns **startsWith** (bool): whether the text starts with the given prefix + +**Examples** + +```aql +--- +name: aqlStartsWith_1 +description: '' +--- +RETURN STARTS_WITH("foobar", "foo") +``` + +```aql +--- +name: aqlStartsWith_2 +description: '' +--- +RETURN STARTS_WITH("foobar", "baz") +``` + +--- + +`STARTS_WITH(text, prefixes, minMatchCount) → startsWith` + +Check if the given string starts with one of the `prefixes`. + +- **text** (string): a string to compare against +- **prefixes** (array): an array of strings to test for at the start of the text +- **minMatchCount** (number, _optional_): minimum number of prefixes that + should be satisfied. 
The default is `1` and it is the only meaningful value
+  unless `STARTS_WITH()` is used in the context of a `SEARCH` expression where
+  an attribute can have multiple values at the same time.
+- returns **startsWith** (bool): whether the text starts with at least
+  *minMatchCount* of the given prefixes
+
+**Examples**
+
+```aql
+---
+name: aqlStartsWith_3
+description: ''
+---
+RETURN STARTS_WITH("foobar", ["bar", "foo"])
+```
+
+```aql
+---
+name: aqlStartsWith_4
+description: ''
+---
+RETURN STARTS_WITH("foobar", ["bar", "baz"])
+```
+
+## SUBSTITUTE()
+
+`SUBSTITUTE(value, search, replace, limit) → substitutedString`
+
+Replace search values in the string `value`.
+
+- **value** (string): a string
+- **search** (string\|array): if `search` is a string, all occurrences of
+  `search` will be replaced in `value`. If `search` is an array of strings,
+  each occurrence of a value contained in `search` will be replaced by the
+  corresponding array element in `replace`. If `replace` has fewer list items
+  than `search`, occurrences of unmapped `search` items will be replaced by an
+  empty string.
+- **replace** (string\|array, *optional*): a replacement string, or an array of
+  strings to replace the corresponding elements of `search` with. Can have fewer
+  elements than `search` or be left out to remove matches. If `search` is an array
+  but `replace` is a string, then all matches will be replaced with `replace`.
+- **limit** (number, *optional*): cap the number of replacements to this value
+- returns **substitutedString** (string): a new string with matches replaced
+  (or removed)
+
+**Examples**
+
+```aql
+---
+name: aqlSubstitute_1
+description: ''
+---
+RETURN SUBSTITUTE( "the quick brown foxx", "quick", "lazy" )
+```
+
+```aql
+---
+name: aqlSubstitute_2
+description: ''
+---
+RETURN SUBSTITUTE( "the quick brown foxx", [ "quick", "foxx" ], [ "slow", "dog" ] )
+```
+
+```aql
+---
+name: aqlSubstitute_3
+description: ''
+---
+RETURN SUBSTITUTE( "the quick brown foxx", [ "the", "foxx" ], [ "that", "dog" ], 1 )
+```
+
+```aql
+---
+name: aqlSubstitute_4
+description: ''
+---
+RETURN SUBSTITUTE( "the quick brown foxx", [ "the", "quick", "foxx" ], [ "A", "VOID!" ] )
+```
+
+```aql
+---
+name: aqlSubstitute_5
+description: ''
+---
+RETURN SUBSTITUTE( "the quick brown foxx", [ "quick", "foxx" ], "xx" )
+```
+
+---
+
+`SUBSTITUTE(value, mapping, limit) → substitutedString`
+
+Alternatively, `search` and `replace` can be specified in a combined value.
+
+- **value** (string): a string
+- **mapping** (object): a lookup map with search strings as keys and replacement
+  strings as values. Empty strings and `null` as values remove matches.
+  Note that there is no defined order in which the mapping is processed. In case
+  of overlapping searches and substitutions, sometimes the first entry may win,
+  and sometimes the second.
If you need to ensure a specific order then choose + the array-based variant of this function +- **limit** (number, *optional*): cap the number of replacements to this value +- returns **substitutedString** (string): a new string with matches replaced + (or removed) + +**Examples** + +```aql +--- +name: aqlSubstitute_6 +description: '' +--- +RETURN SUBSTITUTE("the quick brown foxx", { + "quick": "small", + "brown": "slow", + "foxx": "ant" +}) +``` + +```aql +--- +name: aqlSubstitute_7 +description: '' +--- +RETURN SUBSTITUTE("the quick brown foxx", { + "quick": "", + "brown": null, + "foxx": "ant" +}) +``` + +```aql +--- +name: aqlSubstitute_8 +description: '' +--- +RETURN SUBSTITUTE("the quick brown foxx", { + "quick": "small", + "brown": "slow", + "foxx": "ant" +}, 2) +``` + +## SUBSTRING() + +`SUBSTRING(value, offset, length) → substring` + +Return a substring of `value`. + +To return the rightmost characters, see [`RIGHT()`](#right).\ +To return the leftmost characters, see [`LEFT()`](#left). + +- **value** (string): a string +- **offset** (number): start at this character of the string. Offsets start at 0. + Negative offsets start from the end of the string. The last character has an + index of -1 +- **length** (number, *optional*): take this many characters. Omit the parameter + to get the substring from `offset` to the end of the string +- returns **substring** (string): a substring of `value` + +**Examples** + +Get a substring starting at the 6th character and until the end of the string: + +```aql +--- +name: aqlSubstring_1 +description: '' +--- +RETURN SUBSTRING("Holy Guacamole!", 5) +``` + +Get a 4 characters long substring, starting at the 11th character: + +```aql +--- +name: aqlSubstring_2 +description: '' +--- +RETURN SUBSTRING("Holy Guacamole!", 10, 4) +``` + +Get a 4 characters long substring, starting at the 5th from last character: + +```aql +--- +name: aqlSubstring_3 +description: '' +--- +RETURN SUBSTRING("Holy Guacamole!", -5, 4) +``` + +## SUBSTRING_BYTES() + +`SUBSTRING_BYTES(value, offset, length) → substring` + +Return a substring of `value`, using an `offset` and `length` in bytes instead +of in number of characters. + +This function is intended to be used together with the +[`OFFSET_INFO()` function](arangosearch.md#offset_info) for +[search highlighting](../../index-and-search/arangosearch/search-highlighting.md). + +- **value** (string): a string +- **offset** (number): start at this byte of the UTF-8 encoded string. + Offsets start at 0. Negative offsets start from the end of the string. + The last byte has an index of -1. The offset needs to coincide with the + beginning of a character's byte sequence +- **length** (number, *optional*): take this many bytes. Omit the parameter to + get the substring from `offset` to the end of the string. The end byte + (`offset` + `length`) needs to coincide with the end of a character's + byte sequence +- returns **substring** (string\|null): a substring of `value`, or `null` and + produces a warning if the start or end byte is in the middle of a character's + byte sequence + +**Examples** + +Get a substring starting at the 11th byte and until the end of the string. 
+Note that the heart emoji is comprised of two characters, the Black Heart Symbol +and the Variation Selector-16, each encoded using 3 bytes in UTF-8: + +```aql +--- +name: aqlSubstringBytes_1 +description: '' +--- +RETURN SUBSTRING_BYTES("We ❤️ avocado!", 10) +``` + +Get a 3 bytes long substring starting at the 3rd byte, extracting the +Black Heart Symbol: + +```aql +--- +name: aqlSubstringBytes_2 +description: '' +--- +RETURN SUBSTRING_BYTES("We ❤️ avocado!", 3, 3) +``` + +Get a 6 bytes long substring starting at the 15th byte from last, extracting the +heart emoji: + +```aql +--- +name: aqlSubstringBytes_3 +description: '' +--- +RETURN SUBSTRING_BYTES("We ❤️ avocado!", -15, 6) +``` + +Try to get a 4 bytes long substring starting at the 15th byte from last, +resulting in a `null` value and a warning because the substring contains an +incomplete UTF-8 byte sequence: + +```aql +--- +name: aqlSubstringBytes_4 +description: '' +--- +RETURN SUBSTRING_BYTES("We ❤️ avocado!", -15, 4) +``` + +## TOKENS() + +`TOKENS(input, analyzer) → tokenArray` + +Split the `input` string(s) with the help of the specified `analyzer` into an +array. The resulting array can be used in `FILTER` or `SEARCH` statements with +the `IN` operator, but also be assigned to variables and returned. This can be +used to better understand how a specific Analyzer processes an input value. + +It has a regular return value unlike all other ArangoSearch AQL functions and +is thus not limited to `SEARCH` operations. It is independent of Views. +A wrapping `ANALYZER()` call in a search expression does not affect the +`analyzer` argument nor allow you to omit it. + +- **input** (string\|array): text to tokenize. Accepts recursive arrays of + strings. +- **analyzer** (string): name of an [Analyzer](../../index-and-search/analyzers.md). +- returns **tokenArray** (array): array of strings with zero or more elements, + each element being a token. + +**Examples** + +Example query showcasing the `"text_de"` Analyzer (tokenization with stemming, +case conversion and accent removal for German text): + +```aql +--- +name: aqlTokens_1 +description: '' +--- +RETURN TOKENS("Lörem ipsüm, DOLOR SIT Ämet.", "text_de") +``` + +To search a View for documents where the `text` attribute contains certain +words/tokens in any order, you can use the function like this: + +```aql +FOR doc IN viewName + SEARCH ANALYZER(doc.text IN TOKENS("dolor amet lorem", "text_en"), "text_en") + RETURN doc +``` + +It will match `{ "text": "Lorem ipsum, dolor sit amet." }` for instance. If you +want to search for tokens in a particular order, use +[`PHRASE()`](arangosearch.md#phrase) instead. 
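+
+For comparison, a sketch of such an ordered search with `PHRASE()`, assuming
+the same hypothetical View and `text` attribute as above:
+
+```aql
+FOR doc IN viewName
+  SEARCH PHRASE(doc.text, "lorem ipsum", "text_en")
+  RETURN doc
+```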
+
+If an array of strings is passed as the first argument, then each string is
+tokenized individually and an array with the same nesting as the input array
+is returned:
+
+```aql
+---
+name: aqlTokens_2
+description: ''
+---
+RETURN TOKENS("quick brown fox", "text_en")
+```
+
+```aql
+---
+name: aqlTokens_3
+description: ''
+---
+RETURN TOKENS(["quick brown", "fox"], "text_en")
+```
+
+```aql
+---
+name: aqlTokens_4
+description: ''
+---
+RETURN TOKENS(["quick brown", ["fox"]], "text_en")
+```
+
+In most cases you will want to flatten the resulting array for further usage,
+because nested arrays are not accepted in `SEARCH` statements such as
+`<array> ALL IN doc.<attribute>`:
+
+```aql
+LET tokens = TOKENS(["quick brown", ["fox"]], "text_en") // [ ["quick", "brown"], [["fox"]] ]
+LET tokens_flat = FLATTEN(tokens, 2) // [ "quick", "brown", "fox" ]
+FOR doc IN myView SEARCH ANALYZER(tokens_flat ALL IN doc.title, "text_en") RETURN doc
+```
+
+## TO_BASE64()
+
+`TO_BASE64(value) → encodedString`
+
+Return the Base64 representation of `value`.
+
+- **value** (string): a string
+- returns **encodedString** (string): a Base64 representation of `value`
+
+**Examples**
+
+```aql
+---
+name: aqlToBase64
+description: ''
+---
+RETURN [
+  TO_BASE64("ABC."),
+  TO_BASE64("123456")
+]
+```
+
+## TO_HEX()
+
+`TO_HEX(value) → hexString`
+
+Return the hexadecimal representation of `value`.
+
+- **value** (string): a string
+- returns **hexString** (string): a hexadecimal representation of `value`
+
+**Examples**
+
+```aql
+---
+name: aqlToHex
+description: ''
+---
+RETURN [
+  TO_HEX("ABC."),
+  TO_HEX("ü")
+]
+```
+
+## TRIM()
+
+`TRIM(value, type) → strippedString`
+
+Return the string `value` with whitespace stripped from the start and/or end.
+
+The optional `type` parameter specifies from which parts of the string the
+whitespace is stripped. [`LTRIM()`](#ltrim) and [`RTRIM()`](#rtrim) are
+preferred, however.
+
+- **value** (string): a string
+- **type** (number, *optional*): strip whitespace from the
+  - `0` – start and end of the string (default)
+  - `1` – start of the string only
+  - `2` – end of the string only
+
+---
+
+`TRIM(value, chars) → strippedString`
+
+Return the string `value` with whitespace stripped from the start and end.
+
+- **value** (string): a string
+- **chars** (string, *optional*): override the characters that should
+  be removed from the string. It defaults to `\r\n \t` (i.e. `0x0d`, `0x0a`,
+  `0x20` and `0x09`).
+- returns **strippedString** (string): `value` without `chars` on both sides
+
+**Examples**
+
+```aql
+---
+name: aqlTrim_1
+description: ''
+---
+RETURN TRIM("foo bar")
+```
+
+```aql
+---
+name: aqlTrim_2
+description: ''
+---
+RETURN TRIM(" foo bar ")
+```
+
+```aql
+---
+name: aqlTrim_3
+description: ''
+---
+RETURN TRIM("--==[foo-bar]==--", "-=[]")
+```
+
+```aql
+---
+name: aqlTrim_4
+description: ''
+---
+RETURN TRIM(" foobar\\t \\r\\n ")
+```
+
+```aql
+---
+name: aqlTrim_5
+description: ''
+---
+RETURN TRIM(";foo;bar;baz, ", ",; ")
+```
+
+## UPPER()
+
+`UPPER(value) → upperCaseString`
+
+Convert lower-case letters in `value` to their upper-case counterparts.
+All other characters are returned unchanged.
+
+- **value** (string): a string
+- returns **upperCaseString** (string): `value` with lower-case characters converted
+  to upper-case characters
+
+**Examples**
+
+```aql
+---
+name: aqlUpper
+description: ''
+---
+RETURN UPPER("AVOcado")
+```
+
+## UUID()
+
+`UUID() → UUIDString`
+
+Return a universally unique identifier value.
+
+- returns **UUIDString** (string): a universally unique identifier
+
+**Examples**
+
+```aql
+---
+name: aqlUuid
+description: ''
+---
+FOR i IN 1..3
+  RETURN UUID()
+```
+
+## Regular Expression Syntax
+
+A regular expression may consist of literal characters and the following
+characters and sequences:
+
+- `.` – the dot matches any single character except line terminators.
+  To include line terminators, use `[\s\S]` instead to simulate `.` with the *DOTALL* flag.
+- `\d` – matches a single digit, equivalent to `[0-9]`
+- `\s` – matches a single whitespace character
+- `\S` – matches a single non-whitespace character
+- `\b` – matches a word boundary. This match is zero-length
+- `\B` – negation of `\b`. The match is zero-length
+- `[xyz]` – set of characters. Matches any of the enclosed characters
+  (here: *x*, *y*, or *z*)
+- `[^xyz]` – negated set of characters. Matches any other character than the
+  enclosed ones (i.e. anything but *x*, *y*, or *z* in this case)
+- `[x-z]` – range of characters. Matches any of the characters in the
+  specified range, e.g. `[0-9A-F]` to match any character in
+  *0123456789ABCDEF*
+- `[^x-z]` – negated range of characters. Matches any other character than the
+  ones specified in the range
+- `(xyz)` – defines and matches a pattern group. Also defines a capturing group.
+- `(?:xyz)` – defines and matches a pattern group without capturing the match
+- `(xy|z)` – matches either *xy* or *z*
+- `^` – matches the beginning of the string (e.g. `^xyz`)
+- `$` – matches the end of the string (e.g. `xyz$`)
+
+To literally match one of the characters that have a special meaning in regular
+expressions (`.`, `*`, `?`, `[`, `]`, `(`, `)`, `{`, `}`, `^`, `$`, and `\`)
+you may need to escape the character with a backslash, which typically requires
+escaping itself. The backslash of shorthand character classes like `\d`, `\s`,
+and `\b` counts as literal backslash. The backslash of JSON escape sequences
+like `\t` (tabulation), `\r` (carriage return), and `\n` (line feed) does not,
+however.
+
+{{< info >}}
+Literal backslashes require different amounts of escaping depending on the
+context:
+- `\` in bind variables (_Table_ view mode) in the web interface (automatically
+  escaped to `\\` unless the value is wrapped in double quotes and already
+  escaped properly)
+- `\\` in bind variables (_JSON_ view mode) and queries in the web interface
+- `\\` in bind variables in arangosh
+- `\\\\` in queries in arangosh
+- Double the amount compared to arangosh in shells that use backslashes for
+escaping (`\\\\` in bind variables and `\\\\\\\\` in queries)
+{{< /info >}}
+
+Characters and sequences may optionally be repeated using the following
+quantifiers:
+
+- `x?` – matches one or zero occurrences of *x*
+- `x*` – matches zero or more occurrences of *x* (greedy)
+- `x+` – matches one or more occurrences of *x* (greedy)
+- `x*?` – matches zero or more occurrences of *x* (non-greedy)
+- `x+?` – matches one or more occurrences of *x* (non-greedy)
+- `x{y}` – matches exactly *y* occurrences of *x*
+- `x{y,z}` – matches between *y* and *z* occurrences of *x*
+- `x{y,}` – matches at least *y* occurrences of *x*
+
+Note that `xyz+` matches *xyzzz*, but if you want to match *xyzxyz* instead,
+you need to define a pattern group by wrapping the sub-expression in parentheses
+and place the quantifier right behind it, like `(xyz)+`.
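+
+A quick sketch using [`REGEX_TEST()`](#regex_test) to illustrate the
+difference between quantifying a single character and a pattern group:
+
+```aql
+RETURN [
+  REGEX_TEST("xyzzz", "^xyz+$"),   // true, the + only repeats the "z"
+  REGEX_TEST("xyzxyz", "^xyz+$"),  // false
+  REGEX_TEST("xyzxyz", "^(xyz)+$") // true, the + repeats the whole group
+]
+```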
diff --git a/site/content/arangodb/oem/aql/functions/type-check-and-cast.md b/site/content/arangodb/oem/aql/functions/type-check-and-cast.md new file mode 100644 index 0000000000..81b3bb9870 --- /dev/null +++ b/site/content/arangodb/oem/aql/functions/type-check-and-cast.md @@ -0,0 +1,279 @@ +--- +title: Type check and cast functions in AQL +menuTitle: Type check & cast +weight: 55 +description: >- + AQL provides functions for checking data types and converting between + different types +--- +Some operators expect their operands to have a certain data type. For example, +logical operators expect their operands to be boolean values, and the arithmetic +operators expect their operands to be numeric values. If an operation is performed +with operands of other types, an automatic conversion to the expected types is +tried. This is called implicit type casting. It helps to avoid query +aborts. + +Type casts can also be performed upon request by invoking a type cast function. +This is called explicit type casting. AQL offers several functions for this. +Each of the these functions takes an operand of any data type and returns a result +value with the type corresponding to the function name. For example, `TO_NUMBER()` +returns a numeric value. + +## Type casting functions + +### TO_BOOL() + +`TO_BOOL(value) → bool` + +Take an input *value* of any type and convert it into the appropriate +boolean value. + +- **value** (any): input of arbitrary type +- returns **bool** (boolean): + - *null* is converted to *false* + - Numbers are converted to *true*, except for 0, which is converted to *false* + - Strings are converted to *true* if they are non-empty, and to *false* otherwise + - Arrays are always converted to *true* (even if empty) + - Objects / documents are always converted to *true* + +It's also possible to use double negation to cast to boolean: + +```aql +!!1 // true +!!0 // false +!!-0.0 // false +not not 1 // true +!!"non-empty string" // true +!!"" // false +``` + +`TO_BOOL()` is preferred however, because it states the intention clearer. + +### TO_NUMBER() + +`TO_NUMBER(value) → number` + +Take an input *value* of any type and convert it into a numeric value. + +- **value** (any): input of arbitrary type +- returns **number** (number): + - *null* and *false* are converted to the value *0* + - *true* is converted to *1* + - Numbers keep their original value + - Strings are converted to their numeric equivalent if the string contains a + valid representation of a number. Whitespace at the start and end of the string + is allowed. String values that do not contain any valid representation of a number + will be converted to the number *0*. + - An empty array is converted to *0*, an array with one member is converted into the + result of `TO_NUMBER()` for its sole member. An array with two or more members is + converted to the number *0*. + - An object / document is converted to the number *0*. + - A unary plus will also cast to a number, but `TO_NUMBER()` is the preferred way: + ```aql + +'5' // 5 + +[8] // 8 + +[8,9] // 0 + +{} // 0 + ``` + - A unary minus works likewise, except that a numeric value is also negated: + ```aql + -'5' // -5 + -[8] // -8 + -[8,9] // 0 + -{} // 0 + ``` + +### TO_STRING() + +`TO_STRING(value) → str` + +Take an input *value* of any type and convert it into a string value. 
+ +- **value** (any): input of arbitrary type +- returns **str** (string): + - *null* is converted to an empty string `""` + - *false* is converted to the string *"false"*, *true* to the string *"true"* + - Numbers are converted to their string representations. This can also be a + scientific notation (e.g. "2e-7") + - Arrays and objects / documents are converted to string representations, + which means JSON-encoded strings with no additional whitespace + +```aql +TO_STRING(null) // "" +TO_STRING(true) // "true" +TO_STRING(false) // "false" +TO_STRING(123) // "123" +TO_STRING(+1.23) // "1.23" +TO_STRING(-1.23) // "-1.23" +TO_STRING(0.0000002) // "2e-7" +TO_STRING( [1, 2, 3] ) // "[1,2,3]" +TO_STRING( { foo: "bar", baz: null } ) // "{\"foo\":\"bar\",\"baz\":null}" +``` + +### TO_ARRAY() + +`TO_ARRAY(value) → array` + +Take an input *value* of any type and convert it into an array value. + +- **value** (any): input of arbitrary type +- returns **array** (array): + - *null* is converted to an empty array + - Boolean values, numbers and strings are converted to an array containing + the original value as its single element + - Arrays keep their original value + - Objects / documents are converted to an array containing their attribute + **values** as array elements, just like [`VALUES()`](document-object.md#values) + +```aql +TO_ARRAY(null) // [] +TO_ARRAY(false) // [false] +TO_ARRAY(true) // [true] +TO_ARRAY(5) // [5] +TO_ARRAY("foo") // ["foo"] +TO_ARRAY([1, 2, "foo"]) // [1, 2, "foo"] +TO_ARRAY({foo: 1, bar: 2, baz: [3, 4, 5]}) // [1, 2, [3, 4, 5]] +``` + +### TO_LIST() + +`TO_LIST(value) → array` + +This is an alias for [`TO_ARRAY()`](#to_array). + +## Type check functions + +AQL also offers functions to check the data type of a value at runtime. The +following type check functions are available. Each of these functions takes an +argument of any data type and returns true if the value has the type that is +checked for, and false otherwise. + +### IS_NULL() + +`IS_NULL(value) → bool` + +Check whether *value* is *null*. Identical to `value == null`. + +To test if an attribute exists, see [`HAS()`](document-object.md#has) instead. 
+ +- **value** (any): value to test +- returns **bool** (boolean): *true* if *value* is `null`, + *false* otherwise + +### IS_BOOL() + +`IS_BOOL(value) → bool` + +Check whether *value* is a *boolean* value + +- **value** (any): value to test +- returns **bool** (boolean): *true* if *value* is `true` or `false`, + *false* otherwise + +### IS_NUMBER() + +`IS_NUMBER(value) → bool` + +Check whether *value* is a number + +- **value** (any): value to test +- returns **bool** (boolean): *true* if *value* is a number, + *false* otherwise + +### IS_STRING() + +`IS_STRING(value) → bool` + +Check whether *value* is a string + +- **value** (any): value to test +- returns **bool** (boolean): *true* if *value* is a string, + *false* otherwise + +### IS_ARRAY() + +`IS_ARRAY(value) → bool` + +Check whether *value* is an array / list + +- **value** (any): value to test +- returns **bool** (boolean): *true* if *value* is an array / list, + *false* otherwise + +### IS_LIST() + +`IS_LIST(value) → bool` + +This is an alias for [`IS_ARRAY()`](#is_array) + +### IS_OBJECT() + +`IS_OBJECT(value) → bool` + +Check whether *value* is an object / document + +- **value** (any): value to test +- returns **bool** (boolean): *true* if *value* is an object / document, + *false* otherwise + +### IS_DOCUMENT() + +`IS_DOCUMENT(value) → bool` + +This is an alias for [`IS_OBJECT()`](#is_object) + +### IS_DATESTRING() + +`IS_DATESTRING(str) → bool` + +Check whether *value* is a string that can be used in a date function. +This includes partial dates such as *"2015"* or *"2015-10"* and strings +containing properly formatted but invalid dates such as *"2015-02-31"*. + +- **str** (string): date string to test +- returns **bool** (boolean): *true* if *str* is a correctly formatted date string, + *false* otherwise including all non-string values, even if some of them may be usable + in date functions (numeric timestamps) + +### IS_IPV4() + +See [String Functions](string.md#is_ipv4). + +### IS_KEY() + +`IS_KEY(str) → bool` + +Check whether *value* is a string that can be used as a +document key, i.e. as the value of the *_key* attribute. +See [Document keys](../../concepts/data-structure/documents/_index.md#document-keys). + +- **str** (string): document key to test +- returns **bool** (boolean): whether *str* can be used as document key + +### TYPENAME() + +`TYPENAME(value) → typeName` + +Return the data type name of *value*. 
+ +- **value** (any): input of arbitrary type +- returns **typeName** (string): data type name of *value* + (`"null"`, `"bool"`, `"number"`, `"string"`, `"array"` or `"object"`) + +Example Value | Data Type Name +---------------:|--------------- +`null` | `"null"` +`true` | `"bool"` +`false` | `"bool"` +`123` | `"number"` +`-4.56` | `"number"` +`0` | `"number"` +`"foobar"` | `"string"` +`"123"` | `"string"` +`""` | `"string"` +`[ 1, 2, 3 ]` | `"array"` +`["foo",true]` | `"array"` +`[ ]` | `"array"` +`{"foo":"bar"}` | `"object"` +`{"foo": null}` | `"object"` +`{ }` | `"object"` diff --git a/site/content/arangodb/oem/aql/fundamentals/_index.md b/site/content/arangodb/oem/aql/fundamentals/_index.md new file mode 100644 index 0000000000..1d1089e0a4 --- /dev/null +++ b/site/content/arangodb/oem/aql/fundamentals/_index.md @@ -0,0 +1,8 @@ +--- +title: AQL Fundamentals +menuTitle: Fundamentals +weight: 10 +description: >- + Learn about the core aspects of ArangoDB's query language, like the structure + of queries, the available data types, as well as result and error handling +--- diff --git a/site/content/arangodb/oem/aql/fundamentals/accessing-data-from-collections.md b/site/content/arangodb/oem/aql/fundamentals/accessing-data-from-collections.md new file mode 100644 index 0000000000..a757e89208 --- /dev/null +++ b/site/content/arangodb/oem/aql/fundamentals/accessing-data-from-collections.md @@ -0,0 +1,78 @@ +--- +title: Accessing data from collections with AQL +menuTitle: Accessing data from collections +weight: 25 +description: >- + You can access collection data by looping over a collection and reading + document attributes, with non-existing attributes returning a `null` value +--- +A collection can be thought of as an array of documents. To access the documents, +use a [`FOR` operation](../high-level-operations/for.md) to iterate over a +collection using its name, like `FOR doc IN collection ...`. + +Note that when iterating over a collection, the order of documents is undefined. +To establish an explicit and deterministic order for the documents, use a +[`SORT` operation](../high-level-operations/sort.md) in addition. + +Data in collections is stored in documents, which are JSON objects. Each document +potentially has different attributes than other documents. This is true even for +documents of the same collection. + +It is therefore quite normal to encounter documents that do not have some or all +of the attributes that are queried in an AQL query. In this case, the +non-existing attributes in the document are treated as if they would exist +with a value of `null`. This means that comparing a document attribute to +`null` returns `true` if the document has the particular attribute and the +attribute has a value of `null`, or that the document does not have the +particular attribute at all. + +For example, the following query returns all documents from the collection +`users` that have a value of `null` in the attribute `name`, plus all documents +from `users` that do not have the `name` attribute at all: + +```aql +FOR u IN users + FILTER u.name == null + RETURN u +``` + +Furthermore, `null` is less than any other value (excluding `null` itself). That +means documents with non-existing attributes may be included in the result +when comparing attribute values with the less than or less equal operators. 
+ +For example, the following query returns all documents from the collection +`users` that have an attribute `age` with a value less than `39`, but also all +documents from the collection that do not have the attribute `age` at all. + +```aql +FOR u IN users + FILTER u.age < 39 + RETURN u +``` + +This behavior should always be taken into account when writing queries. + +To distinguish between an explicit `null` value and the implicit `null` value +you get if you access a non-existent attribute, you can use the +[`HAS()` function](../functions/document-object.md#has). The following query +only returns documents that have a `name` attribute with a `null` value: + +```aql +FOR u IN users + FILTER u.name == null AND HAS(u, "name") + RETURN u +``` + +To exclude implicit as well as explicit `null` values in a query that uses +`<` or `<=` comparison operators to limit the upper bound, you can add a check +for the lower bound: + +```aql +FOR u IN users + FILTER u.age > null AND u.age < 39 + // or potentially + //FILTER u.age >= 0 AND u.age < 39 + // which can be replaced with + //FILTER RANGE(u.age, 0, 39, true, false) + RETURN u +``` diff --git a/site/content/arangodb/oem/aql/fundamentals/bind-parameters.md b/site/content/arangodb/oem/aql/fundamentals/bind-parameters.md new file mode 100644 index 0000000000..4bb29ea3fb --- /dev/null +++ b/site/content/arangodb/oem/aql/fundamentals/bind-parameters.md @@ -0,0 +1,169 @@ +--- +title: Bind parameters in AQL +menuTitle: Bind Parameters +weight: 15 +description: >- + Bind parameters allow you to separate the query logic from literal values used + in the query and safely use user-provided input for these placeholders +--- +It is good practice to separate the query text from the literal values because +it prevents (malicious) injection of keywords and other collection names into an +existing query. This injection would be dangerous because it may change the +meaning of an existing query. + +Using bind parameters, the meaning of an existing query cannot be changed. +Bind parameters can be used everywhere in a query where literals can be used. +This lets you turn literals into a sort of variables to reuse the same query +with different parameterization. + +## Syntax + +The general syntax for bind parameters is `@name` where `@` signifies that this +is a value bind parameter and *name* is the actual parameter name. It can be +used to substitute values in a query. + +```aql +RETURN @value +``` + +For collections, there is a slightly different syntax `@@coll` where `@@` +signifies that it is a collection bind parameter and *coll* is the parameter +name. + +```aql +FOR doc IN @@coll + RETURN doc +``` + +Keywords and other language constructs cannot be replaced by bind values, such +as `FOR`, `FILTER`, `IN`, `INBOUND` or function calls. + +Bind parameter names must start with any of the letters *a* to *z* (upper or +lower case) or a digit (*0* to *9*), and can be followed by any letter, digit +or the underscore symbol. + +They must not be quoted in the query code: + +```aql +FILTER u.name == "@name" // wrong +FILTER u.name == @name // correct +``` + +```aql +FOR doc IN "@@collection" // wrong +FOR doc IN @@collection // correct +``` + +If you need to do string processing (concatenation, etc.) 
in the query, you
+need to use [string functions](../functions/string.md) to do so:
+
+```aql
+FOR u IN users
+  FILTER u.id == CONCAT('prefix', @id, 'suffix') && u.name == @name
+  RETURN u
+```
+
+## Usage
+
+### General
+
+The bind parameter values need to be passed along with the query when it is
+executed, but not as part of the query text itself. In the web interface,
+there is a pane next to the query editor where the bind parameters can be
+entered. For the query below, two input fields show up to enter values for
+the parameters `id` and `name`.
+
+```aql
+FOR u IN users
+  FILTER u.id == @id && u.name == @name
+  RETURN u
+```
+
+When using `db._query()` (in arangosh, for instance), an
+object of key-value pairs can be passed for the parameters. Such an object
+can also be passed to the HTTP API endpoint `_api/cursor`, as attribute
+value for the key `bindVars`:
+
+```json
+{
+  "query": "FOR u IN users FILTER u.id == @id && u.name == @name RETURN u",
+  "bindVars": {
+    "id": 123,
+    "name": "John Smith"
+  }
+}
+```
+
+Bind parameters that are declared in the query must also be passed a parameter
+value, or the query will fail. Specifying parameters that are not declared in
+the query will result in an error too.
+
+Specific information about parameter binding can also be found in:
+
+- [AQL with Web Interface](../how-to-invoke-aql/with-the-web-interface.md)
+- [AQL with _arangosh_](../how-to-invoke-aql/with-arangosh.md)
+- [HTTP interface for AQL queries](../../develop/http-api/queries/aql-queries.md)
+
+### Nested attributes
+
+Bind parameters can be used for both the dot notation and the square
+bracket notation for sub-attribute access. They can also be chained:
+
+```aql
+LET doc = { foo: { bar: "baz" } }
+
+RETURN doc.@attr.@subattr
+// or
+RETURN doc[@attr][@subattr]
+```
+
+```json
+{
+  "attr": "foo",
+  "subattr": "bar"
+}
+```
+
+Both variants in the above example return `[ "baz" ]` as the query result.
+
+The whole attribute path, for highly nested data in particular, can also be
+specified using the dot notation and a single bind parameter, by passing an
+array of strings as parameter value. The elements of the array represent the
+attribute keys of the path:
+
+```aql
+LET doc = { a: { b: { c: 1 } } }
+RETURN doc.@attr
+```
+
+```json
+{ "attr": [ "a", "b", "c" ] }
+```
+
+The example query returns `[ 1 ]` as result. Note that `{ "attr": "a.b.c" }`
+would return the value of an attribute called `a.b.c`, not the value of
+attribute `c` with the parents `a` and `b` as `[ "a", "b", "c" ]` would.
+
+### Collection bind parameters
+
+A special type of bind parameter exists for injecting collection names. This
+type of bind parameter has a name prefixed with an additional `@` symbol, so
+`@@name` in the query.
+
+```aql
+FOR u IN @@collection
+  FILTER u.active == true
+  RETURN u
+```
+
+The second `@` will be part of the bind parameter name, which is important to
+remember when specifying the `bindVars` (note the leading `@`):
+
+```json
+{
+  "query": "FOR u IN @@collection FILTER u.active == true RETURN u",
+  "bindVars": {
+    "@collection": "users"
+  }
+}
+```
diff --git a/site/content/arangodb/oem/aql/fundamentals/data-types.md b/site/content/arangodb/oem/aql/fundamentals/data-types.md
new file mode 100644
index 0000000000..51719ddb7c
--- /dev/null
+++ b/site/content/arangodb/oem/aql/fundamentals/data-types.md
@@ -0,0 +1,290 @@
+---
+title: Data types in AQL
+menuTitle: Data types
+weight: 10
+description: >-
+  AQL supports both _primitive_ data types consisting of exactly one value and
+  _compound_ data types comprised of multiple values
+---
+The following types are available:
+
+| Data type | Description |
+|------------:|-------------|
+| **null** | An empty value, also: the absence of a value
+| **boolean** | Boolean truth value with possible values *false* and *true*
+| **number** | Signed (real) number
+| **string** | UTF-8 encoded text value
+| **array** / list | Sequence of values, referred to by their positions
+| **object** / document | Sequence of values, referred to by their names
+
+## Primitive types
+
+### Null value
+
+A `null` value can be used to represent an empty or absent value.
+It is different from a numerical value of zero (`null != 0`) and other
+*falsy* values (`false` or a zero-length string `""`).
+It is also known as *nil* or *None* in other languages.
+
+The system may return `null` in the absence of a value, for example
+if you call a [function](../functions/_index.md) with unsupported values
+as arguments or if you try to [access an attribute](accessing-data-from-collections.md)
+which does not exist.
+
+### Boolean data type
+
+The Boolean data type has two possible values, `true` and `false`.
+They represent the two truth values in logic and mathematics.
+
+### Numeric literals
+
+Numeric literals can be integers or real values (floating-point numbers).
+They can optionally be signed with the `+` or `-` symbols.
+A decimal point `.` is used as separator for the optional fractional part.
+The scientific notation (*E-notation*) is also supported.
+
+```
+ 1
+ +1
+ 42
+ -1
+-42
+ 1.23
+-99.99
+ 0.5
+ .5
+ -4.87e103
+ -4.87E103
+```
+
+The following notations are invalid and will throw a syntax error:
+
+```
+ 1.
++01.23
+00.23
+00
+```
+
+All numeric values are treated as 64-bit signed integers or 64-bit
+double-precision floating-point values internally. The internal floating-point
+format used is IEEE 754.
+
+{{< warning >}}
+When exposing any numeric integer values to JavaScript via
+[user-defined AQL functions](../user-defined-functions.md), numbers that exceed
+32-bit precision are converted to floating-point values, so large integers can lose
+some bits of precision. The same is true when converting AQL numeric results to
+JavaScript (e.g. returning them to Foxx).
+{{< /warning >}}
+
+Numeric integer literals can also be expressed as binary
+(base 2) or hexadecimal (base 16) number literals.
+
+- The prefix for binary integer literals is `0b`, e.g. `0b10101110`.
+- The prefix for hexadecimal integer literals is `0x`, e.g. `0xabcdef02`.
+
+Binary and hexadecimal integer literals can only be used for unsigned integers.
+The maximum supported value for binary and hexadecimal numeric literals is
+2^32 - 1, i.e.
+2<sup>32</sup> - 1, i.e. `0b11111111111111111111111111111111` (binary) or
+`0xffffffff` (hexadecimal).
+
+### String literals
+
+String literals must be enclosed in single or double quotes. If the quote
+character itself is to be used within the string literal, it must be escaped
+using the backslash symbol. A literal backslash also needs to be escaped with
+a backslash.
+
+```aql
+"yikes!"
+"don't know"
+"this is a \"quoted\" word"
+"this is a longer string."
+"the path separator on Windows is \\"
+
+'yikes!'
+'don\'t know'
+'this is a "quoted" word'
+'this is a longer string.'
+'the path separator on Windows is \\'
+```
+
+All string literals must be UTF-8 encoded. It is currently not possible to use
+arbitrary binary data if it is not UTF-8 encoded. A workaround for using binary
+data is to encode the data using [Base64](https://en.wikipedia.org/wiki/Base64)
+or other algorithms on the application
+side before storing it, and to decode it on the application side after retrieval.
+
+## Compound types
+
+AQL supports two compound types:
+
+- **array**: A composition of unnamed values, each accessible
+  by their positions. Sometimes called *list*.
+- **object**: A composition of named values, each accessible
+  by their names. A *document* is an object at the top level.
+
+### Arrays / Lists
+
+The first supported compound type is the array type. Arrays are effectively
+sequences of (unnamed / anonymous) values. Individual array elements can be
+accessed by their positions. The order of elements in an array is important.
+
+An *array declaration* starts with a left square bracket `[` and ends with
+a right square bracket `]`. The declaration contains zero, one or more
+*expression*s, separated from each other with the comma `,` symbol.
+Whitespace around elements is ignored in the declaration, thus line breaks,
+tab stops and blanks can be used for formatting.
+
+In the simplest case, an array is empty and thus looks like:
+
+```json
+[ ]
+```
+
+Array elements can be any legal *expression* values. Nesting of arrays is
+supported.
+
+```json
+[ true ]
+[ 1, 2, 3 ]
+[ -99, "yikes!", [ false, ["no"], [] ], 1 ]
+[ [ "fox", "marshal" ] ]
+```
+
+A trailing comma after the last element is allowed:
+
+```aql
+[
+  1,
+  2,
+  3, // trailing comma
+]
+```
+
+Individual array values can later be accessed by their positions using the `[]`
+accessor:
+
+```aql
+u.friends[0] // access 1st array element
+u.friends[-1] // access last array element
+```
+
+For more details about this array operator, see
+[Indexed value access](../operators.md#indexed-value-access).
+
+### Objects / Documents
+
+The other supported compound type is the object (or document) type. Objects are a
+composition of zero to many attributes. Each attribute is a name/value pair.
+Object attributes can be accessed individually by their names. This data type is
+also known as a dictionary, map, or associative array, among other names.
+
+Object declarations start with a left curly bracket `{` and end with a
+right curly bracket `}`. An object contains zero to many attribute declarations,
+separated from each other with the `,` symbol. Whitespace around elements is ignored
+in the declaration, thus line breaks, tab stops and blanks can be used for formatting.
+
+In the simplest case, an object is empty. Its declaration would then be:
+
+```json
+{ }
+```
+
+Each attribute in an object is a name/value pair. Name and value of an
+attribute are separated using the colon `:` symbol. The name is always a string,
+whereas the value can be of any type including sub-objects.
+
+The attribute name is mandatory; there cannot be anonymous values in an object.
+It can be specified as a quoted or unquoted string:
+
+```aql
+{ name: … }    // unquoted
+{ 'name': … }  // quoted (apostrophe / "single quote mark")
+{ "name": … }  // quoted (quotation mark / "double quote mark")
+```
+
+It must be quoted if it contains whitespace, escape sequences or characters
+other than ASCII letters (`a`-`z`, `A`-`Z`), digits (`0`-`9`),
+underscores (`_`) and dollar signs (`$`). The first character has to be a
+letter, underscore or dollar sign.
+
+If a [keyword](syntax.md#keywords) is used as an attribute name,
+then the attribute name must be quoted:
+
+```aql
+{ return: … }    // error, return is a keyword!
+{ 'return': … }  // quoted string literal (single quote marks)
+{ "return": … }  // quoted string literal (double quote marks)
+{ `return`: … }  // quoted name (backticks)
+{ ´return´: … }  // quoted name (forward ticks)
+```
+
+A trailing comma after the last element is allowed:
+
+```aql
+{
+  "a": 1,
+  "b": 2,
+  "c": 3, // trailing comma
+}
+```
+
+Attribute names can be computed using dynamic expressions, too.
+To disambiguate regular attribute names from attribute name expressions,
+computed attribute names must be enclosed in square brackets `[ … ]`:
+
+```aql
+{ [ CONCAT("test/", "bar") ] : "someValue" }
+```
+
+There is also a shorthand notation for attributes, which is handy for
+returning existing variables easily:
+
+```aql
+LET name = "Peter"
+LET age = 42
+RETURN { name, age }
+```
+
+The above is the shorthand equivalent of the generic form:
+
+```aql
+LET name = "Peter"
+LET age = 42
+RETURN { name: name, age: age }
+```
+
+Any valid expression can be used as an attribute value. That also means nested
+objects can be used as attribute values:
+
+```aql
+{ name : "Peter" }
+{ "name" : "Vanessa", "age" : 15 }
+{ "name" : "John", likes : [ "Swimming", "Skiing" ], "address" : { "street" : "Cucumber lane", "zip" : "94242" } }
+```
+
+Individual object attributes can later be accessed by their names using the
+dot `.` accessor:
+
+```aql
+u.address.city.name
+u.friends[0].name.first
+```
+
+Attributes can also be accessed using the square bracket `[]` accessor.
+In contrast to the dot accessor, the square brackets allow for expressions.
+Note that the accessor for array elements also uses square brackets:
+
+```aql
+u["address"]["city"]["name"]
+u["friends"][0]["name"]["first"]
+
+LET attr1 = "friends"
+LET attr2 = "name"
+u[attr1][0][attr2][ CONCAT("fir", "st") ]
+```
+
+For more details about these object operators, see
+[Attribute access](../operators.md#attribute-access).
diff --git a/site/content/arangodb/oem/aql/fundamentals/limitations.md b/site/content/arangodb/oem/aql/fundamentals/limitations.md
new file mode 100644
index 0000000000..1eeb785116
--- /dev/null
+++ b/site/content/arangodb/oem/aql/fundamentals/limitations.md
@@ -0,0 +1,104 @@
+---
+title: Known limitations for AQL queries
+menuTitle: Limitations
+weight: 45
+description: >-
+  AQL has restrictions with regard to the complexity of queries and the data
+  they operate on, as well as design limitations to be aware of
+---
+## Complexity limitations
+
+The following hard-coded limitations exist for AQL queries:
+
+- An AQL query cannot use more than _1000_ result registers.
+  One result register is needed for every named query variable and for
+  internal/anonymous query variables, e.g. for intermediate results.
+  Subqueries also require result registers.
+- An AQL query cannot have more than _4000_ execution nodes in its initial + query execution plan. This number includes all execution nodes of the + initial execution plan, even if some of them could be + optimized away later by the query optimizer during plan optimization. +- An AQL query cannot use more than _2048_ collections/shards. + {{< tip >}} + From version 3.10.7 onward, this limit is configurable via the + `--query.max-collections-per-query` startup option. + {{< /tip >}} +- Expressions in AQL queries cannot have a nesting of more than _500_ levels. + As an example, the expression `1 + 2 + 3 + 4` is 3 levels deep + (because it is interpreted and executed as `1 + (2 + (3 + 4))`). +- When reading any data from JSON or VelocyPack input or when serializing + any data to JSON or VelocyPack, there is a maximum recursion depth for + nested arrays and objects, which is slightly below 200. Arrays or objects + with higher nesting than this cause `Too deep nesting in Array/Object` + exceptions. + +Please note that even queries that are still below these limits may not +yield good performance, especially when they have to put together data from lots +of different collections. Please also consider that large queries (in terms of +intermediate result size or final result size) can use considerable amounts of +memory and may hit the configurable memory limits for AQL queries. + +## Design limitations + +The following design limitations are known for AQL queries: + +- Subqueries that are used inside expressions are pulled out of these + expressions and executed beforehand. That means that subqueries do not + participate in lazy evaluation of operands, for example in the + [ternary operator](../operators.md#ternary-operator). Also see + [evaluation of subqueries](subqueries.md#evaluation-of-subqueries). +- It is not possible to use a collection in a read operation after + it was used for a write operation in the same AQL query. +- In the cluster, all collections that are accessed **dynamically** by + [traversals working with collection sets](../graphs/traversals.md#working-with-collection-sets) + (instead of named graphs) must be stated in the query's initial + [`WITH` statement](../high-level-operations/with.md). To make the `WITH` statement + required in single server as well (e.g. for testing a migration to cluster), + please start the server with the option `--query.require-with`. + +## Storage engine properties + +{{< info >}} +The following restrictions and limitations do not apply to JavaScript Transactions +and Stream Transactions, including AQL queries that run inside such transactions. +Their intended use case is for smaller transactions with full transactional +guarantees. So the following only applies to standalone AQL queries. +{{< /info >}} + +Data of ongoing transactions is stored in RAM. Transactions that get too big +(in terms of number of operations involved or the total size of data created or +modified by the transaction) are committed automatically. Effectively, this +means that big user transactions are split into multiple smaller RocksDB +transactions that are committed individually. The entire user transaction does +not necessarily have ACID properties in this case. + +The following startup options can be used to control the RAM usage and automatic +intermediate commits for the RocksDB engine: + +- `--rocksdb.max-transaction-size` + + Transaction size limit (in bytes). 
Transactions store all keys and values in
+  RAM, so large transactions run the risk of causing out-of-memory situations.
+  This setting allows you to ensure that this does not happen by limiting the size
+  of any individual transaction. Transactions whose operations would consume more
+  RAM than this threshold value will abort automatically with error 32 ("resource
+  limit exceeded").
+
+- `--rocksdb.intermediate-commit-size`
+
+  If the size of all operations in a transaction reaches this threshold, the transaction
+  is committed automatically and a new transaction is started. The value is specified in bytes.
+
+- `--rocksdb.intermediate-commit-count`
+
+  If the number of operations in a transaction reaches this value, the transaction is
+  committed automatically and a new transaction is started.
+
+The above values can also be adjusted per query, for example, by setting the
+following attributes in the call to `db._query()` in the JavaScript API:
+
+- `maxTransactionSize`: transaction size limit in bytes
+- `intermediateCommitSize`: maximum total size of operations after which an intermediate
+  commit is performed automatically
+- `intermediateCommitCount`: maximum number of operations after which an intermediate
+  commit is performed automatically
diff --git a/site/content/arangodb/oem/aql/fundamentals/query-errors.md b/site/content/arangodb/oem/aql/fundamentals/query-errors.md
new file mode 100644
index 0000000000..d1b7f507fa
--- /dev/null
+++ b/site/content/arangodb/oem/aql/fundamentals/query-errors.md
@@ -0,0 +1,41 @@
+---
+title: AQL query errors
+menuTitle: Query Errors
+weight: 40
+description: >-
+  Errors can occur for queries at compile time, such as syntax errors and
+  missing collections, but warnings and errors can also occur during query
+  execution
+---
+Issuing an invalid query to the server results in a parse error if the query
+is syntactically invalid. ArangoDB detects such errors during query
+inspection and aborts further processing. The error number and an error
+message are returned so that you can fix the errors.
+
+If a query passes the parsing stage, all collections explicitly referenced in
+the query are known. If any of these collections doesn't exist, the query execution
+is aborted and an appropriate error message is returned.
+
+Under some circumstances, executing a query may also produce errors or warnings
+at runtime. This cannot be predicted from inspecting the query text alone.
+This is because query operations can be data-dependent or may only be evaluated
+at query execution time, like looking up documents dynamically or using
+document attributes that not all documents of the collection have. This can
+subsequently lead to errors or warnings if these cases are not accounted for.
+
+Some examples of runtime errors:
+
+- **Division by zero**: Raised when an attempt is made to use the value
+  `0` as the divisor in an arithmetic division or modulus operation
+- **Invalid operands for arithmetic operations**: Raised when an attempt
+  is made to use any non-numeric values as operands in arithmetic operations.
+  This includes unary (unary minus, unary plus) and binary operations (plus,
+  minus, multiplication, division, and modulus)
+- **Array expected in query**: Raised when a non-array operand is used for an
+  operation that expects an array argument operand.
+  This can happen if you
+  try to iterate over an attribute with a `FOR` operation, expecting it to be an
+  array. If the attribute doesn't exist, it has a value of `null`, which
+  cannot be looped over.
+
+See [Error codes and meanings](../../develop/error-codes.md)
+for a complete list of ArangoDB errors.
diff --git a/site/content/arangodb/oem/aql/fundamentals/query-results.md b/site/content/arangodb/oem/aql/fundamentals/query-results.md
new file mode 100644
index 0000000000..bbee073926
--- /dev/null
+++ b/site/content/arangodb/oem/aql/fundamentals/query-results.md
@@ -0,0 +1,114 @@
+---
+title: AQL query results
+menuTitle: Query Results
+weight: 35
+description: >-
+  The result set of an AQL query is always an array of values, even if it
+  returns a single element only
+---
+AQL queries and also [subqueries](subqueries.md) each produce an array with zero
+or more elements.
+
+An empty array typically means that no (matching) data was found to act upon, or
+that a write query didn't specify anything to return.
+
+```aql
+FOR doc IN emptyCollection
+  RETURN doc // no documents
+```
+
+```aql
+FOR u IN users
+  FILTER u.age == -1 // no matches
+  RETURN u
+```
+
+```aql
+UPDATE { id: 2, active: true } IN users
+// no RETURN operation
+```
+
+The result set of the above examples is empty:
+
+```json
+[ ]
+```
+
+If there is a single result, you get an array with one element back, not the
+result value only.
+
+```aql
+FOR u IN users
+  LIMIT 1
+  RETURN u.name
+```
+
+```json
+[ "John" ]
+```
+
+If there are multiple results, you get an array with many elements back.
+
+```aql
+FOR u IN users
+  RETURN u.name
+```
+
+```json
+[
+  "John",
+  "Vanessa",
+  "Amy"
+]
+```
+
+The individual values in the result array of a query may or may not have a
+homogeneous structure, depending on what is actually queried.
+
+For example, the individual documents of a collection can use different sets of
+attribute names. When returning data from a collection with inhomogeneous
+documents without modification, the result values have an inhomogeneous structure,
+too. Each result value itself is a document:
+
+```aql
+FOR u IN users
+  RETURN u
+```
+
+```json
+[
+  { "id": 1, "name": "John", "active": false },
+  { "age": 32, "id": 2, "name": "Vanessa" },
+  { "friends": [ "John", "Vanessa" ], "id": 3, "name": "Amy" }
]
+```
+
+However, if a fixed set of attributes from the collection is queried, then the
+query result values have a homogeneous structure. Each result value is
+still (a projection of) a document:
+
+```aql
+FOR u IN users
+  RETURN { "id": u.id, "name": u.name }
+```
+
+```json
+[
+  { "id": 1, "name": "John" },
+  { "id": 2, "name": "Vanessa" },
+  { "id": 3, "name": "Amy" }
+]
+```
+
+It is also possible to query scalar values only. In this case, the result set
+is an array of scalars, and each result value is a scalar value:
+
+```aql
+FOR u IN users
+  RETURN u.id
+```
+
+```json
+[ 1, 2, 3 ]
+```
diff --git a/site/content/arangodb/oem/aql/fundamentals/subqueries.md b/site/content/arangodb/oem/aql/fundamentals/subqueries.md
new file mode 100644
index 0000000000..2efef66361
--- /dev/null
+++ b/site/content/arangodb/oem/aql/fundamentals/subqueries.md
@@ -0,0 +1,188 @@
+---
+title: Combining queries with subqueries in AQL
+menuTitle: Subqueries
+weight: 30
+description: >-
+  Subqueries let you form complex requests and allow you to process more data
+  with a single query
+---
+## How to use subqueries
+
+Wherever an expression is allowed in AQL, a subquery can be placed.
+A subquery is a query part that can introduce its own local variables without
+affecting variables and values in its outer scope(s).
+
+It is required that subqueries be put inside parentheses `(` and `)` to
+explicitly mark their start and end points:
+
+```aql
+FOR p IN persons
+  LET recommendations = ( // subquery start
+    FOR r IN recommendations
+      FILTER p.id == r.personId
+      SORT p.rank DESC
+      LIMIT 10
+      RETURN r
+  ) // subquery end
+  RETURN { person : p, recommendations : recommendations }
+```
+
+A subquery's result can be assigned to a variable with
+[`LET`](../high-level-operations/let.md) as shown above, so that it can be referenced
+multiple times or simply to improve query readability.
+
+Function calls also use parentheses and AQL allows you to omit an extra pair if
+you want to use a subquery as the sole argument of a function, e.g.
+`MAX()` instead of `MAX(())`:
+
+```aql
+FOR p IN persons
+  COLLECT city = p.city INTO g
+  RETURN {
+    city : city,
+    numPersons : LENGTH(g),
+    maxRating: MAX( // subquery start
+      FOR r IN g
+        RETURN r.p.rating
+    ) // subquery end
+  }
+```
+
+The extra wrapping is required if there is more than one function argument,
+however, e.g. `NOT_NULL((RETURN "ok"), "fallback")`.
+
+Subqueries may also include other subqueries.
+
+## Subquery results and unwinding
+
+Subqueries always return a result **array**, even if there is only
+a single return value:
+
+```aql
+RETURN ( RETURN 1 )
+```
+
+```json
+[ [ 1 ] ]
+```
+
+To avoid such a nested data structure, [`FIRST()`](../functions/array.md#first)
+can be used, for example:
+
+```aql
+RETURN FIRST( RETURN 1 )
+```
+
+```json
+[ 1 ]
+```
+
+To unwind the result array of a subquery so that each element is returned as
+a top-level element in the overall query result, you can use a `FOR` loop:
+
+```aql
+FOR elem IN (RETURN 1..3) // [1,2,3]
+  RETURN elem
+```
+
+```json
+[
+  1,
+  2,
+  3
+]
+```
+
+Without unwinding, the query would be `RETURN (RETURN 1..3)` and the result
+a nested array `[ [ 1, 2, 3 ] ]` with a single top-level element.
+
+## Evaluation of subqueries
+
+Subqueries that are used inside expressions are pulled out of these
+expressions and executed beforehand. That means that subqueries do not
+participate in lazy evaluation of operands, for example in the
+[ternary operator](../operators.md#ternary-operator).
+
+Consider the following query:
+
+```aql
+RETURN RAND() > 0.5 ? (RETURN 1) : 0
+```
+
+It gets transformed into something like this, with the calculation of the
+subquery happening before the evaluation of the condition:
+
+```aql
+LET temp1 = (RETURN 1)
+LET temp2 = RAND() > 0.5 ? temp1 : 0
+RETURN temp2
+```
+
+The subquery is executed regardless of the condition. In other words, there is
+no short-circuiting that would prevent the subquery from running when the
+condition evaluates to `false`. You may need to take this into account to
+avoid query errors like
+
+> Query: AQL: collection or array expected as operand to FOR loop; you provided
+> a value of type 'null' (while executing)
+
+```aql
+LET maybe = DOCUMENT("coll/does_not_exist")
+LET dependent = maybe ? (
+  FOR attr IN ATTRIBUTES(maybe)
+    RETURN attr
+) : "document not found"
+RETURN dependent
+```
+
+The problem is that the subquery is executed under all circumstances, despite
+the check whether `DOCUMENT()` found a document or not. It does not take into
+account that `maybe` can be `null`, which cannot be iterated over with `FOR`.
+
+A possible solution is to fall back to an empty array in the subquery to
+effectively prevent the loop body from being run:
+
+```aql
+LET maybe = DOCUMENT("coll/does_not_exist")
+LET dependent = maybe ? (
+  FOR attr IN NOT_NULL(ATTRIBUTES(maybe || {}), [])
+    RETURN attr
+) : "document not found"
+RETURN dependent
+```
+
+The additional fallback `maybe || {}` prevents a query warning
+
+> invalid argument type in call to function 'ATTRIBUTES()'
+
+that originates from a `null` value getting passed to the `ATTRIBUTES()`
+function that expects an object.
+
+Similarly, when you use subqueries as sub-expressions that are combined with
+logical `AND` or `OR`, the subqueries are always executed:
+
+```aql
+RETURN false AND (RETURN ASSERT(false, "executed"))
+```
+
+```aql
+RETURN true OR (RETURN ASSERT(false, "executed"))
+```
+
+If the first operand of a logical `AND` is `false`, the overall result is
+`false` regardless of the second operand. If the first operand of a logical `OR`
+is `true`, the overall result is `true` regardless of the second operand.
+However, the subqueries are run nonetheless, causing both example queries to fail.
+
+You can prevent the subqueries from executing by adding a `FILTER` operation at
+the start of the subquery that checks the value of the logical operator's first
+operand, negating the value in the case of an `OR`:
+
+```aql
+LET cond = false
+RETURN cond AND (FILTER cond RETURN ASSERT(false, "executed"))
+```
+
+```aql
+LET cond = true
+RETURN cond OR (FILTER !cond RETURN ASSERT(false, "executed"))
+```
diff --git a/site/content/arangodb/oem/aql/fundamentals/syntax.md b/site/content/arangodb/oem/aql/fundamentals/syntax.md
new file mode 100644
index 0000000000..dd2177a519
--- /dev/null
+++ b/site/content/arangodb/oem/aql/fundamentals/syntax.md
@@ -0,0 +1,347 @@
+---
+title: AQL Syntax
+menuTitle: Syntax
+weight: 5
+description: >-
+  Query types, whitespace, comments, keywords, and names in the AQL language
+  explained
+---
+## Query types
+
+An AQL query must either return a result (indicated by usage of the `RETURN`
+keyword) or execute a data-modification operation (indicated by usage
+of one of the keywords `INSERT`, `UPDATE`, `REPLACE`, `REMOVE` or `UPSERT`). The AQL
+parser will return an error if it detects more than one data-modification
+operation in the same query or if it cannot figure out if the query is meant
+to be a data retrieval or a modification operation.
+
+AQL only allows **one** query in a single query string; thus semicolons to
+indicate the end of one query and separate multiple queries (as seen in SQL) are
+not allowed.
+
+## Whitespace
+
+Whitespace (blanks, carriage returns, line feeds, and tab stops) can be used
+in the query text to increase its readability. Tokens have to be separated by
+any number of whitespace characters. Whitespace within strings or names must be
+enclosed in quotes in order to be preserved.
+
+## Comments
+
+Comments can be embedded at any position in a query. The text contained in the
+comment is ignored by the AQL parser.
+
+Multi-line comments cannot be nested, which means that subsequent comment starts
+within comments are ignored and the first comment end ends the comment.
+
+AQL supports two types of comments:
+
+- Single-line comments: These start with a double forward slash and end at
+  the end of the line, or the end of the query string (whichever is first).
+- Multi-line comments: These start with a forward slash and asterisk, and
+  end with an asterisk and a following forward slash. They can span as many
+  lines as necessary.
+ +```aql +/* this is a comment */ RETURN 1 +/* these */ RETURN /* are */ 1 /* multiple */ + /* comments */ 1 +/* this is + a multi line + comment */ +// a single line comment +``` + +## Keywords + +On the top level, AQL offers the following +[high-level operations](../high-level-operations/_index.md): + +| Operation | Description +|:----------|:----------- +| `FOR` | Array iteration +| `RETURN` | Results projection +| `FILTER` | Non-View results filtering +| `SEARCH` | View results filtering +| `SORT` | Result sorting +| `LIMIT` | Result slicing +| `LET` | Variable assignment +| `COLLECT` | Result grouping +| `WINDOW` | Aggregations over related rows +| `INSERT` | Insertion of new documents +| `UPDATE` | (Partial) update of existing documents +| `REPLACE` | Replacement of existing documents +| `REMOVE` | Removal of existing documents +| `UPSERT` | Insertion of new or update of existing documents +| `WITH` | Collection declaration + +Each of the above operations can be initiated in a query by using a keyword of +the same name. An AQL query can (and typically does) consist of multiple of the +above operations. + +An example AQL query may look like this: + +```aql +FOR u IN users + FILTER u.type == "newbie" && u.active == true + RETURN u.name +``` + +In this example query, the terms `FOR`, `FILTER`, and `RETURN` initiate the +higher-level operation according to their name. These terms are also keywords, +meaning that they have a special meaning in the language. + +For example, the query parser will use the keywords to find out which high-level +operations to execute. That also means keywords can only be used at certain +locations in a query. This also makes all keywords **reserved words** that must +not be used for other purposes than they are intended for. + +For example, it is not possible to use a keyword as literal unquoted string +(identifier) for a collection or attribute name. If a collection or attribute +needs to have the same name as a keyword, then the collection or attribute name +needs to be quoted in the query (also see [Names](#names)). + +Keywords are case-insensitive, meaning they can be specified in lower, upper, or +mixed case in queries. In this documentation, all keywords are written in upper +case to make them distinguishable from other query parts. + +There are a few more keywords in addition to the higher-level operation keywords. +Additional keywords may be added in future versions of ArangoDB. +The complete list of keywords is currently: + +- `AGGREGATE` +- `ALL` +- `ALL_SHORTEST_PATHS` +- `AND` +- `ANY` +- `ASC` +- `COLLECT` +- `DESC` +- `DISTINCT` +- `FALSE` +- `FILTER` +- `FOR` +- `GRAPH` +- `IN` +- `INBOUND` +- `INSERT` +- `INTO` +- `K_PATHS` +- `K_SHORTEST_PATHS` +- `LET` +- `LIKE` +- `LIMIT` +- `NONE` +- `NOT` +- `NULL` +- `OR` +- `OUTBOUND` +- `REMOVE` +- `REPLACE` +- `RETURN` +- `SHORTEST_PATH` +- `SORT` +- `TRUE` +- `UPDATE` +- `UPSERT` +- `WINDOW` +- `WITH` +{.columns-3} + +On top of that, there are a few words used in language constructs which are not +reserved keywords. You can use them as collection or attribute names +without having to quote them. 
The query parser can identify them as keyword-like
+based on the context:
+
+- `KEEP` –
+  [COLLECT](../high-level-operations/collect.md#discarding-obsolete-variables)
+  operation variant
+- `COUNT` –
+  [COLLECT](../high-level-operations/collect.md#group-length-calculation)
+  operation variant (`WITH COUNT INTO`)
+- `OPTIONS` –
+  [FOR](../high-level-operations/for.md#options) /
+  [SEARCH](../high-level-operations/search.md#search-options) /
+  [COLLECT](../high-level-operations/collect.md#collect-options) /
+  [INSERT](../high-level-operations/insert.md#query-options) /
+  [UPDATE](../high-level-operations/update.md#query-options) /
+  [REPLACE](../high-level-operations/replace.md#query-options) /
+  [UPSERT](../high-level-operations/upsert.md#query-options) /
+  [REMOVE](../high-level-operations/remove.md#query-options) operation /
+  [Graph Traversal](../graphs/traversals.md) /
+  [Shortest Path](../graphs/shortest-path.md#path-search-options) /
+  [k Shortest Paths](../graphs/k-shortest-paths.md#path-search-options)
+- `PRUNE` –
+  [Graph Traversal](../graphs/traversals.md#pruning) (`FOR` operation variant)
+- `SEARCH` –
+  [SEARCH](../high-level-operations/search.md) operation
+- `TO` –
+  [Shortest Path](../graphs/shortest-path.md) /
+  [All Shortest Paths](../graphs/all-shortest-paths.md) /
+  [k Shortest Paths](../graphs/k-shortest-paths.md) /
+  [k Paths](../graphs/k-paths.md)
+
+Last but not least, there are special variables which are available in certain
+contexts. Unlike keywords, they are **case-sensitive**:
+
+- `CURRENT` –
+  available in
+  [array inline expressions](../operators.md#inline-expressions) and the
+  [question mark operator](../operators.md#question-mark-operator)
+- `NEW` –
+  available after
+  [INSERT](../high-level-operations/insert.md#returning-the-inserted-documents) /
+  [UPDATE](../high-level-operations/update.md#returning-the-modified-documents) /
+  [REPLACE](../high-level-operations/replace.md#returning-the-modified-documents) /
+  [UPSERT](../high-level-operations/upsert.md#returning-documents)
+  operation
+- `OLD` –
+  available after
+  [UPDATE](../high-level-operations/update.md#returning-the-modified-documents) /
+  [REPLACE](../high-level-operations/replace.md#returning-the-modified-documents) /
+  [UPSERT](../high-level-operations/upsert.md#returning-documents) /
+  [REMOVE](../high-level-operations/remove.md#returning-the-removed-documents)
+  operation
+
+If you define a variable with the same name in the same scope, then its value
+is and remains whatever you set it to. Hence, you need to avoid these names
+for your own variables if you want to access the special variable values.
+
+## Names
+
+In general, names are used to identify the following things in AQL queries:
+- collections
+- attributes
+- variables
+- functions
+
+Names in AQL are always case-sensitive.
+The maximum supported length for collection/View names is 256 bytes.
+Variable names can be longer, but this is discouraged.
+
+Keywords should not be used as names. If you want to use a reserved keyword as
+a name anyway, the name must be enclosed in backticks or forward ticks. This is referred to as _quoting_.
+
+```aql
+FOR doc IN `filter`
+  RETURN doc.`sort`
+```
+
+Due to the backticks, `filter` and `sort` are interpreted as names and not as
+keywords here.
+
+You can also use forward ticks:
+
+```aql
+FOR f IN ´filter´
+  RETURN f.´sort´
+```
+
+Instead of ticks, you may use the bracket notation for attribute access:
+
+```aql
+FOR f IN `filter`
+  RETURN f["sort"]
+```
+
+`sort` is a string literal in quote marks in this alternative and thus does not
+conflict with the reserved keyword.
+
+Quoting with ticks is also required if certain characters such as the
+hyphen-minus (`-`) are contained in a name, namely if they are used for
+[operators](../operators.md) in AQL:
+
+```aql
+LET `my-var` = 42
+```
+
+### Collection names
+
+You can typically use collection names in queries as they are. If a collection
+happens to have the same name as a keyword, the name must be enclosed in
+backticks or forward ticks.
+
+Quoting with ticks is also required if special characters such as the
+hyphen-minus (`-`) are contained in a collection name:
+
+```aql
+FOR doc IN `my-coll`
+  RETURN doc
+```
+
+The collection `my-coll` has a dash in its name, but `-` is an arithmetic
+operator for subtraction in AQL. The backticks quote the collection name to
+refer to the collection correctly.
+
+If you use extended collection and View names
+([`--database.extended-names` startup option](../../components/arangodb-server/options.md#--databaseextended-names)),
+they may contain spaces, or non-ASCII characters such as Japanese or Arabic
+letters, emojis, accented letters, and other UTF-8 characters.
+Quoting is required in these cases, too:
+
+```aql
+FOR doc IN ´🥑~колекція =)´
+  RETURN doc
+```
+
+The collection name contains characters that are allowed using the extended
+naming constraints and is quoted with forward ticks.
+
+Note that quoting the name with `"` or `'` is not possible for collections,
+because collection names cannot be specified as string literals in quote marks.
+
+For information about the naming constraints for collections, see
+[Collection names](../../concepts/data-structure/collections.md#collection-names).
+
+### Attribute names
+
+When referring to attributes of documents from a collection, the fully qualified
+attribute name must be used. This is because multiple collections with ambiguous
+attribute names may be used in a query. To avoid any ambiguity, it is not
+allowed to refer to an unqualified attribute name.
+
+Also see the naming restrictions for
+[Attribute names](../../concepts/data-structure/documents/_index.md#attribute-names).
+
+```aql
+FOR u IN users
+  FOR f IN friends
+    FILTER u.active == true && f.active == true && u.id == f.userId
+    RETURN u.name
+```
+
+In the above example, the attribute names `active`, `name`, `id`, and `userId`
+are qualified using the variables of the `FOR` loops they belong to (`u` and `f`,
+respectively).
+
+### Variable names
+
+AQL allows you to assign values to additional variables in a query.
+All variables that are assigned a value must have a name that is unique within
+the context of the query.
+
+```aql
+FOR u IN users
+  LET friends = u.friends
+  RETURN { "name" : u.name, "friends" : friends }
+```
+
+In the above query, `users` is a collection name, and both `u` and `friends` are
+variable names. This is because the `FOR` and `LET` operations need target
+variables to store their intermediate results.
+
+Variable names should be different from the name of any collection used in
+the same query to avoid shadowing, which can render a collection with the same
+name inaccessible in the query after the variable assignment:
+
+```aql
+LET users = []
+FOR u IN users // iterates over the "users" variable, not the "users" collection
+  RETURN u
+```
+
+Allowed characters in variable names are the letters `a` to `z` (both in lower
+and upper case), the numbers `0` to `9`, the underscore (`_`) symbol and the
+dollar (`$`) sign. A variable name must not start with a number. If a variable
+name starts with one or multiple underscore characters, the underscore(s) must
+be followed by at least one letter (a-z or A-Z). The dollar sign can only be used
+as the very first character in a variable name and must be followed by a letter.
diff --git a/site/content/arangodb/oem/aql/fundamentals/type-and-value-order.md b/site/content/arangodb/oem/aql/fundamentals/type-and-value-order.md
new file mode 100644
index 0000000000..bfe5a3baee
--- /dev/null
+++ b/site/content/arangodb/oem/aql/fundamentals/type-and-value-order.md
@@ -0,0 +1,137 @@
+---
+title: Type and value order in AQL
+menuTitle: Type and value order
+weight: 20
+description: >-
+  AQL uses a set of rules for equality checks and comparisons that takes both
+  the data types and the actual values into account
+---
+When checking for equality or inequality, or when determining the sort order of
+values, AQL uses a deterministic algorithm for the comparison.
+
+The compared operands are first compared by their data types, and only by their
+data values if the operands have the same data types.
+
+The following type order is used when comparing data types:
+
+```
+null < bool < number < string < array (or list) < object (or document)
+```
+
+This means `null` is the smallest type in AQL and *object* is the type with
+the highest order. If the compared operands have a different type, then the
+comparison result is determined and the comparison is finished.
+
+For example, the boolean `true` value is always less than any numeric or
+string value, any array (even an empty array), and any object. Additionally, any
+string value (even an empty string) is always greater than any numeric
+value and any boolean value (`true` or `false`).
+
+```aql
+null < false
+null < true
+null < 0
+null < ''
+null < ' '
+null < '0'
+null < 'abc'
+null < [ ]
+null < { }
+
+false < true
+false < 0
+false < ''
+false < ' '
+false < '0'
+false < 'abc'
+false < [ ]
+false < { }
+
+true < 0
+true < ''
+true < ' '
+true < '0'
+true < 'abc'
+true < [ ]
+true < { }
+
+0 < ''
+0 < ' '
+0 < '0'
+0 < 'abc'
+0 < [ ]
+0 < { }
+
+'' < ' '
+'' < '0'
+'' < 'abc'
+'' < [ ]
+'' < { }
+
+[ ] < { }
+```
+
+If the two compared operands have the same data types, then the operands' values
+are compared. For the primitive types (null, boolean, number, and string), the
+result is defined as follows:
+
+- **null**: `null` is equal to `null`
+- **boolean**: `false` is less than `true`
+- **number**: numeric values are ordered by their cardinal value
+- **string**: string values are ordered using a localized comparison, using the configured
+  [server language](../../components/arangodb-server/options.md#--default-language)
+  for sorting according to the alphabetical order rules of that language
+
+Note: unlike in SQL, `null` can be compared to any value, including `null`
+itself, without the result being converted into `null` automatically.
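+
+For example, you can check some of these rules directly by evaluating
+comparisons in a small illustrative query. All three comparisons below
+evaluate to `true`:
+
+```aql
+RETURN [ null == null, null < false, '' < ' ' ]
+```
+
+```json
+[ [ true, true, true ] ]
+```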
+
+For compound types (array and object), the following special rules are applied:
+
+Two **array** values are compared by comparing their individual elements position by
+position, starting at the first element. For each position, the element types
+are compared first. If the types are not equal, the comparison result is
+determined, and the comparison is finished. If the types are equal, then the
+values of the two elements are compared. If one of the arrays is finished and
+the other array still has an element at a compared position, then `null` is
+used as the element value of the fully traversed array.
+
+If an array element is itself a compound value (an array or an object), then the
+comparison algorithm checks the element's sub-values recursively.
+
+```aql
+[ ] < [ 0 ]
+[ 1 ] < [ 2 ]
+[ 1, 2 ] < [ 2 ]
+[ 99, 99 ] < [ 100 ]
+[ false ] < [ true ]
+[ false, 1 ] < [ false, '' ]
+```
+
+Two **object** operands are compared by checking attribute names and values. The
+attribute names are compared first. Before attribute names are compared, a
+combined array of all attribute names from both operands is created and sorted
+lexicographically. This means that the order in which attributes are declared
+in an object is not relevant when comparing two objects.
+
+The combined and sorted array of attribute names is then traversed, and the
+respective attributes from the two compared operands are looked up. If one
+of the objects does not have an attribute with the sought name, its attribute
+value is considered to be `null`. Finally, the attribute values of both
+objects are compared using the aforementioned data type and value comparison.
+The comparisons are performed for all object attributes until there is an
+unambiguous comparison result. If an unambiguous comparison result is found, the
+comparison is finished. If there is no unambiguous comparison result, the two
+compared objects are considered equal.
+
+```aql
+{ } == { "a" : null }
+
+{ } < { "a" : 1 }
+{ "a" : 1 } < { "a" : 2 }
+{ "b" : 1 } < { "a" : 0 }
+{ "a" : { "c" : true } } < { "a" : { "c" : 0 } }
+{ "a" : { "c" : true, "a" : 0 } } < { "a" : { "c" : false, "a" : 1 } }
+
+{ "a" : 1, "b" : 2 } == { "b" : 2, "a" : 1 }
+```
diff --git a/site/content/arangodb/oem/aql/graphs/_index.md b/site/content/arangodb/oem/aql/graphs/_index.md
new file mode 100644
index 0000000000..ef7d7d79e1
--- /dev/null
+++ b/site/content/arangodb/oem/aql/graphs/_index.md
@@ -0,0 +1,47 @@
+---
+title: Graphs in AQL
+menuTitle: Graphs
+weight: 35
+description: >-
+  You can perform graph traversals and path searches on named graphs as well as
+  collection sets with AQL
+---
+There are multiple ways to work with [graphs in ArangoDB](../../graphs/_index.md),
+as well as different ways to query your graphs using AQL.
+
+The two options for managing graphs are to either use
+
+- named graphs where ArangoDB manages the collections involved in one graph, or
+- graph functions on a combination of document and edge collections.
+
+Named graphs can be defined through the [graph module](../../graphs/general-graphs/_index.md)
+or via the [web interface](../../components/web-interface/_index.md).
+The definition contains the name of the graph, and the vertex and edge collections
+involved. Since the management functions are layered on top of simple sets of
+document and edge collections, you can also use regular AQL functions to work with them.
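+
+For example, a named graph can be created in arangosh through the graph
+module. The following is a minimal sketch, with `social`, `persons`, and
+`knows` as placeholder names for the graph and its collections:
+
+```js
+// Create a named graph "social" from one edge definition.
+// The vertex collection "persons" and the edge collection "knows"
+// are created automatically if they do not exist yet.
+var graph_module = require("@arangodb/general-graph");
+var graph = graph_module._create("social", [
+  graph_module._relation("knows", "persons", "persons")
+]);
+```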
+ +Both variants (named graphs and loosely coupled collection sets a.k.a. anonymous graphs) +are supported by the AQL language constructs for graph querying. These constructs +make full use of optimizations and therefore best performance is to be expected: + +- [AQL Traversals](traversals.md) to follow edges connected to a start vertex, + up to a variable depth. It can be combined with AQL filter conditions. + +- [AQL Shortest Path](shortest-path.md) to find one shortest path + between two given documents. + +- [AQL All Shortest Paths](all-shortest-paths.md) to find all shortest + paths between two given documents. + +- [AQL k Shortest Paths](k-shortest-paths.md) to find the first *k* + paths in order of length (or weight) between two given documents. + +- [AQL k Paths](k-paths.md) to find all paths between two given documents. + +These types of queries are only useful if you use edge collections and/or graphs in +your data model. + +{{< info >}} +New to graphs? [Take our free graph course for freshers](https://www.arangodb.com/arangodb-graph-course/) +and get from zero knowledge to advanced query techniques. +{{< /info >}} diff --git a/site/content/arangodb/oem/aql/graphs/all-shortest-paths.md b/site/content/arangodb/oem/aql/graphs/all-shortest-paths.md new file mode 100644 index 0000000000..571a6857d3 --- /dev/null +++ b/site/content/arangodb/oem/aql/graphs/all-shortest-paths.md @@ -0,0 +1,197 @@ +--- +title: All Shortest Paths in AQL +menuTitle: All Shortest Paths +weight: 20 +description: >- + Find all paths of shortest length between two vertices +--- +## General query idea + +This type of query finds all paths of shortest length between two given +documents (*startVertex* and *targetVertex*) in your graph. + +Every returned path is a JSON object with two attributes: + +- An array containing the `vertices` on the path. +- An array containing the `edges` on the path. + +**Example** + +A visual representation of the example graph: + +![Train Connection Map](../../../../images/train_map.png) + +Each ellipse stands for a train station with the name of the city written inside +of it. They are the vertices of the graph. Arrows represent train connections +between cities and are the edges of the graph. + +Assuming that you want to go from **Carlisle** to **London** by train, the +expected two shortest paths are: + +1. Carlisle – Birmingham – London +2. Carlisle – York – London + +Another path that connects Carlisle and London is +Carlisle – Glasgow – Edinburgh – York – London, but it has two more stops and +is therefore not a path of the shortest length. + +## Syntax + +The syntax for All Shortest Paths queries is similar to the one for +[Shortest Path](shortest-path.md) and there are also two options to +either use a named graph or a set of edge collections. It only emits a path +variable however, whereas `SHORTEST_PATH` emits a vertex and an edge variable. + +### Working with named graphs + +```aql +FOR path + IN OUTBOUND|INBOUND|ANY ALL_SHORTEST_PATHS + startVertex TO targetVertex + GRAPH graphName +``` + +- `FOR`: Emits the variable **path** which contains one shortest path as an + object, with the `vertices` and `edges` of the path. +- `IN` `OUTBOUND|INBOUND|ANY`: Defines in which direction + edges are followed (outgoing, incoming, or both) +- `ALL_SHORTEST_PATHS`: The keyword to compute All Shortest Paths +- **startVertex** `TO` **targetVertex** (both string\|object): The two vertices between + which the paths are computed. 
+  This can be specified in the form of
+  an ID string or in the form of a document with the attribute `_id`. All other
+  values result in a warning and an empty result. If one of the specified
+  documents does not exist, the result is empty as well and there is no warning.
+- `GRAPH` **graphName** (string): The name identifying the named graph. Its vertex and
+  edge collections are looked up for the path search.
+
+{{< info >}}
+All Shortest Paths traversals do not support edge weights.
+{{< /info >}}
+
+### Working with collection sets
+
+```aql
+FOR path
+  IN OUTBOUND|INBOUND|ANY ALL_SHORTEST_PATHS
+  startVertex TO targetVertex
+  edgeCollection1, ..., edgeCollectionN
+```
+
+Instead of `GRAPH graphName` you can specify a list of edge collections.
+The involved vertex collections are determined by the edges of the given
+edge collections.
+
+### Traversing in mixed directions
+
+For All Shortest Paths with a list of edge collections, you can optionally specify the
+direction for some of the edge collections. Say, for example, you have three edge
+collections *edges1*, *edges2* and *edges3*, where in *edges2* the direction
+has no relevance, but in *edges1* and *edges3* the direction should be taken into
+account. In this case you can use `OUTBOUND` as a general search direction and `ANY`
+specifically for *edges2* as follows:
+
+```aql
+FOR path IN OUTBOUND ALL_SHORTEST_PATHS
+  startVertex TO targetVertex
+  edges1, ANY edges2, edges3
+```
+
+All collections in the list that do not specify their own direction use the
+direction defined after `IN` (here: `OUTBOUND`). This allows using a different
+direction for each collection in your path search.
+
+## Examples
+
+Load an example graph to get a named graph that reflects some possible
+train connections in Europe and North America:
+
+![Train Connection Map](../../../../images/train_map.png)
+
+```js
+---
+name: GRAPHASP_01_create_graph
+description: ''
+---
+~addIgnoreCollection("places");
+~addIgnoreCollection("connections");
+var examples = require("@arangodb/graph-examples/example-graph");
+var graph = examples.loadGraph("kShortestPathsGraph");
+db.places.toArray();
+db.connections.toArray();
+```
+
+Suppose you want to query a route from **Carlisle** to **London**, and
+compare the outputs of `SHORTEST_PATH`, `K_SHORTEST_PATHS` and `ALL_SHORTEST_PATHS`.
+Note that `SHORTEST_PATH` returns any of the shortest paths, whereas
+`ALL_SHORTEST_PATHS` returns all of them. `K_SHORTEST_PATHS` returns the
+shortest paths first but continues with longer paths, until it has found all routes
+or reaches the defined limit (the number of paths).
+ +Using `SHORTEST_PATH` to get one shortest path: + +```aql +--- +name: GRAPHASP_01_Carlisle_to_London +description: '' +dataset: kShortestPathsGraph +--- +FOR v, e IN OUTBOUND SHORTEST_PATH 'places/Carlisle' TO 'places/London' +GRAPH 'kShortestPathsGraph' + RETURN { place: v.label } +``` + +Using `ALL_SHORTEST_PATHS` to get both shortest paths: + +```aql +--- +name: GRAPHASP_02_Carlisle_to_London +description: '' +dataset: kShortestPathsGraph +--- +FOR p IN OUTBOUND ALL_SHORTEST_PATHS 'places/Carlisle' TO 'places/London' +GRAPH 'kShortestPathsGraph' + RETURN { places: p.vertices[*].label } +``` + +Using `K_SHORTEST_PATHS` without a limit to get all paths in order of +increasing length: + +```aql +--- +name: GRAPHASP_03_Carlisle_to_London +description: '' +dataset: kShortestPathsGraph +--- +FOR p IN OUTBOUND K_SHORTEST_PATHS 'places/Carlisle' TO 'places/London' +GRAPH 'kShortestPathsGraph' + RETURN { places: p.vertices[*].label } +``` + +If you ask for routes that don't exist, you get an empty result +(from **Carlisle** to **Toronto**): + +```aql +--- +name: GRAPHASP_04_Carlisle_to_Toronto +description: '' +dataset: kShortestPathsGraph +--- +FOR p IN OUTBOUND ALL_SHORTEST_PATHS 'places/Carlisle' TO 'places/Toronto' +GRAPH 'kShortestPathsGraph' + RETURN { + places: p.vertices[*].label + } +``` + +And finally clean up by removing the named graph: + +```js +--- +name: GRAPHASP_99_drop_graph +description: '' +--- +var examples = require("@arangodb/graph-examples/example-graph"); +examples.dropGraph("kShortestPathsGraph"); +~removeIgnoreCollection("places"); +~removeIgnoreCollection("connections"); +``` diff --git a/site/content/arangodb/oem/aql/graphs/k-paths.md b/site/content/arangodb/oem/aql/graphs/k-paths.md new file mode 100644 index 0000000000..e4da13c5e3 --- /dev/null +++ b/site/content/arangodb/oem/aql/graphs/k-paths.md @@ -0,0 +1,232 @@ +--- +title: k Paths in AQL +menuTitle: k Paths +weight: 30 +description: >- + Find all paths between two vertices with a fixed range of path lengths +--- +## General query idea + +This type of query finds all paths between two given documents +(*startVertex* and *targetVertex*) in your graph. The paths are restricted +by a minimum and maximum length that you specify. + +Every such path is returned as a JSON object with two components: + +- an array containing the `vertices` on the path +- an array containing the `edges` on the path + +**Example** + +Here is an example graph to explain how the k Paths algorithm works: + +![Train Connection Map](../../../../images/train_map.png) + +Each ellipse stands for a train station with the name of the city written inside +of it. They are the vertices of the graph. Arrows represent train connections +between cities and are the edges of the graph. The numbers near the arrows +describe how long it takes to get from one station to another. They are used +as edge weights. + +Assume that you want to go from **Aberdeen** to **London** by train. + +You have a couple of alternatives: + +a) Straight way + + 1. Aberdeen + 2. Leuchars + 3. Edinburgh + 4. York + 5. London + +b) Detour at York + + 1. Aberdeen + 2. Leuchars + 3. Edinburgh + 4. York + 5. **Carlisle** + 6. **Birmingham** + 7. London + +c) Detour at Edinburgh + + 1. Aberdeen + 2. Leuchars + 3. Edinburgh + 4. **Glasgow** + 5. **Carlisle** + 6. **Birmingham** + 7. London + +d) Detour at Edinburgh to York + + 1. Aberdeen + 2. Leuchars + 3. Edinburgh + 4. **Glasgow** + 5. **Carlisle** + 6. York + 7. 
London
+
+Note that only paths that do not contain the same vertex twice are considered to
+be valid. The following alternative would visit Aberdeen twice and is **not**
+returned by the k Paths algorithm:
+
+1. Aberdeen
+2. **Inverness**
+3. **Aberdeen**
+4. Leuchars
+5. Edinburgh
+6. York
+7. London
+
+## Example Use Cases
+
+The use cases for k Paths are about the same as for unweighted k Shortest Paths.
+The main difference is that k Shortest Paths enumerates all paths with
+**increasing length**. It stops as soon as a given number of paths is reached.
+k Paths enumerates all paths within a given **range of path lengths** instead,
+and is thereby upper-bounded.
+
+The k Paths traversal can be used as a foundation for several other algorithms:
+
+- **Transportation** of any kind (e.g. road traffic, network packet routing)
+- **Flow problems**: You need to transfer items from A to B, which alternatives
+  do you have? What is their capacity?
+
+## Syntax
+
+The syntax for k Paths queries is similar to the one for
+[k Shortest Paths](k-shortest-paths.md) with the addition of defining the
+minimum and maximum length of the path.
+
+{{< warning >}}
+It is highly recommended that you use a reasonable maximum path length or a
+**LIMIT** statement, as k Paths is a potentially expensive operation. It can
+return a large number of paths for large connected graphs.
+{{< /warning >}}
+
+### Working with named graphs
+
+```aql
+FOR path
+  IN MIN..MAX OUTBOUND|INBOUND|ANY K_PATHS
+  startVertex TO targetVertex
+  GRAPH graphName
+```
+
+- `FOR`: Emits the variable **path** which contains one path as an object
+  containing `vertices` and `edges` of the path.
+- `IN` `MIN..MAX`: The minimal and maximal depth for the traversal:
+  - **min** (number, *optional*): Paths returned by this query
+    have at least a length of this many edges.
+    If not specified, it defaults to `1`. The minimal possible value is `0`.
+  - **max** (number, *optional*): Paths returned by this query
+    have at most a length of this many edges.
+    If omitted, it defaults to the value of `min`. Thus, only the vertices and
+    edges in the range of `min` are returned. You cannot specify `max` without `min`.
+- `OUTBOUND|INBOUND|ANY`: Defines in which direction
+  edges are followed (outgoing, incoming, or both).
+- `K_PATHS`: The keyword to compute all paths with the specified lengths.
+- **startVertex** `TO` **targetVertex** (both string\|object): The two vertices
+  between which the paths are computed. This can be specified in the form of
+  a document identifier string or in the form of an object with the `_id`
+  attribute. All other values lead to a warning and an empty result. This is
+  also the case if one of the specified documents does not exist.
+- `GRAPH` **graphName** (string): The name identifying the named graph.
+  Its vertex and edge collections are looked up for the path search.
+
+### Working with collection sets
+
+```aql
+FOR path
+  IN MIN..MAX OUTBOUND|INBOUND|ANY K_PATHS
+  startVertex TO targetVertex
+  edgeCollection1, ..., edgeCollectionN
+  [OPTIONS options]
+```
+
+Instead of `GRAPH graphName` you can specify a list of edge collections.
+The involved vertex collections are determined by the edges of the given
+edge collections.
+
+### Traversing in mixed directions
+
+For k Paths with a list of edge collections, you can optionally specify the
+direction for some of the edge collections.
+Say, for example, you have three edge
+collections *edges1*, *edges2* and *edges3*, where in *edges2* the direction
+has no relevance, but in *edges1* and *edges3* the direction should be taken
+into account. In this case you can use `OUTBOUND` as the general search direction
+and `ANY` specifically for *edges2* as follows:
+
+```aql
+FOR path IN OUTBOUND K_PATHS
+  startVertex TO targetVertex
+  edges1, ANY edges2, edges3
+```
+
+All collections in the list that do not specify their own direction use the
+direction defined after `IN` (here: `OUTBOUND`). This allows using a different
+direction for each collection in your path search.
+
+## Examples
+
+You can load the `kShortestPathsGraph` example graph to get a named graph that
+reflects some possible train connections in Europe and North America.
+
+![Train Connection Map](../../../../images/train_map.png)
+
+```js
+---
+name: GRAPHKP_01_create_graph
+description: ''
+---
+~addIgnoreCollection("places");
+~addIgnoreCollection("connections");
+var examples = require("@arangodb/graph-examples/example-graph");
+var graph = examples.loadGraph("kShortestPathsGraph");
+db.places.toArray();
+db.connections.toArray();
+```
+
+Suppose you want to query all routes from **Aberdeen** to **London**.
+
+```aql
+---
+name: GRAPHKP_01_Aberdeen_to_London
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR p IN 1..10 OUTBOUND K_PATHS 'places/Aberdeen' TO 'places/London'
+GRAPH 'kShortestPathsGraph'
+  RETURN { places: p.vertices[*].label, travelTimes: p.edges[*].travelTime }
+```
+
+If you ask for routes that don't exist, you get an empty result
+(from **Aberdeen** to **Toronto**):
+
+```aql
+---
+name: GRAPHKP_02_Aberdeen_to_Toronto
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR p IN 1..10 OUTBOUND K_PATHS 'places/Aberdeen' TO 'places/Toronto'
+GRAPH 'kShortestPathsGraph'
+  RETURN { places: p.vertices[*].label, travelTimes: p.edges[*].travelTime }
+```
+
+And finally clean up by removing the named graph:
+
+```js
+---
+name: GRAPHKP_99_drop_graph
+description: ''
+---
+var examples = require("@arangodb/graph-examples/example-graph");
+examples.dropGraph("kShortestPathsGraph");
+~removeIgnoreCollection("places");
+~removeIgnoreCollection("connections");
+```
diff --git a/site/content/arangodb/oem/aql/graphs/k-shortest-paths.md b/site/content/arangodb/oem/aql/graphs/k-shortest-paths.md
new file mode 100644
index 0000000000..917dba2516
--- /dev/null
+++ b/site/content/arangodb/oem/aql/graphs/k-shortest-paths.md
@@ -0,0 +1,308 @@
+---
+title: k Shortest Paths in AQL
+menuTitle: k Shortest Paths
+weight: 25
+description: >-
+  Find a number of shortest paths in the order of increasing path length or weight
+---
+## General query idea
+
+This type of query finds the first *k* paths in order of length
+(or weight) between two given documents (*startVertex* and *targetVertex*) in
+your graph.
+
+Every such path is returned as a JSON object with three components:
+
+- an array containing the `vertices` on the path
+- an array containing the `edges` on the path
+- the `weight` of the path, that is the sum of all edge weights
+
+If no `weightAttribute` is specified, the weight of the path is just its length.
+
+{{< youtube id="XdITulJFdVo" >}}
+
+**Example**
+
+Here is an example graph to explain how the k Shortest Paths algorithm works:
+
+![Train Connection Map](../../../../images/train_map.png)
+
+Each ellipse stands for a train station with the name of the city written inside
+of it. They are the vertices of the graph.
+
+{{< youtube id="XdITulJFdVo" >}}
+
+**Example**
+
+Here is an example graph to explain how the k Shortest Paths algorithm works:
+
+![Train Connection Map](../../../../images/train_map.png)
+
+Each ellipse stands for a train station with the name of the city written inside
+of it. They are the vertices of the graph. Arrows represent train connections
+between cities and are the edges of the graph. The numbers near the arrows
+describe how long it takes to get from one station to another. They are used
+as edge weights.
+
+Let us assume that you want to go from **Aberdeen** to **London** by train.
+
+You expect to see the following vertices on *the* shortest path, in this order:
+
+1. Aberdeen
+2. Leuchars
+3. Edinburgh
+4. York
+5. London
+
+By the way, the weight of the path is: 1.5 + 1.5 + 3.5 + 1.8 = **8.3**.
+
+Let us look at alternative paths next, for example because you know that the
+direct connection between York and London does not operate currently.
+An alternative path, which is slightly longer, goes like this:
+
+1. Aberdeen
+2. Leuchars
+3. Edinburgh
+4. York
+5. **Carlisle**
+6. **Birmingham**
+7. London
+
+Its weight is: 1.5 + 1.5 + 3.5 + 2.0 + 1.5 = **10.0**.
+
+Another route goes via Glasgow. There are seven stations on the path as well;
+however, it is quicker if you compare the edge weights:
+
+1. Aberdeen
+2. Leuchars
+3. Edinburgh
+4. **Glasgow**
+5. Carlisle
+6. Birmingham
+7. London
+
+The path weight is lower: 1.5 + 1.5 + 1.0 + 1.0 + 2.0 + 1.5 = **8.5**.
+
+## Syntax
+
+The syntax for k Shortest Paths queries is similar to the one for
+[Shortest Path](shortest-path.md) and there are also two options to
+either use a named graph or a set of edge collections. It only emits a path
+variable, however, whereas `SHORTEST_PATH` emits a vertex and an edge variable.
+
+{{< warning >}}
+It is highly recommended that you use a **LIMIT** statement, as
+k Shortest Paths is a potentially expensive operation. On large connected
+graphs it can return a large number of paths, or perform an expensive
+(but unsuccessful) search for more short paths.
+{{< /warning >}}
+
+### Working with named graphs
+
+```aql
+FOR path
+  IN OUTBOUND|INBOUND|ANY K_SHORTEST_PATHS
+  startVertex TO targetVertex
+  GRAPH graphName
+  [OPTIONS options]
+  [LIMIT offset, count]
+```
+
+- `FOR`: Emits the variable **path** which contains one path as an object containing
+  `vertices`, `edges`, and the `weight` of the path.
+- `IN` `OUTBOUND|INBOUND|ANY`: Defines in which direction
+  edges are followed (outgoing, incoming, or both).
+- `K_SHORTEST_PATHS`: The keyword to compute k Shortest Paths.
+- **startVertex** `TO` **targetVertex** (both string\|object): The two vertices between
+  which the paths are computed. This can be specified in the form of
+  an ID string or in the form of a document with the attribute `_id`. All other
+  values lead to a warning and an empty result. If one of the specified
+  documents does not exist, the result is empty as well and there is no warning.
+- `GRAPH` **graphName** (string): The name identifying the named graph. Its vertex and
+  edge collections are looked up by the path search.
+- `OPTIONS` **options** (object, *optional*):
+  See the [path search options](#path-search-options).
+- `LIMIT` (see [LIMIT operation](../high-level-operations/limit.md), *optional*):
+  The maximal number of paths to return. It is highly recommended to use
+  a `LIMIT` for `K_SHORTEST_PATHS`.
+
+{{< info >}}
+k Shortest Paths traversals do not support negative weights. If a document
+attribute (as specified by `weightAttribute`) with a negative value is
+encountered during traversal, or if `defaultWeight` is set to a negative
+number, then the query is aborted with an error.
+{{< /info >}}
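+
+The `LIMIT` clause accepts an optional offset. As a hedged sketch, the
+following skips the overall shortest path and returns only the next three
+alternatives, using the example graph from the Examples section below:
+
+```aql
+FOR p IN OUTBOUND K_SHORTEST_PATHS 'places/Aberdeen' TO 'places/London'
+  GRAPH 'kShortestPathsGraph'
+  LIMIT 1, 3
+  RETURN p.vertices[*].label
+```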
+
+### Working with collection sets
+
+```aql
+FOR path
+  IN OUTBOUND|INBOUND|ANY K_SHORTEST_PATHS
+  startVertex TO targetVertex
+  edgeCollection1, ..., edgeCollectionN
+  [OPTIONS options]
+  [LIMIT offset, count]
+```
+
+Instead of `GRAPH graphName` you can specify a list of edge collections.
+The involved vertex collections are determined by the edges of the given
+edge collections.
+
+### Path search options
+
+You can optionally specify the following options to modify the execution of a
+graph path search. If you specify unknown options, query warnings are raised.
+
+#### `weightAttribute`
+
+A top-level edge attribute that should be used to read the edge weight (string).
+
+If the attribute does not exist or is not numeric, the `defaultWeight` is used
+instead.
+
+The attribute value must not be negative.
+
+#### `defaultWeight`
+
+This value is used as a fallback if there is no `weightAttribute` in the
+edge document, or if it's not a number (number).
+
+The value must not be negative. The default is `1`.
+
+### Traversing in mixed directions
+
+For k shortest paths with a list of edge collections you can optionally specify the
+direction for some of the edge collections. Say, for example, you have three edge
+collections *edges1*, *edges2* and *edges3*, where in *edges2* the direction
+has no relevance, but in *edges1* and *edges3* the direction should be taken into
+account. In this case you can use `OUTBOUND` as general search direction and `ANY`
+specifically for *edges2* as follows:
+
+```aql
+FOR vertex IN OUTBOUND K_SHORTEST_PATHS
+  startVertex TO targetVertex
+  edges1, ANY edges2, edges3
+```
+
+All collections in the list that do not specify their own direction use the
+direction defined after `IN` (here: `OUTBOUND`). This allows using a different
+direction for each collection in your path search.
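+
+A hedged sketch of the two weight options in combination, using the example
+graph from the next section (the default weight of `15` is an illustrative
+penalty for edges without a `travelTime` attribute):
+
+```aql
+FOR p IN OUTBOUND K_SHORTEST_PATHS 'places/Aberdeen' TO 'places/London'
+  GRAPH 'kShortestPathsGraph'
+  OPTIONS { weightAttribute: 'travelTime', defaultWeight: 15 }
+  LIMIT 3
+  RETURN { route: p.vertices[*].label, weight: p.weight }
+```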
+
+## Examples
+
+You can load the `kShortestPathsGraph` example graph to get a named graph that
+reflects some possible train connections in Europe and North America.
+
+![Train Connection Map](../../../../images/train_map.png)
+
+```js
+---
+name: GRAPHKSP_01_create_graph
+description: ''
+---
+~addIgnoreCollection("places");
+~addIgnoreCollection("connections");
+var examples = require("@arangodb/graph-examples/example-graph");
+var graph = examples.loadGraph("kShortestPathsGraph");
+db.places.toArray();
+db.connections.toArray();
+```
+
+Suppose you want to query a route from **Aberdeen** to **London**, and
+compare the outputs of `SHORTEST_PATH` and `K_SHORTEST_PATHS` with
+`LIMIT 1`. Note that while `SHORTEST_PATH` and `K_SHORTEST_PATHS` with
+`LIMIT 1` should return a path of the same length (or weight), they do
+not need to return the same path.
+
+Using `SHORTEST_PATH`:
+
+```aql
+---
+name: GRAPHKSP_01_Aberdeen_to_London
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR v, e IN OUTBOUND SHORTEST_PATH 'places/Aberdeen' TO 'places/London'
+GRAPH 'kShortestPathsGraph'
+  RETURN { place: v.label, travelTime: e.travelTime }
+```
+
+Using `K_SHORTEST_PATHS`:
+
+```aql
+---
+name: GRAPHKSP_02_Aberdeen_to_London
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR p IN OUTBOUND K_SHORTEST_PATHS 'places/Aberdeen' TO 'places/London'
+GRAPH 'kShortestPathsGraph'
+  LIMIT 1
+  RETURN { places: p.vertices[*].label, travelTimes: p.edges[*].travelTime }
+```
+
+With `K_SHORTEST_PATHS`, you can ask for more than one option for a route:
+
+```aql
+---
+name: GRAPHKSP_03_Aberdeen_to_London
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR p IN OUTBOUND K_SHORTEST_PATHS 'places/Aberdeen' TO 'places/London'
+GRAPH 'kShortestPathsGraph'
+  LIMIT 3
+  RETURN {
+    places: p.vertices[*].label,
+    travelTimes: p.edges[*].travelTime,
+    travelTimeTotal: SUM(p.edges[*].travelTime)
+  }
+```
+
+If you ask for routes that don't exist, you get an empty result
+(from **Aberdeen** to **Toronto**):
+
+```aql
+---
+name: GRAPHKSP_04_Aberdeen_to_Toronto
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR p IN OUTBOUND K_SHORTEST_PATHS 'places/Aberdeen' TO 'places/Toronto'
+GRAPH 'kShortestPathsGraph'
+  LIMIT 3
+  RETURN {
+    places: p.vertices[*].label,
+    travelTimes: p.edges[*].travelTime,
+    travelTimeTotal: SUM(p.edges[*].travelTime)
+  }
+```
+
+You can use the `travelTime` attribute that connections have as edge weights to
+take into account which connections are quicker. A high default weight is set
+to be used if an edge has no `travelTime` attribute (not the case with the
+example graph). This returns the top three routes with the fewest changes,
+favoring the least travel time, for the connection **Saint Andrews**
+to **Cologne**:
+
+```aql
+---
+name: GRAPHKSP_05_StAndrews_to_Cologne
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR p IN OUTBOUND K_SHORTEST_PATHS 'places/StAndrews' TO 'places/Cologne'
+GRAPH 'kShortestPathsGraph'
+OPTIONS {
+  weightAttribute: 'travelTime',
+  defaultWeight: 15
+}
+  LIMIT 3
+  RETURN {
+    places: p.vertices[*].label,
+    travelTimes: p.edges[*].travelTime,
+    travelTimeTotal: SUM(p.edges[*].travelTime)
+  }
+```
+
+And finally clean up by removing the named graph:
+
+```js
+---
+name: GRAPHKSP_99_drop_graph
+description: ''
+---
+var examples = require("@arangodb/graph-examples/example-graph");
+examples.dropGraph("kShortestPathsGraph");
+~removeIgnoreCollection("places");
+~removeIgnoreCollection("connections");
+```
diff --git a/site/content/arangodb/oem/aql/graphs/shortest-path.md b/site/content/arangodb/oem/aql/graphs/shortest-path.md
new file mode 100644
index 0000000000..ed8540e777
--- /dev/null
+++ b/site/content/arangodb/oem/aql/graphs/shortest-path.md
@@ -0,0 +1,228 @@
+---
+title: Shortest Path in AQL
+menuTitle: Shortest Path
+weight: 15
+description: >-
+  Find one path of shortest length between two vertices
+---
+## General query idea
+
+This type of query finds the shortest path between two given documents
+(*startVertex* and *targetVertex*) in your graph. If there are multiple
+shortest paths, the path with the lowest weight or a random one (in case
+of a tie) is returned.
+
+The shortest path search emits the following two variables for every step of
+the path:
+
+1. The vertex on this path.
+2. The edge pointing to it.
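+
+A minimal sketch of how these two variables are typically consumed (the graph
+and vertex names are placeholders):
+
+```aql
+FOR v, e IN OUTBOUND SHORTEST_PATH 'vertices/start' TO 'vertices/target'
+  GRAPH 'myGraph'
+  RETURN { vertex: v._key, edge: e._key }
+```
+
+On the first step, `e` is `null`, as explained in the example execution below.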
+
+### Example execution
+
+Let's take a look at a simple example to explain how it works.
+This is the graph that you are going to find a shortest path on:
+
+![traversal graph](../../../../images/traversal_graph.png)
+
+You can use the following parameters for the query:
+
+1. You start at the vertex **A**.
+2. You finish with the vertex **D**.
+
+So, obviously, you have the vertices **A**, **B**, **C** and **D** on the
+shortest path in exactly this order. Then, the shortest path statement
+returns the following pairs:
+
+| Vertex | Edge  |
+|--------|-------|
+| A      | null  |
+| B      | A → B |
+| C      | B → C |
+| D      | C → D |
+
+Note that the first edge is always `null` because there is no edge pointing
+to the *startVertex*.
+
+## Syntax
+
+The next step is to see how you can write a shortest path query.
+You have two options here: you can either use a named graph or a set of edge
+collections (anonymous graph).
+
+### Working with named graphs
+
+```aql
+FOR vertex[, edge]
+  IN OUTBOUND|INBOUND|ANY SHORTEST_PATH
+  startVertex TO targetVertex
+  GRAPH graphName
+  [OPTIONS options]
+```
+
+- `FOR`: Emits up to two variables:
+  - **vertex** (object): The current vertex on the shortest path
+  - **edge** (object, *optional*): The edge pointing to the vertex
+- `IN` `OUTBOUND|INBOUND|ANY`: Defines in which direction edges are followed
+  (outgoing, incoming, or both)
+- **startVertex** `TO` **targetVertex** (both string\|object): The two vertices between
+  which the shortest path is computed. This can be specified in the form of
+  an ID string or in the form of a document with the attribute `_id`. All other
+  values lead to a warning and an empty result. If one of the specified
+  documents does not exist, the result is empty as well and there is no warning.
+- `GRAPH` **graphName** (string): The name identifying the named graph. Its vertex and
+  edge collections are looked up for the path search.
+- `OPTIONS` **options** (object, *optional*):
+  See the [path search options](#path-search-options).
+
+{{< info >}}
+Shortest Path traversals do not support negative weights. If a document
+attribute (as specified by `weightAttribute`) with a negative value is
+encountered during traversal, or if `defaultWeight` is set to a negative
+number, then the query is aborted with an error.
+{{< /info >}}
+
+### Working with collection sets
+
+```aql
+FOR vertex[, edge]
+  IN OUTBOUND|INBOUND|ANY SHORTEST_PATH
+  startVertex TO targetVertex
+  edgeCollection1, ..., edgeCollectionN
+  [OPTIONS options]
+```
+
+Instead of `GRAPH graphName` you may specify a list of edge collections (anonymous
+graph). The involved vertex collections are determined by the edges of the given
+edge collections. The rest of the behavior is similar to the named version.
+
+### Path search options
+
+You can optionally specify the following options to modify the execution of a
+graph path search. If you specify unknown options, query warnings are raised.
+
+#### `weightAttribute`
+
+A top-level edge attribute that should be used to read the edge weight (string).
+
+If the attribute does not exist or is not numeric, the `defaultWeight` is used
+instead.
+
+The attribute value must not be negative.
+
+#### `defaultWeight`
+
+This value is used as a fallback if there is no `weightAttribute` in the
+edge document, or if it's not a number (number).
+
+The value must not be negative. The default is `1`.
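+
+A hedged sketch of a weighted shortest path search, assuming edges carry a
+numeric `travelTime` attribute as in the train connection graph of the
+[k Shortest Paths](k-shortest-paths.md) page:
+
+```aql
+FOR v, e IN OUTBOUND SHORTEST_PATH 'places/Aberdeen' TO 'places/London'
+  GRAPH 'kShortestPathsGraph'
+  OPTIONS { weightAttribute: 'travelTime', defaultWeight: 15 }
+  RETURN v.label
+```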
+
+### Traversing in mixed directions
+
+For shortest path with a list of edge collections you can optionally specify the
+direction for some of the edge collections. Say, for example, you have three edge
+collections *edges1*, *edges2* and *edges3*, where in *edges2* the direction
+has no relevance, but in *edges1* and *edges3* the direction should be taken into
+account. In this case you can use `OUTBOUND` as general search direction and `ANY`
+specifically for *edges2* as follows:
+
+```aql
+FOR vertex IN OUTBOUND SHORTEST_PATH
+  startVertex TO targetVertex
+  edges1, ANY edges2, edges3
+```
+
+All collections in the list that do not specify their own direction use the
+direction defined after `IN` (here: `OUTBOUND`). This allows using a different
+direction for each collection in your path search.
+
+## Conditional shortest path
+
+The `SHORTEST_PATH` computation only finds an unconditioned shortest path.
+With this construct it is not possible to define a condition like: "Find the
+shortest path where all edges are of type *X*". If you want to do this, use a
+normal [Traversal](traversals.md) instead with the option
+`{order: "bfs"}` in combination with `LIMIT 1`, as sketched below.
+
+Please also consider using [`WITH`](../high-level-operations/with.md) to specify the
+collections you expect to be involved.
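+
+A hedged sketch of this approach, assuming edges carry a `type` attribute and
+using placeholder names and an illustrative depth cap of `10`. Breadth-first
+order enumerates paths by increasing length, so the first match is a shortest
+qualifying path:
+
+```aql
+FOR v, e, p IN 1..10 OUTBOUND 'vertices/start' GRAPH 'myGraph'
+  OPTIONS { order: "bfs" }
+  FILTER v._id == 'vertices/target'
+  FILTER p.edges[*].type ALL == "X"
+  LIMIT 1
+  RETURN p.vertices[*]._key
+```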
+
+## Examples
+
+Creating a simple symmetric traversal demonstration graph:
+
+![traversal graph](../../../../images/traversal_graph.png)
+
+```js
+---
+name: GRAPHSP_01_create_graph
+description: ''
+---
+~addIgnoreCollection("circles");
+~addIgnoreCollection("edges");
+var examples = require("@arangodb/graph-examples/example-graph");
+var graph = examples.loadGraph("traversalGraph");
+db.circles.toArray();
+db.edges.toArray();
+```
+
+Start with the shortest path from **A** to **D** as above:
+
+```js
+---
+name: GRAPHSP_02_A_to_D
+description: ''
+---
+db._query(`
+  FOR v, e IN OUTBOUND SHORTEST_PATH 'circles/A' TO 'circles/D' GRAPH 'traversalGraph'
+    RETURN [v._key, e._key]
+`);
+
+db._query(`
+  FOR v, e IN OUTBOUND SHORTEST_PATH 'circles/A' TO 'circles/D' edges
+    RETURN [v._key, e._key]
+`);
+```
+
+You can see that the expectations are fulfilled. You find the vertices in the
+correct ordering and the first edge is `null`, because no edge is pointing
+to the start vertex on this path.
+
+You can also compute shortest paths based on documents found in collections:
+
+```js
+---
+name: GRAPHSP_03_A_to_D
+description: ''
+---
+db._query(`
+  FOR a IN circles
+    FILTER a._key == 'A'
+    FOR d IN circles
+      FILTER d._key == 'D'
+      FOR v, e IN OUTBOUND SHORTEST_PATH a TO d GRAPH 'traversalGraph'
+        RETURN [v._key, e._key]
+`);
+
+db._query(`
+  FOR a IN circles
+    FILTER a._key == 'A'
+    FOR d IN circles
+      FILTER d._key == 'D'
+      FOR v, e IN OUTBOUND SHORTEST_PATH a TO d edges
+        RETURN [v._key, e._key]
+`);
+```
+
+And finally clean it up again:
+
+```js
+---
+name: GRAPHSP_99_drop_graph
+description: ''
+---
+var examples = require("@arangodb/graph-examples/example-graph");
+examples.dropGraph("traversalGraph");
+~removeIgnoreCollection("circles");
+~removeIgnoreCollection("edges");
+```
diff --git a/site/content/arangodb/oem/aql/graphs/traversals-explained.md b/site/content/arangodb/oem/aql/graphs/traversals-explained.md
new file mode 100644
index 0000000000..b4e9741151
--- /dev/null
+++ b/site/content/arangodb/oem/aql/graphs/traversals-explained.md
@@ -0,0 +1,85 @@
+---
+title: AQL graph traversals explained
+menuTitle: Traversals explained
+weight: 5
+description: >-
+  Traversing a graph means to follow edges connected to a start vertex and
+  neighboring vertices until a specified depth
+---
+## General query idea
+
+A traversal starts at one specific document (*startVertex*) and follows all
+edges connected to this document. For all documents (*vertices*) that are
+targeted by these edges it will again follow all edges connected to them and
+so on. It is possible to define how many of these follow iterations should be
+executed at least (*min* depth) and at most (*max* depth).
+
+For all vertices that were visited during this process in the range between
+*min* depth and *max* depth iterations you will get a result in the form of a
+set with three items:
+
+1. The visited vertex.
+2. The edge pointing to it.
+3. The complete path from the *startVertex* to the visited vertex as an object
+   with an attribute *edges* and an attribute *vertices*, each a list of the
+   corresponding elements. These lists are sorted, which means the first element
+   in *vertices* is the *startVertex* and the last is the visited vertex, and the
+   n-th element in *edges* connects the n-th element with the (n+1)-th element
+   in *vertices*.
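+
+A minimal sketch of a traversal that returns all three items per visited
+vertex; the exact syntax is covered in [Graph traversals in AQL](traversals.md),
+and the graph and vertex names are placeholders:
+
+```aql
+FOR v, e, p IN 1..2 OUTBOUND 'vertices/startVertex' GRAPH 'myGraph'
+  RETURN { vertex: v, edge: e, path: p }
+```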
+
+## Example execution
+
+Let's take a look at a simple example to explain how it works.
+This is the graph that we are going to traverse:
+
+![traversal graph](../../../../images/traversal_graph.png)
+
+We use the following parameters for our query:
+
+1. We start at the vertex **A**.
+2. We use a *min* depth of 1.
+3. We use a *max* depth of 2.
+4. We follow edges only in the `OUTBOUND` direction.
+
+![traversal graph step 1](../../../../images/traversal_graph1.png)
+
+Now it walks to one of the direct neighbors of **A**, say **B** (note: ordering
+is not guaranteed!):
+
+![traversal graph step 2](../../../../images/traversal_graph2.png)
+
+The query will remember the state (red circle) and will emit the first result
+**A** → **B** (black box). This also prevents the traverser from being trapped
+in cycles. Now again it will visit one of the direct neighbors of **B**, say **E**:
+
+![traversal graph step 3](../../../../images/traversal_graph3.png)
+
+We have limited the query with a *max* depth of *2*, so it will not pick any
+neighbor of **E**, as the path from **A** to **E** already requires *2* steps.
+Instead, we will go back one level to **B** and continue with any other direct
+neighbor there:
+
+![traversal graph step 4](../../../../images/traversal_graph4.png)
+
+Again after we produced this result we will step back to **B**.
+But there is no neighbor of **B** left that we have not yet visited.
+Hence we go another step back to **A** and continue with any other neighbor there.
+
+![traversal graph step 5](../../../../images/traversal_graph5.png)
+
+Identically to the iterations before, we will visit **H**:
+
+![traversal graph step 6](../../../../images/traversal_graph6.png)
+
+And **J**:
+
+![traversal graph step 7](../../../../images/traversal_graph7.png)
+
+After these steps there is no further result left. So altogether this query
+has returned the following paths:
+
+1. **A** → **B**
+2. **A** → **B** → **E**
+3. **A** → **B** → **C**
+4. **A** → **G**
+5. **A** → **G** → **H**
+6. **A** → **G** → **J**
diff --git a/site/content/arangodb/oem/aql/graphs/traversals.md b/site/content/arangodb/oem/aql/graphs/traversals.md
new file mode 100644
index 0000000000..657fbf0917
--- /dev/null
+++ b/site/content/arangodb/oem/aql/graphs/traversals.md
@@ -0,0 +1,890 @@
+---
+title: Graph traversals in AQL
+menuTitle: Traversals
+weight: 10
+description: >-
+  You can traverse named graphs and anonymous graphs with a native AQL
+  language construct
+---
+## Syntax
+
+There are two slightly different syntaxes for traversals in AQL, one for
+[named graphs](../../graphs/_index.md#named-graphs) and another to specify a
+[set of edge collections](#working-with-collection-sets)
+([anonymous graph](../../graphs/_index.md#anonymous-graphs)).
+
+### Working with named graphs
+
+The syntax for AQL graph traversals using named graphs is as follows
+(square brackets denote optional parts and `|` denotes alternatives):
+
+```aql
+FOR vertex[, edge[, path]]
+  IN [min[..max]]
+  OUTBOUND|INBOUND|ANY startVertex
+  GRAPH graphName
+  [PRUNE [pruneVariable = ]pruneCondition]
+  [OPTIONS options]
+```
+
+- `FOR`: emits up to three variables:
+  - **vertex** (object): the current vertex in a traversal
+  - **edge** (object, *optional*): the current edge in a traversal
+  - **path** (object, *optional*): representation of the current path with
+    two members:
+    - `vertices`: an array of all vertices on this path
+    - `edges`: an array of all edges on this path
+- `IN` `min..max`: the minimal and maximal depth for the traversal:
+  - **min** (number, *optional*): edges and vertices returned by this query
+    start at the traversal depth of *min* (thus edges and vertices below it are
+    not returned). If not specified, it defaults to 1. The minimal
+    possible value is 0.
+  - **max** (number, *optional*): up to *max* length paths are traversed.
+    If omitted, *max* defaults to *min*. Thus only the vertices and edges in
+    the range of *min* are returned. *max* cannot be specified without *min*.
+- `OUTBOUND|INBOUND|ANY`: follow outgoing, incoming, or edges pointing in either
+  direction in the traversal. Note that this can't be replaced by a bind parameter.
+- **startVertex** (string\|object): a vertex where the traversal originates from.
+  This can be specified in the form of an ID string or in the form of a document
+  with the `_id` attribute. All other values lead to a warning and an empty
+  result. If the specified document does not exist, the result is empty as well
+  and there is no warning.
+- `GRAPH` **graphName** (string): the name identifying the named graph.
+  Its vertex and edge collections are looked up.
+  Note that the graph name is like a regular string, hence it must be enclosed
+  by quote marks, like `GRAPH "graphName"`.
+- `PRUNE` **expression** (AQL expression, *optional*):
+  An expression, like in a `FILTER` statement, which is evaluated in every step of
+  the traversal, as early as possible. The semantics of this expression are as follows:
+  - If the expression evaluates to `false`, the traversal continues on the current path.
+  - If the expression evaluates to `true`, the traversal does not continue on the
+    current path. However, the paths up to this point are considered as a result
+    (they might still be post-filtered or ignored due to depth constraints).
+    For example, a traversal over the graph `(A) -> (B) -> (C)` starting at `A`
+    and pruning on `B` results in `(A)` and `(A) -> (B)` being valid paths,
+    whereas `(A) -> (B) -> (C)` is not returned because it gets pruned on `B`.
+
+  You can only use a single `PRUNE` clause per `FOR` traversal operation, but
+  the prune expression can contain an arbitrary number of conditions using `AND`
+  and `OR` statements for complex expressions. You can use the variables emitted
+  by the `FOR` operation in the prune expression, as well as all variables
+  defined before the traversal.
+
+  You can optionally assign the prune expression to a variable like
+  `PRUNE pruneVariable = pruneCondition` to use the evaluated result elsewhere
+  in the query, typically in a `FILTER` expression.
+
+  See [Pruning](#pruning) for details.
+- `OPTIONS` **options** (object, *optional*): See the [traversal options](#traversal-options).
+
+### Working with collection sets
+
+The syntax for AQL graph traversals using collection sets is as follows
+(square brackets denote optional parts and `|` denotes alternatives):
+
+```aql
+[WITH vertexCollection1[, vertexCollection2[, vertexCollectionN]]]
+FOR vertex[, edge[, path]]
+  IN [min[..max]]
+  OUTBOUND|INBOUND|ANY startVertex
+  edgeCollection1[, edgeCollection2[, edgeCollectionN]]
+  [PRUNE [pruneVariable = ]pruneCondition]
+  [OPTIONS options]
+```
+
+- `WITH`: Declaration of collections. Optional for single server instances, but
+  required for [graph traversals in a cluster](#graph-traversals-in-a-cluster).
+  Needs to be placed at the very beginning of the query.
+  - **collections** (collection, *repeatable*): list of vertex collections that
+    are involved in the traversal
+- **edgeCollections** (collection, *repeatable*): One or more edge collections
+  to use for the traversal (instead of using a named graph with `GRAPH graphName`).
+  Vertex collections are determined by the edges in the edge collections.
+
+  You can override the default traversal direction by setting `OUTBOUND`,
+  `INBOUND`, or `ANY` before any of the edge collections.
+
+  If the same edge collection is specified multiple times, it behaves as if it
+  were specified only once. Specifying the same edge collection is only allowed
+  when the collections do not have conflicting traversal directions.
+
+  Views cannot be used as edge collections.
+- See the [named graph variant](#working-with-named-graphs) for the remaining
+  traversal parameters as well as the [traversal options](#traversal-options).
+  The `edgeCollections` restriction option is redundant in this case.
+
+### Traversal options
+
+You can optionally specify the following options to modify the execution of a
+graph traversal. If you specify unknown options, query warnings are raised.
+
+#### `order`
+
+Specify which traversal algorithm to use (string):
+- `"bfs"` – the traversal is executed breadth-first. The results
+  first contain all vertices at depth 1, then all vertices at depth 2 and so on.
+- `"dfs"` (default) – the traversal is executed depth-first. It
+  first returns all paths from *min* depth to *max* depth for one vertex at
+  depth 1, then for the next vertex at depth 1 and so on.
+- `"weighted"` – the traversal is a weighted traversal
+  (introduced in v3.8.0). Paths are enumerated with increasing cost.
+  Also see `weightAttribute` and `defaultWeight`. A returned path has an
+  additional attribute `weight` containing the cost of the path after every
+  step. The order of paths having the same cost is non-deterministic.
+  Negative weights are not supported and abort the query with an error.
+
+#### `bfs`
+
+Deprecated, use `order: "bfs"` instead.
+
+#### `uniqueVertices`
+
+Ensure vertex uniqueness (string):
+
+- `"path"` – it is guaranteed that there is no path returned with a duplicate vertex
+- `"global"` – it is guaranteed that each vertex is visited at most once during
+  the traversal, no matter how many paths lead from the start vertex to this one.
+  If you start with a `min depth > 1`, a vertex that was found before *min* depth
+  might not be returned at all (it still might be part of a path).
+  It is required to set `order: "bfs"` or `order: "weighted"` because with
+  depth-first search the results would be unpredictable. **Note:**
+  Using this configuration, the result is not deterministic anymore. If there
+  are multiple paths from *startVertex* to *vertex*, one of those is picked.
+  In case of a `weighted` traversal, the path with the lowest weight is
+  picked, but in case of equal weights it is undefined which one is chosen.
+- `"none"` (default) – no uniqueness check is applied on vertices
+
+#### `uniqueEdges`

+Ensure edge uniqueness (string):
+
+- `"path"` (default) – it is guaranteed that there is no path returned with a
+  duplicate edge
+- `"none"` – no uniqueness check is applied on edges. **Note:**
+  Using this configuration, the traversal follows edges in cycles.
+
+#### `edgeCollections`
+
+Restrict edge collections the traversal may visit (string\|array).
+
+If omitted or an empty array is specified, then there are no restrictions.
+
+- A string parameter is treated as the equivalent of an array with a single
+  element.
+- Each element of the array should be a string containing the name of an
+  edge collection.
+
+#### `vertexCollections`
+
+Restrict vertex collections the traversal may visit (string\|array).
+
+If omitted or an empty array is specified, then there are no restrictions.
+
+- A string parameter is treated as the equivalent of an array with a single
+  element.
+- Each element of the array should be a string containing the name of a
+  vertex collection.
+- The starting vertex is always allowed, even if it does not belong to one
+  of the collections specified by a restriction.
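+
+A hedged sketch combining several of these options (the collection, graph, and
+vertex names are placeholders):
+
+```aql
+FOR v IN 1..3 OUTBOUND 'users/startVertex' GRAPH 'myGraph'
+  OPTIONS {
+    order: "bfs",
+    uniqueVertices: "global",
+    vertexCollections: ["users"]
+  }
+  RETURN v._key
+```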
+
+#### `parallelism`
+
+{{< tag "ArangoDB Enterprise Edition" "AMP" >}}
+
+Parallelize traversal execution (number).
+
+If omitted or set to a value of `1`, the traversal execution is not parallelized.
+If set to a value greater than `1`, then up to that many worker threads can be
+used for concurrently executing the traversal. The value is capped by the number
+of available cores on the target machine.
+
+Parallelizing a traversal is normally useful when there are many inputs (start
+vertices) that the nested traversal can work on concurrently. This is often the
+case when a nested traversal is fed with several tens of thousands of start
+vertices, which can then be distributed randomly to worker threads for parallel
+execution.
+
+#### `maxProjections`
+
+{{< tag "ArangoDB Enterprise Edition" "AMP" >}}
+
+Specifies the number of document attributes per `FOR` loop to be used as
+projections (number). The default value is `5`.
+
+#### `weightAttribute`
+
+Specifies the name of an attribute that is used to look up the weight of an edge
+(string).
+
+If no attribute is specified or if it is not present in the edge document then
+the `defaultWeight` is used.
+
+The attribute value must not be negative.
+
+{{< info >}}
+Weighted traversals do not support negative weights. If a document
+attribute (as specified by `weightAttribute`) with a negative value is
+encountered during traversal, the query is aborted with an error.
+{{< /info >}}
+
+#### `defaultWeight`
+
+Specifies the default weight of an edge (number). The default value is `1`.
+
+The value must not be negative.
+
+{{< info >}}
+Weighted traversals do not support negative weights. If `defaultWeight` is set
+to a negative number, then the query is aborted with an error.
+{{< /info >}}
+
+### Traversing in mixed directions
+
+For traversals with a list of edge collections you can optionally specify the
+direction for some of the edge collections. Say, for example, you have three edge
+collections *edges1*, *edges2* and *edges3*, where in *edges2* the direction has
+no relevance but in *edges1* and *edges3* the direction should be taken into account.
+In this case you can use `OUTBOUND` as general traversal direction and `ANY`
+specifically for *edges2* as follows:
+
+```aql
+FOR vertex IN OUTBOUND
+  startVertex
+  edges1, ANY edges2, edges3
+```
+
+All collections in the list that do not specify their own direction use the
+direction defined after `IN`. This allows using a different direction for each
+collection in your traversal.
+
+### Graph traversals in a cluster
+
+Due to the nature of graphs, edges may reference vertices from arbitrary
+collections. Following the paths can thus involve documents from various
+collections and it is not possible to predict which are visited in a
+traversal. Which collections need to be loaded by the graph engine can only be
+determined at run time.
+
+Use the [`WITH` statement](../high-level-operations/with.md) to specify the collections you
+expect to be involved. This is required for traversals using collection sets
+in cluster deployments.
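+
+A minimal sketch, using the demonstration graph from the examples further below
+and declaring the vertex collection up front:
+
+```aql
+WITH circles
+FOR v IN 1..3 OUTBOUND 'circles/A' edges
+  RETURN v._key
+```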
+
+## Pruning
+
+You can define stop conditions for graph traversals to return specific data and
+to improve the query performance. This is called _pruning_ and works by checking
+conditions during the traversal as opposed to filtering the results afterwards
+(post-filtering). This reduces the amount of data to be checked by stopping the
+traversal down specific paths early.
+
+{{< youtube id="4LVeeC0ciCQ" >}}
+
+You can specify one `PRUNE` expression per graph traversal, but it can contain
+an arbitrary number of conditions. You can use the vertex, edge, and path
+variables emitted by the traversal in a prune expression, as well as all other
+variables defined before the `FOR` operation. Note that `PRUNE` is an optional
+clause of the `FOR` operation and that the `OPTIONS` clause needs to be placed
+after `PRUNE`.
+
+```aql
+---
+name: GRAPHTRAV_graphPruneExample1
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR v, e, p IN 0..10 OUTBOUND "places/Toronto" GRAPH "kShortestPathsGraph"
+  PRUNE v.label == "Edmonton"
+  OPTIONS { uniqueVertices: "path" }
+  RETURN CONCAT_SEPARATOR(" -- ", p.vertices[*].label)
+```
+
+The above example shows a graph traversal using a
+[train station and connections dataset](../../graphs/example-graphs.md#k-shortest-paths-graph):
+
+![Train Connection Map](../../../../images/train_map.png)
+
+The traversal starts at **Toronto** (bottom left), the traversal depth is
+limited to 10, and every station is only visited once. The traversal could
+continue up to **Vancouver** (bottom right) at depth 5, but it is stopped early
+on this path (the only path in this example) at **Edmonton** because of the
+prune expression.
+
+The traversal along paths is stopped as soon as the prune expression evaluates
+to `true` for a given path. The current depth is still included in the result,
+however. This can be seen in the query result of the example which includes the
+Edmonton vertex at which it stopped.
+
+The following example starts a traversal at **London** (middle right), with a
+depth between 2 and 3, and every station is only visited once. The station names
+as well as the travel times are returned:
+
+```aql
+---
+name: GRAPHTRAV_graphPruneExample2
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR v, e, p IN 2..3 OUTBOUND "places/London" GRAPH "kShortestPathsGraph"
+  OPTIONS { uniqueVertices: "path" }
+  RETURN CONCAT_SEPARATOR(" -- ", INTERLEAVE(p.vertices[*].label, p.edges[*].travelTime))
+```
+
+The same example with an added prune expression, with vertex and edge conditions:
+
+```aql
+---
+name: GRAPHTRAV_graphPruneExample3
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR v, e, p IN 2..3 OUTBOUND "places/London" GRAPH "kShortestPathsGraph"
+  PRUNE v.label == "Carlisle" OR e.travelTime > 3
+  OPTIONS { uniqueVertices: "path" }
+  RETURN CONCAT_SEPARATOR(" -- ", INTERLEAVE(p.vertices[*].label, p.edges[*].travelTime))
+```
+
+If either the **Carlisle** vertex or an edge with a travel time of over three
+hours is encountered, the subsequent paths are pruned. In the example, this
+removes the train connections to **Birmingham**, **Glasgow**, and **York**,
+which come after **Carlisle**, as well as the connections to and via
+**Edinburgh** because of the four hour duration for the section from **York**
+to **Edinburgh**.
+
+If your graph consists of multiple vertex or edge collections, you can
+also prune as soon as you reach a certain collection, using a condition like
+`PRUNE IS_SAME_COLLECTION("stopCollection", v)`.
+
+If you want to only return the results of the depth at which the traversal
+stopped due to the prune expression, you can use a `FILTER` in addition. You can
+assign the evaluated result of a prune expression to a variable
+(`PRUNE var = condition`) and use it for filtering:
+
+```aql
+---
+name: GRAPHTRAV_graphPruneExample4
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR v, e, p IN 2..3 OUTBOUND "places/London" GRAPH "kShortestPathsGraph"
+  PRUNE cond = v.label == "Carlisle" OR e.travelTime > 3
+  OPTIONS { uniqueVertices: "path" }
+  FILTER cond
+  RETURN CONCAT_SEPARATOR(" -- ", INTERLEAVE(p.vertices[*].label, p.edges[*].travelTime))
+```
+
+Only paths that end at **Carlisle** or with the last edge having a travel time
+of over three hours are returned.
+This excludes the connection to **Cologne**
+from the results compared to the previous query.
+
+If you want to exclude the depth at which the prune expression stopped the
+traversal, you can assign the expression to a variable and use its negated value
+in a `FILTER`:
+
+```aql
+---
+name: GRAPHTRAV_graphPruneExample5
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR v, e, p IN 2..3 OUTBOUND "places/London" GRAPH "kShortestPathsGraph"
+  PRUNE cond = v.label == "Carlisle" OR e.travelTime > 3
+  OPTIONS { uniqueVertices: "path" }
+  FILTER NOT cond
+  RETURN CONCAT_SEPARATOR(" -- ", INTERLEAVE(p.vertices[*].label, p.edges[*].travelTime))
+```
+
+This only returns the connection to **Cologne**, which is the opposite of the
+previous example.
+
+You may combine the prune variable with arbitrary other conditions in a `FILTER`
+operation. For example, you can remove results where the last edge has a lower
+travel time than the second to last edge of the path:
+
+```aql
+---
+name: GRAPHTRAV_graphPruneExample6
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR v, e, p IN 2..5 OUTBOUND "places/London" GRAPH "kShortestPathsGraph"
+  PRUNE cond = v.label == "Carlisle" OR e.travelTime > 3
+  OPTIONS { uniqueVertices: "path" }
+  FILTER cond AND p.edges[-1].travelTime >= p.edges[-2].travelTime
+  RETURN CONCAT_SEPARATOR(" -- ", INTERLEAVE(p.vertices[*].label, p.edges[*].travelTime))
+```
+
+{{< info >}}
+The prune expression is **evaluated at every step of the traversal**. This
+includes any traversal depths below the specified minimum depth, despite not
+becoming part of the result. It also includes depth 0, which is the start vertex
+and a `null` edge.
+
+If you add prune conditions using the edge variable, make sure to account for
+the edge at depth 0 being `null`, as it may accidentally stop the traversal
+immediately. This may not be apparent due to the depth constraints.
+{{< /info >}}
+
+The following example shows a graph traversal starting at **London**, with a
+traversal depth between 2 and 3, and every station is only visited once:
+
+```aql
+---
+name: GRAPHTRAV_graphPruneExample7
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR v, e, p IN 2..3 OUTBOUND "places/London" GRAPH "kShortestPathsGraph"
+  OPTIONS { uniqueVertices: "path" }
+  RETURN CONCAT_SEPARATOR(" -- ", INTERLEAVE(p.vertices[*].label, p.edges[*].travelTime))
+```
+
+If you add prune conditions to stop the traversal if the station is **Glasgow**
+or the travel time is less than some number, no results are returned. This is
+even the case for a value of `2.5`, for which two paths exist that fulfill the
+criterion – to **Cologne** and **Carlisle**:
+
+```aql
+---
+name: GRAPHTRAV_graphPruneExample8
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR v,e,p IN 2..3 OUTBOUND "places/London" GRAPH "kShortestPathsGraph"
+  PRUNE v.label == "Glasgow" OR e.travelTime < 2.5
+  OPTIONS { uniqueVertices: "path" }
+  RETURN CONCAT_SEPARATOR(" -- ", INTERLEAVE(p.vertices[*].label, p.edges[*].travelTime))
+```
+
+The problem is that `null`, `false`, and `true` are all less than any number (`< 2.5`)
+because of AQL's [Type and value order](../fundamentals/type-and-value-order.md), and
+because the edge at depth 0 is always `null`. The prune condition is accidentally
+fulfilled at the start vertex, stopping the traversal too early. This similarly
+happens if you check an edge attribute for inequality (`!=`) and compare it to a
+string, for instance, which evaluates to `true` for the `null` value.
+
+The depth at which a traversal is stopped by pruning is considered as a result,
+but in the above example, the minimum depth of `2` filters the start vertex out.
+If you lower the minimum depth to `0`, you get **London** as the sole result.
+This confirms that the traversal stopped at the start vertex.
+
+To avoid this problem, exclude the `null` value. For example, you can use
+`e.travelTime > 0 AND e.travelTime < 2.5`, but more generic solutions are to
+exclude depth 0 from the check (`LENGTH(p.edges) > 0`) or to simply ignore the
+`null` edge (`e != null`):
+
+```aql
+---
+name: GRAPHTRAV_graphPruneExample9
+description: ''
+dataset: kShortestPathsGraph
+---
+FOR v,e,p IN 2..3 OUTBOUND "places/London" GRAPH "kShortestPathsGraph"
+  PRUNE v.label == "Glasgow" OR (e != null AND e.travelTime < 2.5)
+  OPTIONS { uniqueVertices: "path" }
+  RETURN CONCAT_SEPARATOR(" -- ", INTERLEAVE(p.vertices[*].label, p.edges[*].travelTime))
+```
+
+{{< warning >}}
+You can use AQL functions in prune expressions but only those that can be
+executed on DB-Servers, regardless of your deployment mode. The following
+functions cannot be used in the expression:
+- `CALL()`
+- `APPLY()`
+- `DOCUMENT()`
+- `V8()`
+- `SCHEMA_GET()`
+- `SCHEMA_VALIDATE()`
+- `VERSION()`
+- `COLLECTIONS()`
+- `CURRENT_USER()`
+- `CURRENT_DATABASE()`
+- `COLLECTION_COUNT()`
+- `NEAR()`
+- `WITHIN()`
+- `WITHIN_RECTANGLE()`
+- `FULLTEXT()`
+- [User-defined functions (UDFs)](../user-defined-functions.md)
+{{< /warning >}}
+
+## Using filters
+
+All three variables emitted by the traversals can also be used in filter
+statements. For some of these filter statements, the optimizer can detect that it
+is possible to prune paths of traversals earlier, hence filtered results are
+not emitted to the variables in the first place. This may significantly
+improve the performance of your query. Whenever a filter is not fulfilled,
+the complete set of `vertex`, `edge` and `path` is skipped. All paths
+with a length greater than the `max` depth are never computed.
+
+Filter conditions that are `AND`-combined can be optimized, but `OR`-combined
+conditions cannot.
+
+### Filtering on paths
+
+Filtering on paths allows for the second most powerful filtering and may have the
+second highest impact on performance. Using the path variable you can filter on
+specific iteration depths. You can filter for absolute positions in the path
+by specifying a positive number (which then qualifies for the optimizations),
+or relative positions to the end of the path by specifying a negative number.
+
+#### Filtering edges on the path
+
+This example traversal filters all paths where the start edge (index 0) has the
+attribute `theTruth` equal to `true`.
+The resulting paths are up to 5 items long:
+
+```aql
+---
+name: GRAPHTRAV_graphFilterEdges
+description: ''
+dataset: traversalGraph
+---
+FOR v, e, p IN 1..5 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  FILTER p.edges[0].theTruth == true
+  RETURN { vertices: p.vertices[*]._key, edges: p.edges[*].label }
+```
+
+#### Filtering vertices on the path
+
+Similar to filtering the edges on the path, you can also filter the vertices:
+
+```aql
+---
+name: GRAPHTRAV_graphFilterVertices
+description: ''
+dataset: traversalGraph
+---
+FOR v, e, p IN 1..5 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  FILTER p.vertices[1]._key == "G"
+  RETURN { vertices: p.vertices[*]._key, edges: p.edges[*].label }
+```
+
+#### Combining several filters
+
+You can combine filters in any way you like:
+
+```aql
+---
+name: GRAPHTRAV_graphFilterCombine
+description: ''
+dataset: traversalGraph
+---
+FOR v, e, p IN 1..5 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  FILTER p.edges[0].theTruth == true
+     AND p.edges[1].theFalse == false
+  FILTER p.vertices[1]._key == "G"
+  RETURN { vertices: p.vertices[*]._key, edges: p.edges[*].label }
+```
+
+The query filters all paths where the first edge has the attribute
+`theTruth` equal to `true`, the first vertex is `"G"` and the second edge has
+the attribute `theFalse` equal to `false`. The resulting paths are up to
+5 items long.
+
+**Note**: Despite the `min` depth of 1, this only returns results of
+depth 2. This is because for all results in depth 1, the second edge does not
+exist and hence cannot fulfill the condition here.
+
+#### Filter on the entire path
+
+With the help of array comparison operators, filters can also be defined
+on the entire path, like `ALL` edges should have `theTruth == true`:
+
+```aql
+---
+name: GRAPHTRAV_graphFilterEntirePath
+description: ''
+dataset: traversalGraph
+---
+FOR v, e, p IN 1..5 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  FILTER p.edges[*].theTruth ALL == true
+  RETURN { vertices: p.vertices[*]._key, edges: p.edges[*].label }
+```
+
+Or `NONE` of the edges should have `theTruth == true`:
+
+```aql
+---
+name: GRAPHTRAV_graphFilterPathEdges
+description: ''
+dataset: traversalGraph
+---
+FOR v, e, p IN 1..5 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  FILTER p.edges[*].theTruth NONE == true
+  RETURN { vertices: p.vertices[*]._key, edges: p.edges[*].label }
+```
+
+Both examples above are recognized by the optimizer and can potentially use
+indexes other than the edge index.
+
+It is also possible to define that at least one edge on the path has to fulfill
+the condition:
+
+```aql
+---
+name: GRAPHTRAV_graphFilterPathAnyEdge
+description: ''
+dataset: traversalGraph
+---
+FOR v, e, p IN 1..5 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  FILTER p.edges[*].theTruth ANY == true
+  RETURN { vertices: p.vertices[*]._key, edges: p.edges[*].label }
+```
+
+It is guaranteed that at least one (but potentially more) edge fulfills the
+condition. All of the above filters can be defined on vertices in the exact
+same way.
+
+### Filtering on the path vs. filtering on vertices or edges
+
+Filtering on the path influences the iteration on your graph. If certain conditions
+aren't met, the traversal may stop continuing along this path.
+
+In contrast, filters on the vertex or edge only express whether you're interested
+in the actual value of these documents. Thus, they influence the list of returned
+documents (if you return v or e), similarly to specifying a non-null `min` value.
+If you specify a min value of 2, the traversal over the first
+two nodes of these paths has to be executed - you just won't see them in your
+result array.
+
+Filters on vertices or edges are similar - the traverser still has to walk along
+these nodes, since you may be interested in documents further down the path.
+
+### Examples
+
+Create a simple symmetric traversal demonstration graph:
+
+![traversal graph](../../../../images/traversal_graph.png)
+
+```js
+---
+name: GRAPHTRAV_01_create_graph
+description: ''
+---
+~addIgnoreCollection("circles");
+~addIgnoreCollection("edges");
+var examples = require("@arangodb/graph-examples/example-graph");
+var graph = examples.loadGraph("traversalGraph");
+db.circles.toArray();
+db.edges.toArray();
+print("once you don't need them anymore, clean them up:");
+examples.dropGraph("traversalGraph");
+```
+
+To get started we select the full graph. For a better overview we only return
+the vertex IDs:
+
+```aql
+---
+name: GRAPHTRAV_02_traverse_all_a
+description: ''
+dataset: traversalGraph
+---
+FOR v IN 1..3 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  RETURN v._key
+```
+
+```aql
+---
+name: GRAPHTRAV_02_traverse_all_b
+description: ''
+dataset: traversalGraph
+---
+FOR v IN 1..3 OUTBOUND 'circles/A' edges RETURN v._key
+```
+
+We can nicely see that it is heading for the first outer vertex, then goes back to
+the branch to descend into the next tree. After that it returns to our start node,
+to descend again. As we can see, both queries return the same result; the first one
+uses the named graph, the second uses the edge collections directly.
+
+Now we only want the elements of a specific depth (min = max = 2), the ones that
+are right behind the fork:
+
+```aql
+---
+name: GRAPHTRAV_03_traverse_3a
+description: ''
+dataset: traversalGraph
+---
+FOR v IN 2..2 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  RETURN v._key
+```
+
+```aql
+---
+name: GRAPHTRAV_03_traverse_3b
+description: ''
+dataset: traversalGraph
+---
+FOR v IN 2 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  RETURN v._key
+```
+
+As you can see, we can express this in two ways: with or without the `max` depth
+parameter.
+
+### Filter examples
+
+Now let's start to add some filters. We want to cut off the branch on the right
+side of the graph. We may filter in two ways:
+
+- we know the vertex at depth 1 has `_key` == `G`
+- we know the `label` attribute of the edge connecting **A** to **G** is `right_foo`
+
+```aql
+---
+name: GRAPHTRAV_04_traverse_4a
+description: ''
+dataset: traversalGraph
+---
+FOR v, e, p IN 1..3 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  FILTER p.vertices[1]._key != 'G'
+  RETURN v._key
+```
+
+```aql
+---
+name: GRAPHTRAV_04_traverse_4b
+description: ''
+dataset: traversalGraph
+---
+FOR v, e, p IN 1..3 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  FILTER p.edges[0].label != 'right_foo'
+  RETURN v._key
+```
+
+As we can see, all vertices behind **G** are skipped in both queries.
+The first filters on the vertex `_key`, the second on an edge label.
+Note again, as soon as a filter is not fulfilled for any of the three elements
+`v`, `e` or `p`, the complete set of these is excluded from the result.
+
+We may also combine several filters, for instance to filter out the right branch
+(**G**) and the **E** branch:
+
+```aql
+---
+name: GRAPHTRAV_05_traverse_5a
+description: ''
+dataset: traversalGraph
+---
+FOR v,e,p IN 1..3 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  FILTER p.vertices[1]._key != 'G'
+  FILTER p.edges[1].label != 'left_blub'
+  RETURN v._key
+```
+
+```aql
+---
+name: GRAPHTRAV_05_traverse_5b
+description: ''
+dataset: traversalGraph
+---
+FOR v,e,p IN 1..3 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  FILTER p.vertices[1]._key != 'G' AND p.edges[1].label != 'left_blub'
+  RETURN v._key
+```
+
+As you can see, combining two `FILTER` statements with an `AND` has the same result.
+
+## Comparing OUTBOUND / INBOUND / ANY
+
+All our previous examples traversed the graph in the `OUTBOUND` edge direction.
+You may however want to also traverse in reverse direction (`INBOUND`) or
+both (`ANY`). Since `circles/A` only has outbound edges, we start our queries
+from `circles/E`:
+
+```aql
+---
+name: GRAPHTRAV_06_traverse_6a
+description: ''
+dataset: traversalGraph
+---
+FOR v IN 1..3 OUTBOUND 'circles/E' GRAPH 'traversalGraph'
+  RETURN v._key
+```
+
+```aql
+---
+name: GRAPHTRAV_06_traverse_6b
+description: ''
+dataset: traversalGraph
+---
+FOR v IN 1..3 INBOUND 'circles/E' GRAPH 'traversalGraph'
+  RETURN v._key
+```
+
+```aql
+---
+name: GRAPHTRAV_06_traverse_6c
+description: ''
+dataset: traversalGraph
+---
+FOR v IN 1..3 ANY 'circles/E' GRAPH 'traversalGraph'
+  RETURN v._key
+```
+
+The first traversal only walks in the forward (`OUTBOUND`) direction.
+Therefore, from **E** we can only see **F**. Walking in reverse direction
+(`INBOUND`), we see the path to **A**: **B** → **A**.
+
+Walking in forward and reverse direction (`ANY`) we can see a more diverse result.
+First of all, we see the simple paths to **F** and **A**. However, these vertices
+have edges in other directions and they are traversed.
+
+**Note**: The traverser may use identical edges multiple times. For instance,
+if it walks from **E** to **F**, it continues to walk from **F** to **E**
+using the same edge once again. Due to this, we see duplicate nodes in the result.
+
+Please note that the direction can't be passed in by a bind parameter.
+
+## Use the AQL explainer for optimizations
+
+Now let's have a look at what the optimizer does behind the curtain and inspect
+traversal queries using [the explainer](../execution-and-performance/query-optimization.md):
+
+```aql
+---
+name: GRAPHTRAV_07_traverse_7
+description: ''
+dataset: traversalGraph
+explain: true
+---
+FOR v,e,p IN 1..3 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  LET localScopeVar = RAND() > 0.5
+  FILTER p.edges[0].theTruth != localScopeVar
+  RETURN v._key
+```
+
+```aql
+---
+name: GRAPHTRAV_07_traverse_8
+description: ''
+dataset: traversalGraph
+explain: true
+---
+FOR v,e,p IN 1..3 OUTBOUND 'circles/A' GRAPH 'traversalGraph'
+  FILTER p.edges[0].label == 'right_foo'
+  RETURN v._key
+```
+
+We now see two queries: In one we add a `localScopeVar` variable, which is outside
+the scope of the traversal itself - it is not known inside of the traverser.
+Therefore, this filter can only be executed after the traversal, which may be
+undesired in large graphs. The second query on the other hand only operates on the
+path, and therefore this condition can be used during the execution of the traversal.
+Paths that are filtered out by this condition won't be processed at all.
+
+And finally clean it up again:
+
+```js
+---
+name: GRAPHTRAV_99_drop_graph
+description: ''
+---
+~examples.loadGraph("traversalGraph");
+var examples = require("@arangodb/graph-examples/example-graph");
+examples.dropGraph("traversalGraph");
+```
+
+If this traversal is not powerful enough for your needs, for example if you
+cannot describe your conditions as AQL filter statements, then you might want
+to have a look at the
+[edge collection methods](../../develop/javascript-api/@arangodb/collection-object.md#edge-documents)
+in the JavaScript API.
+
+Also see how to [combine graph traversals](../examples-and-query-patterns/traversals.md).
diff --git a/site/content/arangodb/oem/aql/high-level-operations/_index.md b/site/content/arangodb/oem/aql/high-level-operations/_index.md
new file mode 100644
index 0000000000..78432ed69f
--- /dev/null
+++ b/site/content/arangodb/oem/aql/high-level-operations/_index.md
@@ -0,0 +1,9 @@
+---
+title: High-level AQL operations
+menuTitle: High-level Operations
+weight: 25
+description: >-
+  High-level operations are the core language constructs of the query language
+  to perform actions like finding and returning data, as well as creating and
+  modifying documents
+---
diff --git a/site/content/arangodb/oem/aql/high-level-operations/collect.md b/site/content/arangodb/oem/aql/high-level-operations/collect.md
new file mode 100644
index 0000000000..cdcc8dcd5a
--- /dev/null
+++ b/site/content/arangodb/oem/aql/high-level-operations/collect.md
@@ -0,0 +1,375 @@
+---
+title: '`COLLECT` operation in AQL'
+menuTitle: COLLECT
+weight: 40
+description: >-
+  The `COLLECT` operation can group data by one or multiple grouping criteria,
+  retrieve all distinct values, count how often values occur, and calculate
+  statistical properties efficiently
+---
+The different variants of `COLLECT` cover most needs for grouping and aggregating
+data. For aggregation using a sliding window, see the [`WINDOW` operation](window.md).
+
+## Syntax
+
+There are several syntax variants for `COLLECT` operations:
+
+```aql
+COLLECT variableName = expression
+COLLECT variableName = expression INTO groupsVariable
+COLLECT variableName = expression INTO groupsVariable = projectionExpression
+COLLECT variableName = expression INTO groupsVariable KEEP keepVariable
+COLLECT variableName = expression WITH COUNT INTO countVariable
+COLLECT variableName = expression AGGREGATE variableName = aggregateExpression
+COLLECT variableName = expression AGGREGATE variableName = aggregateExpression INTO groupsVariable
+COLLECT AGGREGATE variableName = aggregateExpression
+COLLECT AGGREGATE variableName = aggregateExpression INTO groupsVariable
+COLLECT WITH COUNT INTO countVariable
+```
+
+All variants can optionally end with an `OPTIONS { … }` clause.
+
+{{< info >}}
+The `COLLECT` operation eliminates all local variables in the current scope.
+After a `COLLECT`, only the variables introduced by `COLLECT` itself are available.
+{{< /info >}}
+
+## Grouping syntaxes
+
+The first syntax form of `COLLECT` only groups the result by the defined group
+criteria specified in *expression*. In order to further process the results
+produced by `COLLECT`, a new variable (specified by *variableName*) is introduced.
+This variable contains the group value.
+
+Here's an example query that finds the distinct values in `u.city` and makes
+them available in variable `city`:
+
+```aql
+FOR u IN users
+  COLLECT city = u.city
+  RETURN {
+    "city" : city
+  }
+```
+
+The second form does the same as the first form, but additionally introduces a
+variable (specified by *groupsVariable*) that contains all elements that fell into the
+group. This works as follows: The *groupsVariable* variable is an array containing
+as many elements as there are in the group. Each member of that array is
+a JSON object in which the value of every variable that is defined in the
+AQL query is bound to the corresponding attribute. Note that this considers
+all variables that are defined before the `COLLECT` statement, but not those on
+the top level (outside of any `FOR`), unless the `COLLECT` statement is itself
+on the top level, in which case all variables are taken. Furthermore, note
+that it is possible that the optimizer moves `LET` statements out of `FOR`
+statements to improve performance.
+
+```aql
+FOR u IN users
+  COLLECT city = u.city INTO groups
+  RETURN {
+    "city" : city,
+    "usersInCity" : groups
+  }
+```
+
+In the above example, the array `users` will be grouped by the attribute
+`city`. The result is a new array of documents, with one element per distinct
+`u.city` value. The elements from the original array (here: `users`) per city are
+made available in the variable `groups`. This is due to the `INTO` clause.
+
+`COLLECT` also allows specifying multiple group criteria. Individual group
+criteria can be separated by commas:
+
+```aql
+FOR u IN users
+  COLLECT country = u.country, city = u.city INTO groups
+  RETURN {
+    "country" : country,
+    "city" : city,
+    "usersInCity" : groups
+  }
+```
+
+In the above example, the array `users` is grouped by country first and then
+by city, and for each distinct combination of country and city, the users
+will be returned.
+
+## Discarding obsolete variables
+
+The third form of `COLLECT` allows rewriting the contents of the *groupsVariable*
+using an arbitrary *projectionExpression*:
+
+```aql
+FOR u IN users
+  COLLECT country = u.country, city = u.city INTO groups = u.name
+  RETURN {
+    "country" : country,
+    "city" : city,
+    "userNames" : groups
+  }
+```
+
+In the above example, the *projectionExpression* is just `u.name`. Therefore,
+only this attribute is copied into the *groupsVariable* for each document.
+This is probably much more efficient than copying all variables from the scope into
+the *groupsVariable* as it would happen without a *projectionExpression*.
+ +The expression following `INTO` can also be used for arbitrary computations: + +```aql +FOR u IN users + COLLECT country = u.country, city = u.city INTO groups = { + "name" : u.name, + "isActive" : u.status == "active" + } + RETURN { + "country" : country, + "city" : city, + "usersInCity" : groups + } +``` + +`COLLECT` also provides an optional `KEEP` clause that can be used to control +which variables will be copied into the variable created by `INTO`. If no +`KEEP` clause is specified, all variables from the scope will be copied as +sub-attributes into the *groupsVariable*. +This is safe but can have a negative impact on performance if there +are many variables in scope or the variables contain massive amounts of data. + +The following example limits the variables that are copied into the *groupsVariable* +to just `name`. The variables `u` and `someCalculation` also present in the scope +will not be copied into *groupsVariable* because they are not listed in the `KEEP` clause: + +```aql +FOR u IN users + LET name = u.name + LET someCalculation = u.value1 + u.value2 + COLLECT city = u.city INTO groups KEEP name + RETURN { + "city" : city, + "userNames" : groups[*].name + } +``` + +`KEEP` is only valid in combination with `INTO`. Only valid variable names can +be used in the `KEEP` clause. `KEEP` supports the specification of multiple +variable names. + +## Group length calculation + +`COLLECT` also provides a special `WITH COUNT` clause that can be used to +determine the number of group members efficiently. + +The simplest form just returns the number of items that made it into the +`COLLECT`: + +```aql +FOR u IN users + COLLECT WITH COUNT INTO length + RETURN length +``` + +The above is equivalent to, but less efficient than: + +```aql +RETURN LENGTH(users) +``` + +The `WITH COUNT` clause can also be used to efficiently count the number +of items in each group: + +```aql +FOR u IN users + COLLECT age = u.age WITH COUNT INTO length + RETURN { + "age" : age, + "count" : length + } +``` + +{{< info >}} +The `WITH COUNT` clause can only be used together with an `INTO` clause. +{{< /info >}} + +## Aggregation + +A `COLLECT` statement can be used to perform aggregation of data per group. To +only determine group lengths, the `WITH COUNT INTO` variant of `COLLECT` can be +used as described before. + +For other aggregations, it is possible to run aggregate functions on the `COLLECT` +results: + +```aql +FOR u IN users + COLLECT ageGroup = FLOOR(u.age / 5) * 5 INTO g + RETURN { + "ageGroup" : ageGroup, + "minAge" : MIN(g[*].u.age), + "maxAge" : MAX(g[*].u.age) + } +``` + +The above however requires storing all group values during the collect operation for +all groups, which can be inefficient. + +The special `AGGREGATE` variant of `COLLECT` allows building the aggregate values +incrementally during the collect operation, and is therefore often more efficient. + +With the `AGGREGATE` variant the above query becomes: + +```aql +FOR u IN users + COLLECT ageGroup = FLOOR(u.age / 5) * 5 + AGGREGATE minAge = MIN(u.age), maxAge = MAX(u.age) + RETURN { + ageGroup, + minAge, + maxAge + } +``` + +The `AGGREGATE` keyword can only be used after the `COLLECT` keyword. If used, it +must directly follow the declaration of the grouping keys. 
If no grouping keys +are used, it must follow the `COLLECT` keyword directly: + +```aql +FOR u IN users + COLLECT AGGREGATE minAge = MIN(u.age), maxAge = MAX(u.age) + RETURN { + minAge, + maxAge + } +``` + +Only specific expressions are allowed on the right-hand side of each `AGGREGATE` +assignment: + +- on the top level, an aggregate expression must be a call to one of the + supported aggregation functions: + - `LENGTH()` / `COUNT()` + - `MIN()` + - `MAX()` + - `SUM()` + - `AVERAGE()` / `AVG()` + - `STDDEV_POPULATION()` / `STDDEV()` + - `STDDEV_SAMPLE()` + - `VARIANCE_POPULATION()` / `VARIANCE()` + - `VARIANCE_SAMPLE()` + - `UNIQUE()` + - `SORTED_UNIQUE()` + - `COUNT_DISTINCT()` / `COUNT_UNIQUE()` + - `BIT_AND()` + - `BIT_OR()` + - `BIT_XOR()` + +- an aggregate expression must not refer to variables introduced by the `COLLECT` itself + +## `COLLECT` vs. `RETURN DISTINCT` + +In order to make a result set unique, one can either use `COLLECT` or +`RETURN DISTINCT`. + +```aql +FOR u IN users + RETURN DISTINCT u.age +``` + +```aql +FOR u IN users + COLLECT age = u.age + RETURN age +``` + +Behind the scenes, both variants create a *CollectNode*. However, they use +different implementations of `COLLECT` that have different properties: + +- `RETURN DISTINCT` **maintains the order of results**, but it is limited to + a single value. + +- `COLLECT` **changes the order of results** (sorted or undefined), but it + supports multiple values and is more flexible than `RETURN DISTINCT`. + +Aside from `COLLECT`s sophisticated grouping and aggregation capabilities, it +allows you to place a `LIMIT` operation before `RETURN` to potentially stop the +`COLLECT` operation early. + +## `COLLECT` options + +### `method` + +There are two variants of `COLLECT` that the optimizer can choose from: +the *sorted* and the *hash* variant. The `method` option can be used in a +`COLLECT` statement to inform the optimizer about the preferred method, +`"sorted"` or `"hash"`. + +```aql +COLLECT ... OPTIONS { method: "sorted" } +``` + +If no method is specified by the user, then the optimizer will create a plan +that uses the *sorted* method, and an additional plan using the *hash* method +if the `COLLECT` statement qualifies for it. + +If the method is explicitly set to *sorted*, then the optimizer will always use +the *sorted* variant of `COLLECT` and not even create a plan using the *hash* +variant. If it is explicitly set to *hash*, then the optimizer will create a +plan using the *hash* method **only if the `COLLECT` statement qualifies**. +Not all `COLLECT` statements can use the *hash* method, in particular ones that +do not perform any grouping. In case the `COLLECT` statement qualifies, +there will only be one plan that uses the *hash* method. Otherwise, the +optimizer will default to the *sorted* method. + +The *sorted* method requires its input to be sorted by the group criteria +specified in the `COLLECT` clause. To ensure correctness of the result, the +optimizer will automatically insert a `SORT` operation into the query in front +of the `COLLECT` statement. The optimizer may be able to optimize away that +`SORT` operation later if a sorted index is present on the group criteria. + +In case a `COLLECT` statement qualifies for using the *hash* variant, the +optimizer will create an extra plan for it at the beginning of the planning +phase. In this plan, no extra `SORT` statement will be added in front of the +`COLLECT`. This is because the *hash* variant of `COLLECT` does not require +sorted input. 
Instead, a `SORT` statement will be added after the `COLLECT` to +sort its output. This `SORT` statement may be optimized away again in later +stages. + +If the sort order of the `COLLECT` is irrelevant to the user, adding the extra +instruction `SORT null` after the `COLLECT` will allow the optimizer to remove +the sorts altogether: + +```aql +FOR u IN users + COLLECT age = u.age + SORT null /* note: will be optimized away */ + RETURN age +``` + +Which `COLLECT` variant is used by the optimizer if no preferred method is set +explicitly depends on the optimizer's cost estimations. The created plans with +the different `COLLECT` variants will be shipped through the regular +optimization pipeline. In the end, the optimizer will pick the plan with the +lowest estimated total cost as usual. + +In general, the *sorted* variant of `COLLECT` should be preferred in cases when +there is a sorted index present on the group criteria. In this case the +optimizer can eliminate the `SORT` operation in front of the `COLLECT`, so that +no `SORT` will be left. + +If there is no sorted index available on the group criteria, the up-front sort +required by the *sorted* variant can be expensive. In this case it is likely +that the optimizer will prefer the *hash* variant of `COLLECT`, which does not +require its input to be sorted. + +Which variant of `COLLECT` will actually be used can be figured out by looking +at the execution plan of a query, specifically the comment of the *CollectNode*: + +```aql +Execution plan: + Id NodeType Est. Comment + 1 SingletonNode 1 * ROOT + 2 EnumerateCollectionNode 5 - FOR doc IN coll /* full collection scan, projections: `name` */ + 3 CalculationNode 5 - LET #2 = doc.`name` /* attribute expression */ /* collections used: doc : coll */ + 4 CollectNode 5 - COLLECT name = #2 /* hash */ + 6 SortNode 5 - SORT name ASC /* sorting strategy: standard */ + 5 ReturnNode 5 - RETURN name +``` diff --git a/site/content/arangodb/oem/aql/high-level-operations/filter.md b/site/content/arangodb/oem/aql/high-level-operations/filter.md new file mode 100644 index 0000000000..71fdd19cb2 --- /dev/null +++ b/site/content/arangodb/oem/aql/high-level-operations/filter.md @@ -0,0 +1,125 @@ +--- +title: '`FILTER` operation in AQL' +menuTitle: FILTER +weight: 15 +description: >- + The `FILTER` operation lets you restrict the results to elements that match + arbitrary logical conditions +--- +## Syntax + +
FILTER expression
+ +*expression* must be a condition that evaluates to either `false` or `true`. + +## Usage + +If the condition result is false, the current element is skipped, so it will +not be processed further and not be part of the result. If the condition is +true, the current element is not skipped and can be further processed. + +See [Operators](../operators.md) for a list of comparison operators, logical +operators etc. that you can use in conditions. + +```aql +FOR u IN users + FILTER u.active == true && u.age < 39 + RETURN u +``` + +It is allowed to specify multiple `FILTER` statements in a query, even in +the same block. If multiple `FILTER` statements are used, their results will be +combined with a logical `AND`, meaning all filter conditions must be true to +include an element. + +```aql +FOR u IN users + FILTER u.active == true + FILTER u.age < 39 + RETURN u +``` + +In the above example, all array elements of `users` that have an attribute +`active` with value `true` and that have an attribute `age` with a value less +than `39` (including `null` ones) will be included in the result. All other +elements of `users` will be skipped and not be included in the result produced +by `RETURN`. + +{{< info >}} +See [Accessing Data from Collections](../fundamentals/accessing-data-from-collections.md) +for a description of the impact of non-existent or null attributes. +{{< /info >}} + +While `FILTER` typically occurs in combination with `FOR`, it can also be used +at the top level or in subqueries without a surrounding `FOR` loop. + +```aql +FILTER false +RETURN ASSERT(false, "never reached") +``` + +## Order of operations + +Note that the positions of `FILTER` statements can influence the result of a query. +There are 16 active users in the [test data](../examples-and-query-patterns/_index.md#example-data) +for instance: + +```aql +FOR u IN users + FILTER u.active == true + RETURN u +``` + +We can limit the result set to 5 users at most: + +```aql +FOR u IN users + FILTER u.active == true + LIMIT 5 + RETURN u +``` + +This may return the user documents of Jim, Diego, Anthony, Michael and Chloe for +instance. Which ones are returned is undefined, since there is no `SORT` statement +to ensure a particular order. If we add a second `FILTER` statement to only return +women... + +```aql +FOR u IN users + FILTER u.active == true + LIMIT 5 + FILTER u.gender == "f" + RETURN u +``` + +... it might just return the Chloe document, because the `LIMIT` is applied before +the second `FILTER`. No more than 5 documents arrive at the second `FILTER` block, +and not all of them fulfill the gender criterion, even though there are more than +5 active female users in the collection. A more deterministic result can be achieved +by adding a `SORT` block: + +```aql +FOR u IN users + FILTER u.active == true + SORT u.age ASC + LIMIT 5 + FILTER u.gender == "f" + RETURN u +``` + +This will return the users *Mariah*, *Mary*, and *Isabella*. If sorted by age in +`DESC` order, then the *Sophia* and *Emma* documents are returned. A `FILTER` after a +`LIMIT` is not very common however, and you probably want such a query instead: + +```aql +FOR u IN users + FILTER u.active == true AND u.gender == "f" + SORT u.age ASC + LIMIT 5 + RETURN u +``` + +The significance of where `FILTER` blocks are placed allows that this single +keyword can assume the roles of two SQL keywords, `WHERE` as well as `HAVING`. +AQL's `FILTER` thus works with `COLLECT` aggregates the same as with any other +intermediate result, document attribute etc. 
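+
+For example, placing `FILTER` after a `COLLECT` restricts the aggregated
+groups, which corresponds to SQL's `HAVING`. The following sketch groups the
+`users` collection by city and keeps only cities with at least ten users
+(the threshold is an arbitrary example value):
+
+```aql
+FOR u IN users
+  COLLECT city = u.city WITH COUNT INTO total
+  FILTER total >= 10  // applied to the aggregated count, like SQL's HAVING
+  RETURN { city, total }
+```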
diff --git a/site/content/arangodb/oem/aql/high-level-operations/for.md b/site/content/arangodb/oem/aql/high-level-operations/for.md new file mode 100644 index 0000000000..6c80f9d921 --- /dev/null +++ b/site/content/arangodb/oem/aql/high-level-operations/for.md @@ -0,0 +1,251 @@ +--- +title: '`FOR` operation in AQL' +menuTitle: FOR +weight: 5 +description: >- + The versatile `FOR` operation can iterate over a collection or View, the + elements of an array, or traverse a graph +--- +## Syntax + +The general syntax for iterating over collections and arrays is: + +
FOR variableName IN expression
+ +There is also a special variant for [graph traversals](../graphs/traversals.md): + +
FOR vertexVariableName [, edgeVariableName [, pathVariableName ] ] IN traversalExpression
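+
+For example, a traversal that binds all three variables could look as follows.
+This is only a sketch; the start vertex `persons/alice` and the named graph
+`knows_graph` are assumed to exist:
+
+```aql
+FOR v, e, p IN 1..2 OUTBOUND "persons/alice" GRAPH "knows_graph"
+  RETURN { vertex: v, edge: e, path: p }
+```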
+ +For Views, there is a special (optional) [`SEARCH` keyword](search.md): + +
FOR variableName IN viewName SEARCH searchExpression
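+
+For instance, assuming a View named `usersView` that indexes the `name`
+attribute of user documents, a basic search could look like this:
+
+```aql
+FOR doc IN usersView
+  SEARCH doc.name == "Jon"
+  RETURN doc
+```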
+ +{{< info >}} +Views cannot be used as edge collections in traversals: + +```aql +FOR v IN 1..3 ANY startVertex viewName /* invalid! */ +``` +{{< /info >}} + +All variants can optionally end with an `OPTIONS { … }` clause. + +## Usage + +Each array element returned by *expression* is visited exactly once. It is +required that *expression* returns an array in all cases. The empty array is +allowed, too. The current array element is made available for further processing +in the variable specified by *variableName*. + +```aql +FOR u IN users + RETURN u +``` + +This iterates over all elements of the array referred to as `users`. This array +consists of all documents stored in the collection named `users` in this case. +The `FOR` operation makes the current array element available in a variable `u`, +which is not modified in this example but simply returned as a result using the +`RETURN` operation. + +{{< info >}} +When iterating over a collection, the order of documents is undefined unless you +define an explicit sort order with a [`SORT` operation](sort.md). +{{< /info >}} + +The variable introduced by `FOR` is available until the scope the `FOR` is +placed in is closed. + +Another example that uses a statically declared array of values to iterate over: + +```aql +FOR year IN [ 2011, 2012, 2013 ] + RETURN { "year" : year, "isLeapYear" : year % 4 == 0 && (year % 100 != 0 || year % 400 == 0) } +``` + +Nesting of multiple `FOR` statements is allowed, too. When `FOR` statements are +nested, a cross product of the array elements returned by the individual `FOR` +statements will be created. + +```aql +FOR u IN users + FOR l IN locations + RETURN { "user" : u, "location" : l } +``` + +In this example, there are two array iterations: an outer iteration over the array +`users` plus an inner iteration over the array `locations`. The inner array is +traversed as many times as there are elements in the outer array. For each +iteration, the current values of `users` and `locations` are made available for +further processing in the variable `u` and `l`. + +You can also use subqueries, for example, to iterate over a collection +independently and get the results back as an array, that you can then access in +an outer `FOR` loop: + +```aql +FOR u IN users + LET subquery = (FOR l IN locations RETURN l.location) + RETURN { "user": u, "locations": subquery } +``` + +Also see [Combining queries with subqueries](../fundamentals/subqueries.md). + +## Options + +For collections and Views, the `FOR` construct supports an optional `OPTIONS` +clause to modify the behavior. The general syntax is as follows: + +
FOR variableName IN expression OPTIONS { option: value, ... }
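+
+For example, several of the options described below can be combined in a
+single `OPTIONS` clause. This is only a sketch; `byValue` stands for a
+hypothetical index name:
+
+```aql
+FOR doc IN collection OPTIONS { indexHint: "byValue", forceIndexHint: true }
+  FILTER doc.value == 42
+  RETURN doc
+```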
+ +### `indexHint` + +For collections, index hints can be given to the optimizer with the `indexHint` +option. The value can be a single **index name** or a list of index names in +order of preference: + +```aql +FOR … IN … OPTIONS { indexHint: "byName" } +``` + +```aql +FOR … IN … OPTIONS { indexHint: ["byName", "byColor"] } +``` + +Whenever there is a chance to potentially use an index for this `FOR` loop, +the optimizer will first check if the specified index can be used. In case of +an array of indexes, the optimizer will check the feasibility of each index in +the specified order. It will use the first suitable index, regardless of +whether it would normally use a different index. + +If none of the specified indexes is suitable, then it falls back to its normal +logic to select another index or fails if `forceIndexHint` is enabled. + +### `forceIndexHint` + +Index hints are not enforced by default. If `forceIndexHint` is set to `true`, +then an error is generated if `indexHint` does not contain a usable index, +instead of using a fallback index or not using an index at all. + +```aql +FOR … IN … OPTIONS { indexHint: … , forceIndexHint: true } +``` + +### `disableIndex` + +Introduced in: v3.9.1 + +In some rare cases it can be beneficial to not do an index lookup or scan, +but to do a full collection scan. +An index lookup can be more expensive than a full collection scan if +the index lookup produces many (or even all documents) and the query cannot +be satisfied from the index data alone. + +Consider the following query and an index on the `value` attribute being +present: + +```aql +FOR doc IN collection + FILTER doc.value <= 99 + RETURN doc.other +``` + +In this case, the optimizer will likely pick the index on `value`, because +it will cover the query's `FILTER` condition. To return the value for the +`other` attribute, the query must additionally look up the documents for +each index value that passes the `FILTER` condition. If the number of +index entries is large (close or equal to the number of documents in the +collection), then using an index can cause more work than just scanning +over all documents in the collection. + +The optimizer will likely prefer index scans over full collection scans, +even if an index scan turns out to be slower in the end. +You can force the optimizer to not use an index for any given `FOR` +loop by using the `disableIndex` hint and setting it to `true`: + +```aql +FOR doc IN collection OPTIONS { disableIndex: true } + FILTER doc.value <= 99 + RETURN doc.other +``` + +Using `disableIndex: false` has no effect on geo indexes or fulltext indexes. + +Note that setting `disableIndex: true` plus `indexHint` is ambiguous. In +this case the optimizer will always prefer the `disableIndex` hint. + +### `maxProjections` + +Introduced in: v3.9.1 + +By default, the query optimizer will consider up to 5 document attributes +per FOR loop to be used as projections. If more than 5 attributes of a +collection are accessed in a `FOR` loop, the optimizer will prefer to +extract the full document and not use projections. + +The threshold value of 5 attributes is arbitrary and can be adjusted +by using the `maxProjections` hint. +The default value for `maxProjections` is `5`, which is compatible with the +previously hard-coded default value. 
+ +For example, using a `maxProjections` hint of 7, the following query will +extract 7 attributes as projections from the original document: + +```aql +FOR doc IN collection OPTIONS { maxProjections: 7 } + RETURN [ doc.val1, doc.val2, doc.val3, doc.val4, doc.val5, doc.val6, doc.val7 ] +``` + +Normally it is not necessary to adjust the value of `maxProjections`, but +there are a few corner cases where it can make sense: + +- It can be beneficial to increase `maxProjections` when extracting many small + attributes from very large documents, and a full copy of the documents should + be avoided. +- It can be beneficial to decrease `maxProjections` to _avoid_ using + projections, if the cost of projections is higher than doing copies of the + full documents. This can be the case for very small documents. + +{{< info >}} +Starting with version 3.10, `maxProjections` can be used in +[Graph Traversals](../graphs/traversals.md#working-with-named-graphs) (Enterprise Edition only). +{{< /info >}} + +### `useCache` + +Introduced in: v3.10.0 + +You can disable in-memory caches that you may have enabled for persistent indexes +on a case-by-case basis. This is useful for queries that access indexes with +enabled in-memory caches, but for which it is known that using the cache will +have a negative performance impact. In this case, you can set the `useCache` +hint to `false`: + +```aql +FOR doc IN collection OPTIONS { useCache: false } + FILTER doc.value == @value + ... +``` + +You can set the hint individually per `FOR` loop. +If you do not set the `useCache` hint, it will implicitly default to `true`. + +The hint does not have any effect on `FOR` loops that do not use indexes, or +on `FOR` loops that access indexes that do not have in-memory caches enabled. +It also does not affect queries for which an existing in-memory +cache cannot be used (i.e. because the query's filter condition does not contain +equality lookups for all index attributes). It cannot be used for `FOR` +operations that iterate over Views or perform graph traversals. + +Also see [Caching of index values](../../index-and-search/indexing/working-with-indexes/persistent-indexes.md#caching-of-index-values). + +### `lookahead` + +The multi-dimensional index type `zkd` supports an optional index hint for +tweaking performance: + +```aql +FOR … IN … OPTIONS { lookahead: 32 } +``` + +See [Multi-dimensional indexes](../../index-and-search/indexing/working-with-indexes/multi-dimensional-indexes.md#lookahead-index-hint). diff --git a/site/content/arangodb/oem/aql/high-level-operations/insert.md b/site/content/arangodb/oem/aql/high-level-operations/insert.md new file mode 100644 index 0000000000..88acdfdf4f --- /dev/null +++ b/site/content/arangodb/oem/aql/high-level-operations/insert.md @@ -0,0 +1,215 @@ +--- +title: '`INSERT` operation in AQL' +menuTitle: INSERT +weight: 65 +description: >- + You can use the `INSERT` operation to create new documents in a collection +--- +Each `INSERT` operation is restricted to a single collection, and the +[collection name](../../concepts/data-structure/collections.md#collection-names) must not be dynamic. +Only a single `INSERT` statement per collection is allowed per AQL query, and +it cannot be followed by read or write operations that access the same +collection, by traversal operations, or AQL functions that can read documents. + +## Syntax + +The syntax for an insert operation is: + +
INSERT document INTO collection
+
+It can optionally end with an `OPTIONS { … }` clause.
+
+{{< tip >}}
+The `IN` keyword is allowed in place of `INTO` and has the same meaning.
+{{< /tip >}}
+
+`collection` must contain the name of the collection into which the documents should
+be inserted. `document` is the document to be inserted, and it may or may not contain
+a `_key` attribute. If no `_key` attribute is provided, ArangoDB will auto-generate
+a value for `_key`. Inserting a document will also auto-generate a document
+revision number for the document.
+
+```aql
+FOR i IN 1..100
+  INSERT { value: i } INTO numbers
+```
+
+An insert operation can also be performed without a `FOR` loop to insert a
+single document:
+
+```aql
+INSERT { value: 1 } INTO numbers
+```
+
+When inserting into an [edge collection](../../concepts/data-models.md#graph-model),
+it is mandatory to specify the attributes `_from` and `_to` in the document:
+
+```aql
+FOR u IN users
+  FOR p IN products
+    FILTER u._key == p.recommendedBy
+    INSERT { _from: u._id, _to: p._id } INTO recommendations
+```
+
+## Query options
+
+The `OPTIONS` keyword followed by an object with query options can optionally
+be provided in an `INSERT` operation.
+
+### `ignoreErrors`
+
+`ignoreErrors` can be used to suppress query errors that may occur when
+violating unique key constraints:
+
+```aql
+FOR i IN 1..1000
+  INSERT {
+    _key: CONCAT('test', i),
+    name: "test",
+    foobar: true
+  } INTO users OPTIONS { ignoreErrors: true }
+```
+
+### `waitForSync`
+
+To make sure data are durable when an insert query returns, there is the
+`waitForSync` query option:
+
+```aql
+FOR i IN 1..1000
+  INSERT {
+    _key: CONCAT('test', i),
+    name: "test",
+    foobar: true
+  } INTO users OPTIONS { waitForSync: true }
+```
+
+### `overwrite`
+
+{{< info >}}
+The `overwrite` option is deprecated and superseded by
+[overwriteMode](#overwritemode).
+{{< /info >}}
+
+If you want to replace existing documents with documents having the same key,
+there is the `overwrite` query option. This lets you safely replace the
+documents instead of raising a "unique constraint violated" error:
+
+```aql
+FOR i IN 1..1000
+  INSERT {
+    _key: CONCAT('test', i),
+    name: "test",
+    foobar: true
+  } INTO users OPTIONS { overwrite: true }
+```
+
+### `overwriteMode`
+
+To further control the behavior of `INSERT` on primary index unique constraint
+violations, there is the `overwriteMode` option. It offers the following
+modes:
+
+- `"ignore"`: if a document with the specified `_key` value exists already,
+  nothing will be done and no write operation will be carried out. The
+  insert operation will return success in this case. This mode does not
+  support returning the old document version. Using `RETURN OLD` will trigger
+  a parse error, as there will be no old version to return. `RETURN NEW`
+  will only return the document in case it was inserted. In case the
+  document already existed, `RETURN NEW` will return `null`.
+- `"replace"`: if a document with the specified `_key` value exists already,
+  it will be overwritten with the specified document value. This mode will
+  also be used when no overwrite mode is specified but the `overwrite`
+  flag is set to `true`.
+- `"update"`: if a document with the specified `_key` value exists already,
+  it will be patched (partially updated) with the specified document value.
+- `"conflict"`: if a document with the specified `_key` value exists already,
+  return a unique constraint violation error so that the insert operation
+  fails.
+  This is also the default behavior in case the overwrite mode is
+  not set, and the `overwrite` flag is `false` or not set either.
+
+The main use case of inserting documents with overwrite mode `ignore` is
+to make sure that certain documents exist in the cheapest possible way.
+In case the target document already exists, the `ignore` mode is most
+efficient, as it will not retrieve the existing document from storage and
+not write any updates to it.
+
+When using the `update` overwrite mode, the `keepNull` and `mergeObjects`
+options control how the update is done.
+See [UPDATE operation](update.md#query-options).
+
+```aql
+FOR i IN 1..1000
+  INSERT {
+    _key: CONCAT('test', i),
+    name: "test",
+    foobar: true
+  } INTO users OPTIONS { overwriteMode: "update", keepNull: true, mergeObjects: false }
+```
+
+### `exclusive`
+
+The RocksDB engine does not require collection-level locks.
+Different write operations on the same collection do not block each other, as
+long as there are no _write-write conflicts_ on the same documents. From an application
+development perspective it can be desired to have exclusive write access on collections,
+to simplify the development. Note that writes do not block reads in RocksDB.
+Exclusive access can also speed up modification queries, because we avoid conflict checks.
+
+Use the `exclusive` option to achieve this effect on a per query basis:
+
+```aql
+FOR doc IN collection
+  INSERT { myval: doc.val + 1 } INTO users
+  OPTIONS { exclusive: true }
+```
+
+### `refillIndexCaches`
+
+Whether to add new entries to in-memory index caches if document insertions
+affect the edge index or cache-enabled persistent indexes.
+
+```aql
+INSERT { _from: "vert/A", _to: "vert/B" } INTO coll
+  OPTIONS { refillIndexCaches: true }
+```
+
+## Returning the inserted documents
+
+The inserted documents can also be returned by the query. In this case, the `INSERT`
+statement can be followed by a `RETURN` statement (intermediate `LET` statements are
+allowed, too). To refer to the inserted documents, the `INSERT` statement introduces
+a pseudo-value named `NEW`.
+
+The documents in `NEW` contain all attributes, even those auto-generated by
+the database (e.g. `_id`, `_key`, `_rev`).
+
+```aql
+INSERT document INTO collection RETURN NEW
+```
+
+Following is an example using a variable named `inserted` to return the inserted
+documents. For each inserted document, the document key is returned:
+
+```aql
+FOR i IN 1..100
+  INSERT { value: i }
+  INTO users
+  LET inserted = NEW
+  RETURN inserted._key
+```
+
+## Transactionality
+
+On a single server, an insert operation is executed transactionally in an
+all-or-nothing fashion.
+
+A query may execute intermediate transaction commits in case the running
+transaction (AQL query) hits the specified size thresholds. In this case, the
+query's operations carried out so far are committed and not rolled back in case
+of a later abort/rollback. This behavior can be controlled by adjusting the
+intermediate commit settings for the RocksDB engine. See
+[Known limitations for AQL queries](../fundamentals/limitations.md#storage-engine-properties).
+
+For sharded collections, the entire query and/or insert operation may not be
+transactional, especially if it involves different shards and/or DB-Servers.
diff --git a/site/content/arangodb/oem/aql/high-level-operations/let.md b/site/content/arangodb/oem/aql/high-level-operations/let.md new file mode 100644 index 0000000000..d8665ac121 --- /dev/null +++ b/site/content/arangodb/oem/aql/high-level-operations/let.md @@ -0,0 +1,69 @@ +--- +title: '`LET` operation in AQL' +menuTitle: LET +weight: 35 +description: >- + You can use the `LET` operation to assign an arbitrary value to a variable +--- +The variable is introduced in the scope the `LET` statement is placed in. +You cannot change the value once assigned. + +## Syntax + +
LET variableName = expression
+ +*expression* can be a simple expression or a subquery. + +For allowed variable names [AQL Syntax](../fundamentals/syntax.md#names). + +## Usage + +Variables are immutable in AQL, which means they cannot be re-assigned: + +```aql +LET a = [1, 2, 3] // initial assignment + +a = PUSH(a, 4) // syntax error, unexpected identifier +LET a = PUSH(a, 4) // parsing error, variable 'a' is assigned multiple times +LET b = PUSH(a, 4) // allowed, result: [1, 2, 3, 4] +``` + +`LET` statements are mostly used to declare complex computations and to avoid +repeated computations of the same value at multiple parts of a query. + +```aql +FOR u IN users + LET numRecommendations = LENGTH(u.recommendations) + RETURN { + "user" : u, + "numRecommendations" : numRecommendations, + "isPowerUser" : numRecommendations >= 10 + } +``` + +In the above example, the computation of the number of recommendations is +factored out using a `LET` statement, thus avoiding computing the value twice in +the `RETURN` statement. + +Another use case for `LET` is to declare a complex computation in a subquery, +making the whole query more readable. + +```aql +FOR u IN users + LET friends = ( + FOR f IN friends + FILTER u.id == f.userId + RETURN f + ) + LET memberships = ( + FOR m IN memberships + FILTER u.id == m.userId + RETURN m + ) + RETURN { + "user" : u, + "friends" : friends, + "numFriends" : LENGTH(friends), + "memberShips" : memberships + } +``` diff --git a/site/content/arangodb/oem/aql/high-level-operations/limit.md b/site/content/arangodb/oem/aql/high-level-operations/limit.md new file mode 100644 index 0000000000..c34ba21d02 --- /dev/null +++ b/site/content/arangodb/oem/aql/high-level-operations/limit.md @@ -0,0 +1,96 @@ +--- +title: '`LIMIT` operation in AQL' +menuTitle: LIMIT +weight: 30 +description: >- + The `LIMIT` operation allows you to reduce the number of results to at most + the specified number and optionally skip results using an offset for pagination +--- +## Syntax + +Two general forms of `LIMIT` are: + +
LIMIT count
+LIMIT offset, count
+
+The first form allows specifying only the `count` value whereas the second form
+allows specifying both `offset` and `count`. The first form is identical to using
+the second form with an `offset` value of `0`.
+
+## Usage
+
+```aql
+FOR u IN users
+  LIMIT 5
+  RETURN u
+```
+
+The above query returns five documents of the `users` collection.
+It could also be written as `LIMIT 0, 5` for the same result.
+Which documents it returns is rather arbitrary because collections have no
+defined order for the documents they contain. A `LIMIT` operation should usually
+be accompanied by a `SORT` operation to explicitly specify a sorting order,
+unless any five documents are acceptable. However, also consider that if
+you run a query multiple times with varying `LIMIT` offsets for pagination,
+you can miss results or get duplicate results if the sort order is undefined.
+
+{{< info >}}
+In case multiple documents contain the same `SORT` attribute value, the result
+set does not contain the tied documents in a fixed order as the order between
+them is undefined. Additionally, the `SORT` operation does not guarantee a stable
+sort if there is no unique value to sort by.
+
+If a fixed total order is required, you can use a tiebreaker. Sort by an
+additional attribute that can break the ties. If the application has a preferred
+attribute that indicates the order of documents with the same value, then use
+this attribute. If there is no such attribute, you can still achieve a stable
+sort by using the `_id` system attribute as it is unique and present in every
+document.
+
+```aql
+FOR u IN users
+  SORT u.firstName, u._id // break name ties with the document ID
+  LIMIT 5
+  RETURN u
+```
+{{< /info >}}
+
+The `offset` value specifies how many elements from the result shall be
+skipped. It must be 0 or greater. The `count` value specifies the maximum
+number of elements to be included in the result.
+
+```aql
+FOR u IN users
+  SORT u.firstName, u.lastName, u.id DESC
+  LIMIT 2, 5
+  RETURN u
+```
+
+In the above example, the documents of `users` are sorted, the first two results
+get skipped, and the query returns the next five user documents.
+
+{{< info >}}
+Variables, expressions, and subqueries cannot be used for `offset` and `count`.
+The values for `offset` and `count` must be known at query compile time,
+which means that you can only use number literals, bind parameters or
+expressions that can be resolved at query compile time.
+{{< /info >}}
+
+Where a `LIMIT` is used in relation to other operations in a query has meaning.
+`LIMIT` operations before `FILTER`s in particular can change the result
+significantly, because the operations are executed in the order in which they
+are written in the query. See [FILTER](filter.md#order-of-operations)
+for a detailed example.
+
+The `LIMIT` operation never applies to write operations (`INSERT`, `UPDATE`,
+`REPLACE`, `REMOVE`, `UPSERT`) but only their returned results. In the following
+example, five documents are created, regardless of the `LIMIT 2`.
The `LIMIT` +operation only constrains the number of documents returned by the query (via +`RETURN`) to the first two: + +```aql +FOR i IN 1..5 + INSERT { value: i } INTO coll + LIMIT 2 + RETURN NEW +``` diff --git a/site/content/arangodb/oem/aql/high-level-operations/remove.md b/site/content/arangodb/oem/aql/high-level-operations/remove.md new file mode 100644 index 0000000000..7963c48b70 --- /dev/null +++ b/site/content/arangodb/oem/aql/high-level-operations/remove.md @@ -0,0 +1,185 @@ +--- +title: '`REMOVE` operation in AQL' +menuTitle: REMOVE +weight: 50 +description: >- + You can use the `REMOVE` operation to delete documents from a collection +--- +Each `REMOVE` operation is restricted to a single collection, and the +[collection name](../../concepts/data-structure/collections.md#collection-names) must not be dynamic. +Only a single `REMOVE` statement per collection is allowed per AQL query, and +it cannot be followed by read or write operations that access the same collection, by +traversal operations, or AQL functions that can read documents. + +## Syntax + +The syntax for a remove operation is: + +
REMOVE keyExpression IN collection
+ +It can optionally end with an `OPTIONS { … }` clause. + +`collection` must contain the name of the collection to remove the documents +from. `keyExpression` must be an expression that contains the document identification. +This can either be a string (which must then contain the +[document key](../../concepts/data-structure/documents/_index.md#document-keys)) or a +document, which must contain a `_key` attribute. + +The following queries are thus equivalent: + +```aql +FOR u IN users + REMOVE { _key: u._key } IN users +``` + +```aql +FOR u IN users + REMOVE u._key IN users +``` + +```aql +FOR u IN users + REMOVE u IN users +``` + +A remove operation can remove arbitrary documents, and the documents +do not need to be identical to the ones produced by a preceding `FOR` statement: + +```aql +FOR i IN 1..1000 + REMOVE { _key: CONCAT('test', i) } IN users +``` + +```aql +FOR u IN users + FILTER u.active == false + REMOVE { _key: u._key } IN backup +``` + +A single document can be removed as well, using a document key string or a +document with `_key` attribute: + +```aql +REMOVE 'john' IN users +``` + +```aql +LET doc = DOCUMENT('users/john') +REMOVE doc IN users +``` + +The restriction of a single remove operation per query and collection +applies. The following query causes an _access after data-modification_ +error because of the third remove operation: + +```aql +REMOVE 'john' IN users +REMOVE 'john' IN backups // OK, different collection +REMOVE 'mary' IN users // Error, users collection again +``` + +## Query options + +### `ignoreErrors` + +`ignoreErrors` can be used to suppress query errors that may occur when trying to +remove non-existing documents. For example, the following query will fail if one +of the to-be-deleted documents does not exist: + +```aql +FOR i IN 1..1000 + REMOVE { _key: CONCAT('test', i) } IN users +``` + +By specifying the `ignoreErrors` query option, these errors can be suppressed so +the query completes: + +```aql +FOR i IN 1..1000 + REMOVE { _key: CONCAT('test', i) } IN users OPTIONS { ignoreErrors: true } +``` + +### `waitForSync` + +To make sure data has been written to disk when a query returns, there is the `waitForSync` +query option: + +```aql +FOR i IN 1..1000 + REMOVE { _key: CONCAT('test', i) } IN users OPTIONS { waitForSync: true } +``` + +### `ignoreRevs` + +In order to not accidentally remove documents that have been updated since you last fetched +them, you can use the option `ignoreRevs` to either let ArangoDB compare the `_rev` values and +only succeed if they still match, or let ArangoDB ignore them (default): + +```aql +FOR i IN 1..1000 + REMOVE { _key: CONCAT('test', i), _rev: "1287623" } IN users OPTIONS { ignoreRevs: false } +``` + +### `exclusive` + +The RocksDB engine does not require collection-level locks. Different write +operations on the same collection do not block each other, as +long as there are no _write-write conflicts_ on the same documents. From an application +development perspective it can be desired to have exclusive write access on collections, +to simplify the development. Note that writes do not block reads in RocksDB. +Exclusive access can also speed up modification queries, because we avoid conflict checks. 
+
+Use the `exclusive` option to achieve this effect on a per query basis:
+
+```aql
+FOR doc IN collection
+  REMOVE doc IN collection
+  OPTIONS { exclusive: true }
+```
+
+### `refillIndexCaches`
+
+Whether to delete existing entries from in-memory index caches and refill them
+if document removals affect the edge index or cache-enabled persistent indexes.
+
+```aql
+REMOVE { _key: "123" } IN edgeColl
+  OPTIONS { refillIndexCaches: true }
+```
+
+## Returning the removed documents
+
+The removed documents can also be returned by the query. In this case, the
+`REMOVE` statement must be followed by a `RETURN` statement (intermediate `LET`
+statements are allowed, too). `REMOVE` introduces the pseudo-value `OLD` to
+refer to the removed documents:
+
+```aql
+REMOVE keyExpression IN collection options RETURN OLD
+```
+
+Following is an example using a variable named `removed` for capturing the removed
+documents. For each removed document, the document key will be returned.
+
+```aql
+FOR u IN users
+  REMOVE u IN users
+  LET removed = OLD
+  RETURN removed._key
+```
+
+## Transactionality
+
+On a single server, the document removal is executed transactionally in an
+all-or-nothing fashion.
+
+A query may execute intermediate transaction commits in case the running
+transaction (AQL query) hits the specified size thresholds. In this case, the
+query's operations carried out so far are committed and not rolled back in case
+of a later abort/rollback. This behavior can be controlled by adjusting the
+intermediate commit settings for the RocksDB engine. See
+[Known limitations for AQL queries](../fundamentals/limitations.md#storage-engine-properties).
+
+For sharded collections, the entire query and/or remove operation may not be
+transactional, especially if it involves different shards and/or DB-Servers.
diff --git a/site/content/arangodb/oem/aql/high-level-operations/replace.md b/site/content/arangodb/oem/aql/high-level-operations/replace.md
new file mode 100644
index 0000000000..b2aa732641
--- /dev/null
+++ b/site/content/arangodb/oem/aql/high-level-operations/replace.md
@@ -0,0 +1,306 @@
+---
+title: '`REPLACE` operation in AQL'
+menuTitle: REPLACE
+weight: 60
+description: >-
+  The `REPLACE` operation removes all attributes of a document and sets the
+  given attributes, excluding immutable system attributes
+---
+Each `REPLACE` operation is restricted to a single collection, and the
+[collection name](../../concepts/data-structure/collections.md#collection-names) must not be dynamic.
+Only a single `REPLACE` statement per collection is allowed per AQL query, and
+it cannot be followed by read or write operations that access the same collection,
+by traversal operations, or AQL functions that can read documents.
+
+You cannot replace the `_id`, `_key`, and `_rev` system attributes, but you can
+replace the `_from` and `_to` attributes.
+
+Replacing a document modifies the document's revision number (`_rev` attribute)
+with a server-generated value.
+
+## Syntax
+
+The two syntaxes for a replace operation are:
REPLACE document IN collection
+REPLACE keyExpression WITH document IN collection
+ +Both variants can optionally end with an `OPTIONS { … }` clause. + +`collection` must contain the name of the collection in which the document +should be replaced. + +`document` must be an object and contain the attributes and values to set. +**All existing attributes** in the stored document **are removed** from it and +**only the provided attributes are set** (excluding the immutable `_id` and +`_key` attributes and the system-managed `_rev` attribute). This distinguishes +the `REPLACE` from the `UPDATE` operation, which only affects the attributes +you specify in the operation and doesn't change other attributes of the stored +document. + +### `REPLACE IN ` + +Using the first syntax, the `document` object must have a `_key` attribute with +the document key. The existing document with this key is replaced with the +attributes provided by the `document` object (except for the `_id`, `_key`, and +`_rev` system attributes). + +The following query replaces the document identified by the key `my_key` in the +`users` collection, only setting a `name` and a `status` attribute. The key is +passed via the `_key` attribute alongside other attributes: + +```aql +REPLACE { _key: "my_key", name: "Jon", status: "active" } IN users +``` + +The following query is invalid because the object does not contain a `_key` +attribute and thus it is not possible to determine the document to +be replaced: + +```aql +REPLACE { name: "Jon" } IN users +``` + +You can combine the `REPLACE` operation with a `FOR` loop to determine the +necessary key attributes, like shown below: + +```aql +FOR u IN users + REPLACE { _key: u._key, name: CONCAT(u.firstName, " ", u.lastName), status: u.status } IN users +``` + +Note that the `REPLACE` and `FOR` operations are independent of each other and +`u` does not automatically define a document for the `REPLACE` statement. +Thus, the following query is invalid: + +```aql +FOR u IN users + REPLACE { name: CONCAT(u.firstName, " ", u.lastName), status: u.status } IN users +``` + +### `REPLACE WITH IN ` + +Using the second syntax, the document to replace is defined by the +`keyExpression`. It can either be a string with the document key, an object +which contains a `_key` attribute with the document key, or an expression that +evaluates to either of these two. The existing document with this key is +replaced with the attributes provided by the `document` object (except for +the `_id`, `_key`, and `_rev` system attributes). + +The following query replaces the document identified by the key `my_key` in the +`users` collection, only setting a `name` and a `status` attribute. The key is +passed as a string in the `keyExpression`. The attributes to set are passed +separately as the `document` object: + +```aql +REPLACE "my_key" WITH { name: "Jon", status: "active" } IN users +``` + +The `document` object may contain a `_key` attribute, but it is ignored. + +You cannot define the document to replace using an `_id` attribute, nor pass a +document identifier as a string (like `"users/john"`). However, you can use +`PARSE_IDENTIFIER().key` as `keyExpression` to get the document key as a +string: + +```aql +LET key = PARSE_IDENTIFIER("users/john").key +REPLACE key WITH { ... } IN users +``` + +### Comparison of the syntaxes + +Both syntaxes of the `REPLACE` operation allow you to define the document to +modify and the attributes to set. The document to update is effectively +identified by a document key in combination with the specified collection. 
+ +The `REPLACE` operation supports different ways of specifying the document key. +You can choose the syntax variant that is the most convenient for you. + +The following queries are equivalent: + +```aql +FOR u IN users + REPLACE u WITH { name: CONCAT(u.firstName, " ", u.lastName), status: u.status } IN users +``` + +```aql +FOR u IN users + REPLACE u._key WITH { name: CONCAT(u.firstName, " ", u.lastName), status: u.status } IN users +``` + +```aql +FOR u IN users + REPLACE { _key: u._key } WITH { name: CONCAT(u.firstName, " ", u.lastName), status: u.status } IN users +``` + +```aql +FOR u IN users + REPLACE { _key: u._key, name: CONCAT(u.firstName, " ", u.lastName), status: u.status } IN users +``` + +## Dynamic key expressions + +A `REPLACE` operation may replace arbitrary documents, using either of the two +syntaxes: + +```aql +FOR i IN 1..1000 + REPLACE { _key: CONCAT("test", i), name: "Paula", status: "active" } IN users +``` + +```aql +FOR i IN 1..1000 + REPLACE CONCAT("test", i) WITH { name: "Paula", status: "active" } IN users +``` + +## Target a different collection + +The documents a `REPLACE` operation modifies can be in a different collection +than the ones produced by a preceding `FOR` operation: + +```aql +FOR u IN users + FILTER u.active == false + REPLACE u WITH { status: "inactive", name: u.name } IN backup +``` + +Note how documents are read from the `users` collection but replaced in another +collection called `backup`. Both collections need to use matching document keys +for this to work. + +Although the `u` variable holds a whole document, it is only used to define the +target document. The `_key` attribute of the object is extracted and the target +document is solely defined by the document key string value and the specified +collection of the `REPLACE` operation (`backup`). There is no link to the +original collection (`users`). + +## Query options + +You can optionally set query options for the `REPLACE` operation: + +```aql +REPLACE ... IN users OPTIONS { ... } +``` + +### `ignoreErrors` + +You can use `ignoreErrors` to suppress query errors that may occur when trying to +replace non-existing documents or when violating unique key constraints: + +```aql +FOR i IN 1..1000 + REPLACE CONCAT("test", i) + WITH { foobar: true } IN users + OPTIONS { ignoreErrors: true } +``` + +You cannot modify the `_id`, `_key`, and `_rev` system attributes, but attempts +to change them are ignored and not considered errors. + +### `waitForSync` + +To make sure data are durable when a replace query returns, there is the `waitForSync` +query option: + +```aql +FOR i IN 1..1000 + REPLACE CONCAT("test", i) + WITH { foobar: true } IN users + OPTIONS { waitForSync: true } +``` + +### `ignoreRevs` + +In order to not accidentally overwrite documents that have been modified since you last fetched +them, you can use the option `ignoreRevs` to either let ArangoDB compare the `_rev` value and only +succeed if they still match, or let ArangoDB ignore them (default): + +```aql +FOR i IN 1..1000 + REPLACE { _key: CONCAT("test", i), _rev: "1287623" } + WITH { foobar: true } IN users + OPTIONS { ignoreRevs: false } +``` + +### `exclusive` + +The RocksDB engine does not require collection-level locks. Different write +operations on the same collection do not block each other, as +long as there are no _write-write conflicts_ on the same documents. From an application +development perspective it can be desired to have exclusive write access on collections, +to simplify the development. 
Note that writes do not block reads in RocksDB. +Exclusive access can also speed up modification queries, because we avoid conflict checks. + +Use the `exclusive` option to achieve this effect on a per query basis: + +```aql +FOR doc IN collection + REPLACE doc + WITH { replaced: true } IN collection + OPTIONS { exclusive: true } +``` + +### `refillIndexCaches` + +Whether to update existing entries in in-memory index caches if documents +replacements affect the edge index or cache-enabled persistent indexes. + +```aql +REPLACE { _key: "123", _from: "vert/C", _to: "vert/D" } IN edgeColl + OPTIONS { refillIndexCaches: true } +``` + +## Returning the modified documents + +You can optionally return the documents modified by the query. In this case, the `REPLACE` +operation needs to be followed by a `RETURN` operation. Intermediate `LET` operations are +allowed, too. These operations can refer to the pseudo-variables `OLD` and `NEW`. +The `OLD` pseudo-variable refers to the document revisions before the replace, and `NEW` +refers to the document revisions after the replace. + +Both `OLD` and `NEW` contain all document attributes, even those not specified +in the replace expression. + +```aql +REPLACE document IN collection options RETURN OLD +REPLACE document IN collection options RETURN NEW +REPLACE keyExpression WITH document IN collection options RETURN OLD +REPLACE keyExpression WITH document IN collection options RETURN NEW +``` + +Following is an example using a variable named `previous` to return the original +documents before modification. For each replaced document, the document key is +returned: + +```aql +FOR u IN users + REPLACE u WITH { value: "test" } IN users + LET previous = OLD + RETURN previous._key +``` + +The following query uses the `NEW` pseudo-value to return the replaced +documents, without some of their system attributes: + +```aql +FOR u IN users + REPLACE u WITH { value: "test" } IN users + LET replaced = NEW + RETURN UNSET(replaced, "_key", "_id", "_rev") +``` + +## Transactionality + +On a single server, replace operations are executed transactionally in an +all-or-nothing fashion. + +A query may execute intermediate transaction commits in case the running +transaction (AQL query) hits the specified size thresholds. In this case, the +query's operations carried out so far are committed and not rolled back in case +of a later abort/rollback. This behavior can be controlled by adjusting the +intermediate commit settings for the RocksDB engine. See +[Known limitations for AQL queries](../fundamentals/limitations.md#storage-engine-properties). + +For sharded collections, the entire query and/or replace operation may not be +transactional, especially if it involves different shards and/or DB-Servers. diff --git a/site/content/arangodb/oem/aql/high-level-operations/return.md b/site/content/arangodb/oem/aql/high-level-operations/return.md new file mode 100644 index 0000000000..c4344e0865 --- /dev/null +++ b/site/content/arangodb/oem/aql/high-level-operations/return.md @@ -0,0 +1,212 @@ +--- +title: '`RETURN` operation in AQL' +menuTitle: RETURN +weight: 10 +description: >- + You can use the `RETURN` operation to produce the result of a query +--- +A `RETURN` operation is mandatory at the end of each block in a data access query, +otherwise the query result would be undefined. Using `RETURN` at the top level +in data modification queries is optional. + +## Syntax + +The general syntax for `RETURN` is: + +
RETURN expression
+ +There is also a variant [`RETURN DISTINCT`](#return-distinct). + +The *expression* returned by `RETURN` is produced for each iteration in the block the +`RETURN` statement is placed in. That means the result of a `RETURN` statement +is **always an array**. This includes an empty array if no documents matched the +query and a single return value returned as array with one element. + +To return all elements from the currently iterated array without modification, +the following simple form can be used: + +
FOR variableName IN expression
+  RETURN variableName
+ +As `RETURN` allows specifying an expression, arbitrary computations can be +performed to calculate the result elements. Any of the variables valid in the +scope the `RETURN` is placed in can be used for the computations. + +## Usage + +To iterate over all documents of a collection called *users* and return the +full documents, you can write: + +```aql +FOR u IN users + RETURN u +``` + +In each iteration of the for-loop, a document of the *users* collection is +assigned to a variable *u* and returned unmodified in this example. To return +only one attribute of each document, you could use a different return expression: + +```aql +FOR u IN users + RETURN u.name +``` + +Or to return multiple attributes, an object can be constructed like this: + +```aql +FOR u IN users + RETURN { name: u.name, age: u.age } +``` + +Note: `RETURN` will close the current scope and eliminate all local variables in it. +This is important to remember when working with [subqueries](../fundamentals/subqueries.md). + +[Dynamic attribute names](../fundamentals/data-types.md#objects--documents) are +supported as well: + +```aql +FOR u IN users + RETURN { [ u._id ]: u.age } +``` + +The document *_id* of every user is used as expression to compute the +attribute key in this example: + +```json +[ + { + "users/9883": 32 + }, + { + "users/9915": 27 + }, + { + "users/10074": 69 + } +] +``` + +The result contains one object per user with a single key/value pair each. +This is usually not desired. For a single object, that maps user IDs to ages, +the individual results need to be merged and returned with another `RETURN`: + +```aql +RETURN MERGE( + FOR u IN users + RETURN { [ u._id ]: u.age } +) +``` + +```json +[ + { + "users/10074": 69, + "users/9883": 32, + "users/9915": 27 + } +] +``` + +Keep in mind that if the key expression evaluates to the same value multiple +times, only one of the key/value pairs with the duplicate name will survive +[`MERGE()`](../functions/document-object.md#merge). To avoid this, you can go without +dynamic attribute names, use static names instead and return all document +properties as attribute values: + +```aql +FOR u IN users + RETURN { name: u.name, age: u.age } +``` + +```json +[ + { + "name": "John Smith", + "age": 32 + }, + { + "name": "James Hendrix", + "age": 69 + }, + { + "name": "Katie Foster", + "age": 27 + } +] +``` + +## `RETURN DISTINCT` + +`RETURN` can optionally be followed by the `DISTINCT` keyword. +The `DISTINCT` keyword will ensure uniqueness of the values returned by the +`RETURN` statement: + +
FOR variableName IN expression
+  RETURN DISTINCT expression
+
+`RETURN DISTINCT` is not allowed on the top-level of a query if there is no `FOR`
+loop preceding it.
+
+The following example returns `["foo", "bar", "baz"]`:
+
+```aql
+FOR value IN ["foo", "bar", "bar", "baz", "foo"]
+  RETURN DISTINCT value
+```
+
+{{< tip >}}
+`RETURN DISTINCT` will not change the order of the results it is applied on,
+unlike [`COLLECT`](collect.md#collect-vs-return-distinct).
+{{< /tip >}}
+
+If `DISTINCT` is applied on an expression that is itself an array or a subquery,
+it does not make the values in each array or subquery result unique, but instead
+ensures that the result contains only distinct arrays or subquery results. To make
+the result of an array or a subquery unique, apply the `DISTINCT` inside the
+array expression or the subquery.
+
+For example, the following query applies `DISTINCT` to its subquery results,
+but not inside the subquery:
+
+```aql
+FOR what IN 1..2
+  RETURN DISTINCT (
+    FOR i IN [ 1, 2, 3, 4, 1, 3 ]
+      RETURN i
+  )
+```
+
+Here, a `FOR` loop with two iterations executes a subquery in each iteration. The
+`DISTINCT` is applied to the two subquery results. Both subqueries return the
+same result value (that is, `[ 1, 2, 3, 4, 1, 3 ]`), so after `DISTINCT` there is
+only one occurrence of the value `[ 1, 2, 3, 4, 1, 3 ]` left:
+
+```json
+[
+  [ 1, 2, 3, 4, 1, 3 ]
+]
+```
+
+If the goal is to apply the `DISTINCT` inside the subquery, it needs to be moved
+there:
+
+```aql
+FOR what IN 1..2
+  LET sub = (
+    FOR i IN [ 1, 2, 3, 4, 1, 3 ]
+      RETURN DISTINCT i
+  )
+  RETURN sub
+```
+
+In the above case, the `DISTINCT` makes the subquery results unique, so that
+each subquery returns a unique array of values (`[ 1, 2, 3, 4 ]`). As the subquery
+is executed twice and there is no `DISTINCT` on the top-level, that array is
+returned twice:
+
+```json
+[
+  [ 1, 2, 3, 4 ],
+  [ 1, 2, 3, 4 ]
+]
+```
diff --git a/site/content/arangodb/oem/aql/high-level-operations/search.md b/site/content/arangodb/oem/aql/high-level-operations/search.md
new file mode 100644
index 0000000000..c0a3084152
--- /dev/null
+++ b/site/content/arangodb/oem/aql/high-level-operations/search.md
@@ -0,0 +1,337 @@
+---
+title: '`SEARCH` operation in AQL'
+menuTitle: SEARCH
+weight: 20
+description: >-
+  The `SEARCH` operation lets you filter Views, accelerated by the underlying
+  indexes
+---
+The `SEARCH` operation guarantees that View indexes are used for an efficient
+execution plan. If you use the `FILTER` keyword for Views, no indexes are
+utilized and the filtering is performed as a post-processing step.
+
+Conceptually, a View is just another document data source,
+similar to an array or a document/edge collection, over which you can iterate
+using a [`FOR` operation](for.md) in AQL:
+
+```aql
+FOR doc IN viewName
+  RETURN doc
+```
+
+The optional `SEARCH` operation provides the capabilities to:
+
+- filter documents based on AQL Boolean expressions and functions
+- match documents located in different collections backed by a fast index
+- sort the result set based on how closely each document matched the
+  search conditions
+
+See [`arangosearch` Views](../../index-and-search/arangosearch/arangosearch-views-reference.md) and
+[`search-alias` Views](../../index-and-search/arangosearch/search-alias-views-reference.md) for how to set up Views.
+
+## Syntax
+
+The `SEARCH` keyword is followed by an ArangoSearch filter expression, which
+is mostly comprised of calls to ArangoSearch AQL functions.
+
FOR doc IN viewName
+  SEARCH expression
+  OPTIONS { … }
+  ...
+ +## Usage + +The `SEARCH` statement, in contrast to `FILTER`, is treated as a part of the +`FOR` operation, not as an individual statement. It cannot be placed freely +in a query nor multiple times in the body of a `FOR` loop. `FOR ... IN` must be +followed by the name of a View, not a collection. The `SEARCH` operation has to +follow next, other operations before `SEARCH` such as `FILTER`, `COLLECT` etc. +are not allowed in this position. Subsequent operations are possible after +`SEARCH` and the expression however, including `SORT` to order the search +results based on a ranking value computed by the View. + +*expression* must be an ArangoSearch expression. The full power of ArangoSearch +is harnessed and exposed via special [ArangoSearch functions](../functions/arangosearch.md), +during both the search and sort stages. On top of that, common AQL operators +are supported. + +Note that inline expressions and a few other things are not supported by +`SEARCH`. The server will raise a query error in case of an invalid expression. + +The `OPTIONS` keyword and an object can optionally follow the search expression +to set [Search Options](#search-options). + +### Logical operators + +Logical or Boolean operators allow you to combine multiple search conditions. + +- `AND`, `&&` (conjunction) +- `OR`, `||` (disjunction) +- `NOT`, `!` (negation / inversion) + +[Operator precedence](../operators.md#operator-precedence) needs to be taken +into account and can be controlled with parentheses. + +Consider the following contrived expression: + +`doc.value < 0 OR doc.value > 5 AND doc.value IN [-10, 10]` + +`AND` has a higher precedence than `OR`. The expression is equivalent to: + +`doc.value < 0 OR (doc.value > 5 AND doc.value IN [-10, 10])` + +The conditions are thus: +- values less than 0 +- values greater than 5, but only if it is 10 + (or -10, but this can never be fulfilled) + +Parentheses can be used as follows to apply the `AND` condition to both of the +`OR` conditions: + +`(doc.value < 0 OR doc.value > 5) AND doc.value IN [-10, 10]` + +The conditions are now: +- values less than 0, but only if it is -10 +- values greater than 5, but only if it is 10 + +### Comparison operators + +- `==` (equal) +- `<=` (less than or equal) +- `>=` (greater than or equal) +- `<` (less than) +- `>` (greater than) +- `!=` (unequal) +- `IN` (contained in array or range), also `NOT IN` +- `LIKE` (equal with wildcards), also `NOT LIKE` + +Also see the [`IN_RANGE()` function](../functions/arangosearch.md#in_range) for +an alternative to a combination of `<`, `<=`, `>`, `>=` operators for range +searches. + +```aql +FOR doc IN viewName + SEARCH ANALYZER(doc.text == "quick" OR doc.text == "brown", "text_en") + // -- or -- + SEARCH ANALYZER(doc.text IN ["quick", "brown"], "text_en") + RETURN doc +``` + +{{< warning >}} +The alphabetical order of characters is not taken into account by ArangoSearch, +i.e. range queries in SEARCH operations against Views will not follow the +language rules as per the defined Analyzer locale (except for the +[`collation` Analyzer](../../index-and-search/analyzers.md#collation)) nor the server language +(startup option `--default-language`)! +Also see [Known Issues](../../release-notes/version-3.11/known-issues-in-3-11.md#arangosearch). 
+{{< /warning >}} + +### Array comparison operators + +[Array comparison operators](../operators.md#array-comparison-operators) are +supported: + +```aql +LET tokens = TOKENS("some input", "text_en") // ["some", "input"] +FOR doc IN myView SEARCH tokens ALL IN doc.text RETURN doc // dynamic conjunction +FOR doc IN myView SEARCH tokens ANY IN doc.text RETURN doc // dynamic disjunction +FOR doc IN myView SEARCH tokens NONE IN doc.text RETURN doc // dynamic negation +FOR doc IN myView SEARCH tokens ALL > doc.text RETURN doc // dynamic conjunction with comparison +FOR doc IN myView SEARCH tokens ANY <= doc.text RETURN doc // dynamic disjunction with comparison +FOR doc IN myView SEARCH tokens NONE < doc.text RETURN doc // dynamic negation with comparison +FOR doc IN myView SEARCH tokens AT LEAST (1+1) IN doc.text RETURN doc // dynamically test for a subset of elements +``` + +The following operators are equivalent in `SEARCH` expressions: +- `ALL IN`, `ALL ==`, `NONE !=`, `NONE NOT IN` +- `ANY IN`, `ANY ==` +- `NONE IN`, `NONE ==`, `ALL !=`, `ALL NOT IN` +- `ALL >`, `NONE <=` +- `ALL >=`, `NONE <` +- `ALL <`, `NONE >=` +- `ALL <=`, `NONE >` +- `AT LEAST (...) IN`, `AT LEAST (...) ==` +- `AT LEAST (1) IN`, `ANY IN` + +The stored attribute referenced on the right side of the operator is like a +single, primitive value. In case of multiple tokens, it is like having multiple +such values as opposed to an array of values, even if the actual document +attribute is an array. `IN` and `==` as part of array comparison operators are +treated the same in `SEARCH` expressions for ease of use. The behavior is +different outside of `SEARCH`, where `IN` needs to be followed by an array. + +### Question mark operator + +You can use the [Question mark operator](../operators.md#question-mark-operator) +to perform [Nested searches with ArangoSearch](../../index-and-search/arangosearch/nested-search.md) +(Enterprise Edition only): + +```aql +FOR doc IN myView + SEARCH doc.dimensions[? FILTER CURRENT.type == "height" AND CURRENT.value > 40] + RETURN doc +``` + +It allows you to match nested objects in arrays that satisfy multiple conditions +each, and optionally define how often these conditions should be fulfilled for +the entire array. You need to configure the View specifically for this type of +search using the `nested` property in [`arangosearch` Views](../../index-and-search/arangosearch/arangosearch-views-reference.md#link-properties) +or in the definition of [Inverted Indexes](../../index-and-search/indexing/working-with-indexes/inverted-indexes.md#nested-search-enterprise-edition) +that you can add to [`search-alias` Views](../../index-and-search/arangosearch/search-alias-views-reference.md). + +## Handling of non-indexed fields + +Document attributes which are not configured to be indexed by a View are +treated by `SEARCH` as non-existent. This affects tests against the documents +emitted from the View only. 
+ +For example, given a collection `myCol` with the following documents: + +```js +{ "someAttr": "One", "anotherAttr": "One" } +{ "someAttr": "Two", "anotherAttr": "Two" } +``` + +… with an `arangosearch` View where `someAttr` is indexed by the following View `myView`: + +```js +{ + "type": "arangosearch", + "links": { + "myCol": { + "fields": { + "someAttr": {} + } + } + } +} +``` + +… a search on `someAttr` yields the following result: + +```aql +FOR doc IN myView + SEARCH doc.someAttr == "One" + RETURN doc +``` + +```json +[ { "someAttr": "One", "anotherAttr": "One" } ] +``` + +A search on `anotherAttr` yields an empty result because only `someAttr` +is indexed by the View: + +```aql +FOR doc IN myView + SEARCH doc.anotherAttr == "One" + RETURN doc +``` + +```json +[] +``` + +You can use the special `includeAllFields` +[`arangosearch` View property](../../index-and-search/arangosearch/arangosearch-views-reference.md#link-properties) +to index all (sub-)attributes of the source documents if desired. + +## `SEARCH` with `SORT` + +The documents emitted from a View can be sorted by attribute values with the +standard [`SORT()` operation](sort.md), using one or multiple +attributes, in ascending or descending order (or a mix thereof). + +```aql +FOR doc IN viewName + SORT doc.text, doc.value DESC + RETURN doc +``` + +If the (left-most) fields and their sorting directions match up with the +[primary sort order](../../index-and-search/arangosearch/performance.md#primary-sort-order) definition +of the View then the `SORT` operation is optimized away. + +Apart from simple sorting, it is possible to sort the matched View documents by +relevance score (or a combination of score and attribute values if desired). +The document search via the `SEARCH` keyword and the sorting via the +[ArangoSearch Scoring Functions](../functions/arangosearch.md#scoring-functions), +namely `BM25()` and `TFIDF()`, are closely intertwined. +The query given in the `SEARCH` expression is not only used to filter documents, +but also is used with the scoring functions to decide which document matches +the query best. Other documents in the View also affect this decision. + +Therefore the ArangoSearch scoring functions can work _only_ on documents +emitted from a View, as both the corresponding `SEARCH` expression and the View +itself are consulted in order to sort the results. + +```aql +FOR doc IN viewName + SEARCH ... + SORT BM25(doc) DESC + RETURN doc +``` + +The [`BOOST()` function](../functions/arangosearch.md#boost) can be used to +fine-tune the resulting ranking by weighing sub-expressions in `SEARCH` +differently. + +If there is no `SEARCH` operation prior to calls to scoring functions or if +the search expression does not filter out documents (e.g. `SEARCH true`) then +a score of `0` will be returned for all documents. + +## Search Options + +The `SEARCH` operation supports an optional `OPTIONS` clause to modify the +behavior. The general syntax is as follows: + +
SEARCH expression OPTIONS { option: value, ... }
+ +### `collections` + +You can specify an array of strings with collection names to restrict the search +to certain source collections. + +Given a View with three linked collections `coll1`, `coll2`, and `coll3`, you +can return documents from the first two collections only and ignore the third +collection by setting the `collections` option to `["coll1", "coll2"]`: + +```aql +FOR doc IN viewName + SEARCH true OPTIONS { collections: ["coll1", "coll2"] } + RETURN doc +``` + +The search expression `true` in the above example matches all View documents. +You can use any valid expression here while limiting the scope to the chosen +source collections. + +### `conditionOptimization` + +You can specify one of the following values for this option to control how +search criteria get optimized: + +- `"auto"` (default): convert conditions to disjunctive normal form (DNF) and + apply optimizations. Removes redundant or overlapping conditions, but can + take quite some time even for a low number of nested conditions. +- `"none"`: search the index without optimizing the conditions. + + +See [Optimizing View and inverted index query performance](../../index-and-search/arangosearch/performance.md#condition-optimization-options) +for an example. + +### `countApproximate` + +This option controls how the total count of rows is calculated if the `fullCount` +option is enabled for a query or when a `COLLECT WITH COUNT` clause is executed. +You can set it to one of the following values: + +- `"exact"` (default): rows are actually enumerated for a precise count. +- `"cost"`: a cost-based approximation is used. Does not enumerate rows and + returns an approximate result with O(1) complexity. Gives a precise result + if the `SEARCH` condition is empty or if it contains a single term query + only (e.g. `SEARCH doc.field == "value"`), the usual eventual consistency + of Views aside. + +See [Optimizing View and inverted index query performance](../../index-and-search/arangosearch/performance.md#count-approximation) +for an example. diff --git a/site/content/arangodb/oem/aql/high-level-operations/sort.md b/site/content/arangodb/oem/aql/high-level-operations/sort.md new file mode 100644 index 0000000000..2c99b2e8ef --- /dev/null +++ b/site/content/arangodb/oem/aql/high-level-operations/sort.md @@ -0,0 +1,109 @@ +--- +title: '`SORT` operation in AQL' +menuTitle: SORT +weight: 25 +description: >- + The `SORT` operation allows you to specify one or multiple sort criteria and + directions to control the order of query results or the elements of arrays +--- +## Syntax + +The general syntax is: + +
SORT expression direction
+
+## Usage
+
+The `SORT` operation sorts the already produced intermediate results of the
+current block. For example, the following query sorts by `lastName`
+(in ascending order), then `firstName` (in ascending order), then by `id`
+(in descending order):
+
+```aql
+FOR u IN users
+  SORT u.lastName, u.firstName, u.id DESC
+  RETURN u
+```
+
+Specifying the *direction* is optional. The default (implicit) direction for a
+sort expression is the ascending order. To explicitly specify the sort direction,
+the keywords `ASC` (ascending) and `DESC` (descending) can be used. Multiple sort
+criteria can be separated using commas. In this case, the direction is specified
+for each expression separately.
+
+The following example first sorts documents by `lastName` in ascending order and
+then by `firstName` in ascending order:
+
+```aql
+SORT doc.lastName, doc.firstName
+```
+
+The following example first sorts documents by `lastName` in descending order
+and then by `firstName` in ascending order:
+
+```aql
+SORT doc.lastName DESC, doc.firstName
+```
+
+The following example first sorts documents by `lastName` in ascending order
+and then by `firstName` in descending order:
+
+```aql
+SORT doc.lastName, doc.firstName DESC
+```
+
+{{< warning >}}
+When iterating over a collection, the order of documents is always
+**undefined unless an explicit sort order is defined** with a `SORT` operation.
+
+If the values you sort by are not unique, the order among tied documents is
+undefined and you may want to sort by another attribute to break ties.
+If the application has a preferred attribute that indicates the order of
+documents with the same value, then use this attribute. If there is no such
+attribute, you can still achieve a stable sort by using the `_id` system attribute
+as it is unique and present in every document.

+```aql
+FOR u IN users
+  SORT u.firstName, u._id // break name ties with the document ID
+  RETURN u
+```
+{{< /warning >}}
+
+Constant `SORT` expressions can be used to indicate that no particular
+sort order is desired:
+
+```aql
+SORT null
+```
+
+Constant `SORT` expressions are optimized away by the AQL
+optimizer during optimization, but specifying them explicitly may enable further
+optimizations if the optimizer does not need to take into account any particular
+sort order. This is especially the case after a `COLLECT` statement, which is
+supposed to produce a sorted result. Specifying an extra `SORT null` after the
+`COLLECT` statement allows the AQL optimizer to remove the post-sorting of the
+collect results altogether. Also see the [`COLLECT` option `method`](collect.md#method).
+
+In a sequence of `SORT` operations, only the last one takes effect, unless an
+earlier `SORT` expression is more accurate.
+If the optimization rules `remove-redundant-sorts` and `remove-redundant-sorts-2`
+are deactivated in the query's execution, then the last `SORT` is always the one
+that wins, regardless of accuracy. For example, consider the following query with
+multiple consecutive `SORT` operations:
+
+```aql
+FOR friend IN friends
+  SORT friend.friend.name, friend.id, friend.age
+  SORT friend.age, friend.id
+  SORT friend.age
+  RETURN friend
+```
+
+If the optimization rules mentioned above are deactivated, then the last `SORT`
+becomes operative and the collection is sorted by `friend.age`.
If the +optimization rules are active, then the second `SORT` becomes operative because +it covers the same `friend.age` attribute and additionally sorts by another +attribute in case of ties, making it more accurate. However, if the attributes +in the second `SORT` expression are in opposite order, as in +`SORT friend.id, friend.age`, then the last `SORT` is operative. diff --git a/site/content/arangodb/oem/aql/high-level-operations/update.md b/site/content/arangodb/oem/aql/high-level-operations/update.md new file mode 100644 index 0000000000..0a7ede0857 --- /dev/null +++ b/site/content/arangodb/oem/aql/high-level-operations/update.md @@ -0,0 +1,429 @@ +--- +title: '`UPDATE` operation in AQL' +menuTitle: UPDATE +weight: 55 +description: >- + The `UPDATE` operation partially modifies a document with the given attributes, + by adding new and updating existing attributes +--- +Each `UPDATE` operation is restricted to a single collection, and the +[collection name](../../concepts/data-structure/collections.md#collection-names) must not be dynamic. +Only a single `UPDATE` statement per collection is allowed per AQL query, and +it cannot be followed by read or write operations that access the same collection, +by traversal operations, or AQL functions that can read documents. + +You cannot update the `_id`, `_key`, and `_rev` system attributes, but you can +update the `_from` and `_to` attributes. + +Updating a document modifies the document's revision number (`_rev` attribute) +with a server-generated value. + +## Syntax + +The two syntaxes for an update operation are: + +
UPDATE document IN collection
+UPDATE keyExpression WITH document IN collection
+
+Both variants can optionally end with an `OPTIONS { … }` clause.
+
+`collection` must contain the name of the collection in which the document
+should be updated.
+
+`document` must be an object and contain the attributes and values to update.
+**Attributes that don't yet exist** in the stored document **are added** to it.
+**Existing attributes are set to the provided attribute values** (excluding the
+immutable `_id` and `_key` attributes and the system-managed `_rev` attribute).
+The operation leaves other existing attributes not specified in `document` untouched.
+This distinguishes the `UPDATE` from the `REPLACE` operation, which affects all
+attributes of the stored document and not only the attributes you specify in the
+operation.
+
+Sub-attributes are recursively merged by default, but you can let top-level
+attributes replace existing ones by disabling the [`mergeObjects` option](#mergeobjects).
+
+### `UPDATE <document> IN <collection>`
+
+Using the first syntax, the `document` object must have a `_key` attribute with
+the document key. The existing document with this key is updated with the
+attributes provided by the `document` object (except for the `_id`, `_key`, and
+`_rev` system attributes).
+
+The following query adds or updates the `name` attribute of the document
+identified by the key `my_key` in the `users` collection. The key is passed via
+the `_key` attribute alongside other attributes:
+
+```aql
+UPDATE { _key: "my_key", name: "Jon" } IN users
+```
+
+The following query is invalid because the object does not contain a `_key`
+attribute and thus it is not possible to determine the document to
+be updated:
+
+```aql
+UPDATE { name: "Jon" } IN users
+```
+
+You can combine the `UPDATE` operation with a `FOR` loop to determine the
+necessary key attributes, as shown below:
+
+```aql
+FOR u IN users
+  UPDATE { _key: u._key, name: CONCAT(u.firstName, " ", u.lastName) } IN users
+```
+
+Note that the `UPDATE` and `FOR` operations are independent of each other and
+`u` does not automatically define a document for the `UPDATE` statement.
+Thus, the following query is invalid:
+
+```aql
+FOR u IN users
+  UPDATE { name: CONCAT(u.firstName, " ", u.lastName) } IN users
+```
+
+### `UPDATE <keyExpression> WITH <document> IN <collection>`
+
+Using the second syntax, the document to update is defined by the
+`keyExpression`. It can either be a string with the document key, an object
+which contains a `_key` attribute with the document key, or an expression that
+evaluates to either of these two. The existing document with this key is
+updated with the attributes provided by the `document` object (except for
+the `_id`, `_key`, and `_rev` system attributes).
+
+The following query adds or updates the `name` attribute of the document
+identified by the key `my_key` in the `users` collection. The key is passed as
+a string in the `keyExpression`. The attributes to add or update are passed
+separately as the `document` object:
+
+```aql
+UPDATE "my_key" WITH { name: "Jon" } IN users
+```
+
+The `document` object may contain a `_key` attribute, but it is ignored.
+
+You cannot define the document to update using an `_id` attribute, nor pass a
+document identifier as a string (like `"users/john"`). However, you can use
+`PARSE_IDENTIFIER(<id>).key` as `keyExpression` to get the document key as a
+string:
+
+```aql
+LET key = PARSE_IDENTIFIER("users/john").key
+UPDATE key WITH { ...
} IN users +``` + +### Comparison of the syntaxes + +Both syntaxes of the `UPDATE` operation allow you to define the document to +modify and the attributes to add or update. The document to update is effectively +identified by a document key in combination with the specified collection. + +The `UPDATE` operation supports different ways of specifying the document key. +You can choose the syntax variant that is the most convenient for you. + +The following queries are equivalent: + +```aql +FOR u IN users + UPDATE u WITH { name: CONCAT(u.firstName, " ", u.lastName) } IN users +``` + +```aql +FOR u IN users + UPDATE u._key WITH { name: CONCAT(u.firstName, " ", u.lastName) } IN users +``` + +```aql +FOR u IN users + UPDATE { _key: u._key } WITH { name: CONCAT(u.firstName, " ", u.lastName) } IN users +``` + +```aql +FOR u IN users + UPDATE { _key: u._key, name: CONCAT(u.firstName, " ", u.lastName) } IN users +``` + +## Dynamic key expressions + +An `UPDATE` operation may update arbitrary documents, using either of the two +syntaxes: + +```aql +FOR i IN 1..1000 + UPDATE { _key: CONCAT("test", i), name: "Paula" } IN users +``` + +```aql +FOR i IN 1..1000 + UPDATE CONCAT("test", i) WITH { name: "Paula" } IN users +``` + +## Target a different collection + +The documents an `UPDATE` operation modifies can be in a different collection +than the ones produced by a preceding `FOR` operation: + +```aql +FOR u IN users + FILTER u.active == false + UPDATE u WITH { status: "inactive" } IN backup +``` + +Note how documents are read from the `users` collection but updated in another +collection called `backup`. Both collections need to use matching document keys +for this to work. + +Although the `u` variable holds a whole document, it is only used to define the +target document. The `_key` attribute of the object is extracted and the target +document is solely defined by the document key string value and the specified +collection of the `UPDATE` operation (`backup`). There is no link to the +original collection (`users`). + +## Using the current value of a document attribute + +The pseudo-variable `OLD` is not supported inside of `WITH` clauses (it is +available after `UPDATE`). To access the current attribute value, you can +usually refer to a document via the variable of the `FOR` loop, which is used +to iterate over a collection: + +```aql +FOR doc IN users + UPDATE doc WITH { + fullName: CONCAT(doc.firstName, " ", doc.lastName) + } IN users +``` + +If there is no loop, because a single document is updated only, then there +might not be a variable like above (`doc`), which would let you refer to the +document which is being updated: + +```aql +UPDATE "john" WITH { ... } IN users +``` + +To access the current value in this situation, you need to retrieve the document +first and store it in a variable: + +```aql +LET doc = FIRST(FOR u IN users FILTER u._key == "john" RETURN u) +UPDATE doc WITH { + fullName: CONCAT(doc.firstName, " ", doc.lastName) +} IN users +``` + +You can modify an existing attribute based on its current value this way, +to increment a counter for instance: + +```aql +UPDATE doc WITH { + karma: doc.karma + 1 +} IN users +``` + +If the attribute `karma` doesn't exist yet, `doc.karma` evaluates to `null`. +The expression `null + 1` results in the new attribute `karma` being set to `1`. +If the attribute does exist, then it is increased by `1`. 
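+
+If you prefer the fallback value to be explicit instead of relying on the
+`null + 1` arithmetic, one option is the `NOT_NULL()` function, which returns
+its first non-null argument:
+
+```aql
+UPDATE doc WITH {
+  karma: NOT_NULL(doc.karma, 0) + 1 // start from 0 if the attribute is missing
+} IN users
+```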
+ +Arrays can be mutated, too: + +```aql +UPDATE doc WITH { + hobbies: PUSH(doc.hobbies, "swimming") +} IN users +``` + +If the attribute `hobbies` doesn't exist yet, it is conveniently initialized +as `[ "swimming" ]` and otherwise extended. + +## Query options + +You can optionally set query options for the `UPDATE` operation: + +```aql +UPDATE ... IN users OPTIONS { ... } +``` + +### `ignoreErrors` + +You can use `ignoreErrors` to suppress query errors that may occur when trying to +update non-existing documents or when violating unique key constraints: + +```aql +FOR i IN 1..1000 + UPDATE CONCAT("test", i) + WITH { foobar: true } IN users + OPTIONS { ignoreErrors: true } +``` + +You cannot modify the `_id`, `_key`, and `_rev` system attributes, but attempts +to change them are ignored and not considered errors. + +### `keepNull` + +When updating an attribute to the `null` value, ArangoDB does not remove the +attribute from the document but stores this `null` value. To remove attributes +in an update operation, set them to `null` and set the `keepNull` option to +`false`. This removes the attributes you specify but not any previously stored +attributes with the `null` value: + +```aql +FOR u IN users + UPDATE u WITH { foobar: true, notNeeded: null } IN users + OPTIONS { keepNull: false } +``` + +The above query removes the `notNeeded` attribute from the documents and updates +the `foobar` attribute normally. + +Only top-level attributes and sub-attributes can be removed this way +(e.g. `{ attr: { sub: null } }`) but not attributes of objects that are nested +inside of arrays (e.g. `{ attr: [ { nested: null } ] }`). + +### `mergeObjects` + +The option `mergeObjects` controls whether object contents are +merged if an object attribute is present in both the `UPDATE` query and in the +to-be-updated document. + +The following query sets the updated document's `name` attribute to the exact +same value that is specified in the query. This is due to the `mergeObjects` option +being set to `false`: + +```aql +FOR u IN users + UPDATE u WITH { + name: { first: "foo", middle: "b.", last: "baz" } + } IN users + OPTIONS { mergeObjects: false } +``` + +Contrary, the following query merges the contents of the `name` attribute in the +original document with the value specified in the query: + +```aql +FOR u IN users + UPDATE u WITH { + name: { first: "foo", middle: "b.", last: "baz" } + } IN users + OPTIONS { mergeObjects: true } +``` + +Attributes in `name` that are present in the to-be-updated document but not in the +query are preserved. Attributes that are present in both are overwritten +with the values specified in the query. + +Note: the default value for `mergeObjects` is `true`, so there is no need to specify it +explicitly. + +### `waitForSync` + +To make sure data are durable when an update query returns, there is the `waitForSync` +query option: + +```aql +FOR u IN users + UPDATE u WITH { foobar: true } IN users + OPTIONS { waitForSync: true } +``` + +### `ignoreRevs` + +In order to not accidentally overwrite documents that have been modified since you last fetched +them, you can use the option `ignoreRevs` to either let ArangoDB compare the `_rev` value and +only succeed if they still match, or let ArangoDB ignore them (default): + +```aql +FOR i IN 1..1000 + UPDATE { _key: CONCAT("test", i), _rev: "1287623" } + WITH { foobar: true } IN users + OPTIONS { ignoreRevs: false } +``` + +### `exclusive` + +The RocksDB engine does not require collection-level locks. 
Different write +operations on the same collection do not block each other, as +long as there are no _write-write conflicts_ on the same documents. From an application +development perspective it can be desired to have exclusive write access on collections, +to simplify the development. Note that writes do not block reads in RocksDB. +Exclusive access can also speed up modification queries, because we avoid conflict checks. + +Use the `exclusive` option to achieve this effect on a per query basis: + +```aql +FOR doc IN collection + UPDATE doc + WITH { updated: true } IN collection + OPTIONS { exclusive: true } +``` + +### `refillIndexCaches` + +Whether to update existing entries in in-memory index caches if document updates +affect the edge index or cache-enabled persistent indexes. + +```aql +UPDATE { _key: "123", _from: "vert/C", _to: "vert/D" } IN edgeColl + OPTIONS { refillIndexCaches: true } +``` + +## Returning the modified documents + +You can optionally return the documents modified by the query. In this case, the `UPDATE` +operation needs to be followed by a `RETURN` operation. Intermediate `LET` operations are +allowed, too. These operations can refer to the pseudo-variables `OLD` and `NEW`. +The `OLD` pseudo-variable refers to the document revisions before the update, and `NEW` +refers to the document revisions after the update. + +Both `OLD` and `NEW` contain all document attributes, even those not specified +in the update expression. + +```aql +UPDATE document IN collection options RETURN OLD +UPDATE document IN collection options RETURN NEW +UPDATE keyExpression WITH document IN collection options RETURN OLD +UPDATE keyExpression WITH document IN collection options RETURN NEW +``` + +Following is an example using a variable named `previous` to capture the original +documents before modification. For each modified document, the document key is returned. + +```aql +FOR u IN users + UPDATE u WITH { value: "test" } IN users + LET previous = OLD + RETURN previous._key +``` + +The following query uses the `NEW` pseudo-value to return the updated documents, +without some of the system attributes: + +```aql +FOR u IN users + UPDATE u WITH { value: "test" } IN users + LET updated = NEW + RETURN UNSET(updated, "_key", "_id", "_rev") +``` + +It is also possible to return both `OLD` and `NEW`: + +```aql +FOR u IN users + UPDATE u WITH { value: "test" } IN users + RETURN { before: OLD, after: NEW } +``` + +## Transactionality + +On a single server, updates are executed transactionally in an all-or-nothing +fashion. + +A query may execute intermediate transaction commits in case the running +transaction (AQL query) hits the specified size thresholds. In this case, the +query's operations carried out so far are committed and not rolled back in case +of a later abort/rollback. This behavior can be controlled by adjusting the +intermediate commit settings for the RocksDB engine. See +[Known limitations for AQL queries](../fundamentals/limitations.md#storage-engine-properties). + +For sharded collections, the entire query and/or update operation may not be +transactional, especially if it involves different shards and/or DB-Servers. 
diff --git a/site/content/arangodb/oem/aql/high-level-operations/upsert.md b/site/content/arangodb/oem/aql/high-level-operations/upsert.md new file mode 100644 index 0000000000..a4c705a249 --- /dev/null +++ b/site/content/arangodb/oem/aql/high-level-operations/upsert.md @@ -0,0 +1,276 @@ +--- +title: '`UPSERT` operation in AQL' +menuTitle: UPSERT +weight: 70 +description: >- + An `UPSERT` operation either modifies an existing document, or creates a new + document if it does not exist +--- +`UPSERT` looks up a single document that matches the provided example. +If there is no match, an insert operation is executed to create a +document. If a document is found, you can either update or replace the document. +These subtypes are called **upsert** (update or insert) and **repsert** +(replace or insert). + +Each `UPSERT` operation is restricted to a single collection, and the +[collection name](../../concepts/data-structure/collections.md#collection-names) must not be dynamic. +Only a single `UPSERT` statement per collection is allowed per AQL query, and +it cannot be followed by read or write operations that access the same collection, by +traversal operations, or AQL functions that can read documents. + +## Syntax + +The syntax for an upsert operation: + +
UPSERT searchExpression
+INSERT insertExpression
+UPDATE updateExpression
+IN collection
+ +The syntax for a repsert operation: + +
UPSERT searchExpression
+INSERT insertExpression
+REPLACE updateExpression
+IN collection
+
+Both variants can optionally end with an `OPTIONS { … }` clause.
+
+When using the `UPDATE` variant of the `UPSERT` operation, the found document
+is partially updated, meaning only the attributes specified in
+*updateExpression* are updated or added. When using the `REPLACE` variant
+of `UPSERT` (repsert), the found document is replaced with the content of
+*updateExpression*.
+
+Updating a document modifies the document's revision number with a server-generated value.
+The system attributes `_id`, `_key`, and `_rev` cannot be updated, but `_from` and `_to`
+can be modified.
+
+The *searchExpression* contains the document to be looked for. It must be an
+**object literal** (`UPSERT { <attributeName>: <attributeValue>, ... } ...`)
+without dynamic attribute names. In case no such document can be found in
+*collection*, a new document is inserted into the collection as specified in
+the *insertExpression*.
+
+In case at least one document in *collection* matches the *searchExpression*, it is
+updated using the *updateExpression*. When more than one document in the collection
+matches the *searchExpression*, it is undefined which of the matching documents is
+updated. It is therefore often sensible to make sure by other means (such as unique
+indexes, application logic, etc.) that at most one document matches *searchExpression*.
+
+The following query looks for a document in the `users` collection with a specific
+`name` attribute value. If the document exists, its *logins* attribute is increased
+by one. If it does not exist, a new document is inserted, consisting of the
+attributes `name`, `logins`, and `dateCreated`:
+
+```aql
+UPSERT { name: 'superuser' }
+INSERT { name: 'superuser', logins: 1, dateCreated: DATE_NOW() }
+UPDATE { logins: OLD.logins + 1 } IN users
+```
+
+Note that in the `UPDATE` case it is possible to refer to the previous version of the
+document using the `OLD` pseudo-value.
+
+## Query options
+
+### `ignoreErrors`
+
+The `ignoreErrors` option can be used to suppress query errors that may occur
+when trying to violate unique key constraints.
+
+### `keepNull`
+
+When updating an attribute to the `null` value, ArangoDB does not remove the
+attribute from the document but stores this `null` value. To remove attributes
+in an update operation, set them to `null` and set the `keepNull` option to
+`false`. This removes the attributes you specify but not any previously stored
+attributes with the `null` value:
+
+```aql
+UPSERT { _key: "mary" }
+INSERT { _key: "mary", name: "Mary", notNeeded: 123 }
+UPDATE { foobar: true, notNeeded: null }
+IN users OPTIONS { keepNull: false }
+```
+
+If no document with the key `mary` exists, the above query creates such a user
+document with a `notNeeded` attribute. If it exists already, it removes the
+`notNeeded` attribute from the document and updates the `foobar` attribute
+normally.
+
+Only top-level attributes and sub-attributes can be removed this way
+(e.g. `{ attr: { sub: null } }`) but not attributes of objects that are nested
+inside of arrays (e.g. `{ attr: [ { nested: null } ] }`).
+
+### `mergeObjects`
+
+The option `mergeObjects` controls whether object contents are
+merged if an object attribute is present in both the `UPDATE` query and in the
+to-be-updated document.
+
+{{< tip >}}
+The default value for `mergeObjects` is `true`, so there is no need to specify it
+explicitly.
+{{< /tip >}}
+
+### `waitForSync`
+
+To make sure data are durable when an update query returns, there is the `waitForSync`
+query option.
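+
+For example, the login counter query shown above can be made to wait until the
+write has been synchronized to disk before it returns:
+
+```aql
+UPSERT { name: 'superuser' }
+INSERT { name: 'superuser', logins: 1, dateCreated: DATE_NOW() }
+UPDATE { logins: OLD.logins + 1 }
+IN users OPTIONS { waitForSync: true } // return only after the write is synced
+```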
+ +### `ignoreRevs` + +In order to not accidentally update documents that have been written and updated since +you last fetched them you can use the option `ignoreRevs` to either let ArangoDB compare +the `_rev` value and only succeed if they still match, or let ArangoDB ignore them (default): + +```aql +FOR i IN 1..1000 + UPSERT { _key: CONCAT('test', i)} + INSERT {foobar: false} + UPDATE {_rev: "1287623", foobar: true } + IN users OPTIONS { ignoreRevs: false } +``` + +{{< info >}} +You need to add the `_rev` value in the *updateExpression*. It is not used +within the *searchExpression*. Even worse, if you use an outdated `_rev` in the +*searchExpression*, `UPSERT` triggers the `INSERT` path instead of the +`UPDATE` path, because it has not found a document exactly matching the +*searchExpression*. +{{< /info >}} + +### `exclusive` + +The RocksDB engine does not require collection-level locks. Different write +operations on the same collection do not block each other, as +long as there are no _write-write conflicts_ on the same documents. From an application +development perspective it can be desired to have exclusive write access on collections, +to simplify the development. Note that writes do not block reads in RocksDB. +Exclusive access can also speed up modification queries, because we avoid conflict checks. + +Use the `exclusive` option to achieve this effect on a per query basis: + +```aql +FOR i IN 1..1000 + UPSERT { _key: CONCAT('test', i) } + INSERT { foobar: false } + UPDATE { foobar: true } + IN users OPTIONS { exclusive: true } +``` + +### `indexHint` + +The `indexHint` option is used as a hint for the document lookup +performed as part of the `UPSERT` operation, and can help in cases such as +`UPSERT` not picking the best index automatically. + +```aql +UPSERT { a: 1234 } + INSERT { a: 1234, name: "AB" } + UPDATE { name: "ABC" } IN myCollection + OPTIONS { indexHint: "index_name" } +``` + +The index hint is passed through to an internal `FOR` loop that is used for the +lookup. Also see [`indexHint` Option of the `FOR` Operation](for.md#indexhint). + +Inverted indexes cannot be used for `UPSERT` lookups. + +### `forceIndexHint` + +Makes the index or indexes specified in `indexHint` mandatory if enabled. The +default is `false`. Also see +[`forceIndexHint` Option of the `FOR` Operation](for.md#forceindexhint). + +```aql +UPSERT { a: 1234 } + INSERT { a: 1234, name: "AB" } + UPDATE { name: "ABC" } IN myCollection + OPTIONS { indexHint: … , forceIndexHint: true } +``` + +## Returning documents + +`UPSERT` statements can optionally return data. To do so, they need to be followed +by a `RETURN` statement (intermediate `LET` statements are allowed, too). These statements +can optionally perform calculations and refer to the pseudo-values `OLD` and `NEW`. +In case the upsert performed an insert operation, `OLD` has a value of `null`. +In case the upsert performed an update or replace operation, `OLD` contains the +previous version of the document, before update/replace. + +`NEW` is always populated. It contains the inserted document in case the +upsert performed an insert, or the updated/replaced document in case it performed an +update/replace. + +This can also be used to check whether the upsert has performed an insert or an update +internally: + +```aql +UPSERT { name: 'superuser' } +INSERT { name: 'superuser', logins: 1, dateCreated: DATE_NOW() } +UPDATE { logins: OLD.logins + 1 } IN users +RETURN { doc: NEW, type: OLD ? 
'update' : 'insert' }
+```
+
+## Transactionality and Limitations
+
+- On a single server, upserts are generally executed transactionally in an
+  all-or-nothing fashion.
+
+  For sharded collections in cluster deployments, the entire query and/or upsert
+  operation may not be transactional, especially if it involves different shards,
+  DB-Servers, or both.
+
+- Queries may execute intermediate transaction commits in case the running
+  transaction (AQL query) hits the specified size thresholds. This writes the
+  data that has been modified so far and it is not rolled back in case of a later
+  abort/rollback of the transaction.
+
+  Such **intermediate commits** can occur for `UPSERT` operations over all
+  documents of a large collection, for instance. This has the side-effect that
+  atomicity of this operation cannot be guaranteed anymore and ArangoDB cannot
+  guarantee "read your own writes" semantics for upserts.
+
+  This is only an issue if you write a query where your search condition would
+  hit the same document multiple times, and only if you have large transactions.
+  You can adjust the behavior of the RocksDB storage engine by increasing the
+  `intermediateCommit` thresholds for data size and operation counts.
+
+- The lookup and the insert/update/replace parts are executed one after
+  another, so that other operations in other threads can happen in
+  between. This means that if multiple `UPSERT` queries run concurrently, they
+  may all determine that the target document does not exist and then
+  create it multiple times!
+
+  Note that due to this gap between the lookup and insert/update/replace,
+  even with a unique index, duplicate key errors or conflicts can occur.
+  If they occur, the application/client code can execute the same query
+  again.
+
+  To prevent this from happening, you should add a unique index to the lookup
+  attribute(s). Note that in a cluster, a unique index can only be created if
+  it is equal to the shard key attribute of the collection or at least contains
+  it as a part.
+
+  An alternative to making an `UPSERT` statement work atomically is to use the
+  `exclusive` option to limit write concurrency for this collection to 1, which
+  helps avoid conflicts but is bad for throughput!
+
+- `UPSERT` operations do not observe their own writes correctly in cluster
+  deployments. They only do so for OneShard databases with the `cluster-one-shard`
+  optimizer rule active.
+
+  If upserts in a query create new documents and would then semantically hit the
+  same documents again, the operation may incorrectly use the `INSERT` branch to
+  create more documents instead of the `UPDATE`/`REPLACE` branch to update the
+  previously created documents.
+
+  If upserts find existing documents for updating/replacing, you can access the
+  current document via the `OLD` pseudo-variable, but this may hold the initial
+  version of the document from before the query even if it has been modified
+  by `UPSERT` in the meantime.
+
+- The lookup attribute(s) from the search expression should be indexed in order
+  to improve the `UPSERT` performance. Ideally, the search expression contains the
+  shard key, as this allows the lookup to be restricted to a single shard.
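+
+For example, assuming the `users` collection is sharded by the `name` attribute,
+the lookup of the following upsert only needs to contact a single shard, because
+the search expression contains the shard key (a unique persistent index on
+`name` additionally speeds up the lookup):
+
+```aql
+UPSERT { name: 'superuser' } // search expression contains the shard key
+INSERT { name: 'superuser', logins: 1, dateCreated: DATE_NOW() }
+UPDATE { logins: OLD.logins + 1 } IN users
+```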
diff --git a/site/content/arangodb/oem/aql/high-level-operations/window.md b/site/content/arangodb/oem/aql/high-level-operations/window.md
new file mode 100644
index 0000000000..494345d4d7
--- /dev/null
+++ b/site/content/arangodb/oem/aql/high-level-operations/window.md
@@ -0,0 +1,282 @@
+---
+title: '`WINDOW` operation in AQL'
+menuTitle: WINDOW
+weight: 45
+description: >-
+  Aggregate adjacent documents or value ranges with a sliding window to
+  calculate running totals, rolling averages, and other statistical properties
+---
+The `WINDOW` operation can be used for aggregations over adjacent documents,
+that is, over preceding and/or following rows. It can also aggregate based
+on a value or duration range relative to a document attribute.
+
+The operation performs a `COLLECT AGGREGATE`-like operation on a set
+of query rows. However, whereas a `COLLECT` operation groups multiple query
+rows into a single result group, a `WINDOW` operation produces a result for
+each query row:
+
+- The row for which function evaluation occurs is called the current row.
+- The query rows related to the current row, over which function evaluation
+  occurs, comprise the window frame for the current row.
+
+Window frames are determined with respect to the current row:
+
+- By defining a window frame to be all rows from the query start to the current
+  row, you can compute running totals for each row.
+- By defining a frame as extending *N* rows on either side of the current row,
+  you can compute rolling averages.
+
+## Syntax
+
+There are two syntax variants for `WINDOW` operations.
+
+**Row-based** (adjacent documents):
+
WINDOW { preceding: numPrecedingRows, following: numFollowingRows } AGGREGATE variableName = aggregateExpression
+ +**Range-based** (value or duration range): + +
WINDOW rangeValue WITH { preceding: offsetPreceding, following: offsetFollowing } AGGREGATE variableName = aggregateExpression
+
+Calls to the following functions are supported in aggregation expressions:
+
+- `LENGTH()` / `COUNT()`
+- `MIN()`
+- `MAX()`
+- `SUM()`
+- `AVERAGE()` / `AVG()`
+- `STDDEV_POPULATION()` / `STDDEV()`
+- `STDDEV_SAMPLE()`
+- `VARIANCE_POPULATION()` / `VARIANCE()`
+- `VARIANCE_SAMPLE()`
+- `UNIQUE()`
+- `SORTED_UNIQUE()`
+- `COUNT_DISTINCT()` / `COUNT_UNIQUE()`
+- `BIT_AND()`
+- `BIT_OR()`
+- `BIT_XOR()`
+
+## Row-based Aggregation
+
+The first syntax form of `WINDOW` allows aggregating over a fixed number of
+rows, following or preceding the current row. It is also possible to define
+that **all** preceding or following rows should be aggregated (`"unbounded"`).
+The number of rows has to be determined at query compile time.
+
+The following query demonstrates the use of window frames to compute **running totals**
+as well as **rolling averages** computed from the current row and the rows that
+immediately precede and follow it:
+
+```aql
+---
+name: windowAggregationRow
+description: ''
+dataset: observationsSampleDataset
+---
+FOR t IN observations
+  SORT t.time
+  WINDOW { preceding: 1, following: 1 }
+  AGGREGATE rollingAverage = AVG(t.val), rollingSum = SUM(t.val)
+  WINDOW { preceding: "unbounded", following: 0 }
+  AGGREGATE cumulativeSum = SUM(t.val)
+  RETURN {
+    time: t.time,
+    subject: t.subject,
+    val: t.val,
+    rollingAverage, // average of the window's values
+    rollingSum,     // sum of the window's values
+    cumulativeSum   // running total
+  }
+```
+
+The row order is controlled by the `SORT` operation on the `time` attribute.
+
+The first `WINDOW` operation aggregates the previous, current, and next row
+(`preceding` and `following` are both set to 1) and calculates the average and sum of
+these three values. In case of the first row, there is no preceding row but a
+following row, hence the values `10` and `0` are added up to calculate the sum,
+which is divided by 2 to compute the average. For the second row, the values
+`10`, `0`, and `9` are summed up and divided by 3, and so on.
+
+The second `WINDOW` operation aggregates all previous values (unbounded) to
+calculate a running sum. For the first row, that is just `10`, for the second
+row it is `10` + `0`, for the third `10` + `0` + `9`, and so on.
+
+| time | subject | val | rollingAverage | rollingSum | cumulativeSum |
+|---------------------|---------|----:|---------------:|-----------:|--------------:|
+| 2021-05-25 07:00:00 | st113 | 10 | 5 | 10 | 10 |
+| 2021-05-25 07:00:00 | xh458 | 0 | 6.333… | 19 | 10 |
+| 2021-05-25 07:15:00 | st113 | 9 | 6.333… | 19 | 19 |
+| 2021-05-25 07:15:00 | xh458 | 10 | 14.666… | 44 | 29 |
+| 2021-05-25 07:30:00 | st113 | 25 | 13.333… | 40 | 54 |
+| 2021-05-25 07:30:00 | xh458 | 5 | 16.666… | 50 | 59 |
+| 2021-05-25 07:45:00 | st113 | 20 | 18.333… | 55 | 79 |
+| 2021-05-25 07:45:00 | xh458 | 30 | 25 | 75 | 109 |
+| 2021-05-25 08:00:00 | xh458 | 25 | 27.5 | 55 | 134 |
+
+The query below demonstrates the use of window frames to compute running totals
+within each `subject` group of `time`-ordered query rows, as well as rolling
+sums and averages computed from the current row and the rows that immediately
+precede and follow it, also per `subject` group and sorted by `time`:
+
+```aql
+---
+name: windowAggregationRowGrouped
+description: ''
+dataset: observationsSampleDataset
+---
+FOR t IN observations
+  COLLECT subject = t.subject INTO group = t
+  LET subquery = (FOR t2 IN group
+    SORT t2.time
+    WINDOW { preceding: 1, following: 1 }
+    AGGREGATE rollingAverage = AVG(t2.val), rollingSum = SUM(t2.val)
+    WINDOW { preceding: "unbounded", following: 0 }
+    AGGREGATE cumulativeSum = SUM(t2.val)
+    RETURN {
+      time: t2.time,
+      subject: t2.subject,
+      val: t2.val,
+      rollingAverage,
+      rollingSum,
+      cumulativeSum
+    }
+  )
+  // flatten subquery result
+  FOR t2 IN subquery
+    RETURN t2
+```
+
+If you look at the first row with the subject `xh458`, you can see that the
+cumulative sum is reset, and that the rolling average and sum do not take the
+previous row into account, as it belongs to the subject `st113`.
+
+| time | subject | val | rollingAverage | rollingSum | cumulativeSum |
+|---------------------|---------|----:|---------------:|-----------:|--------------:|
+| 2021-05-25 07:00:00 | st113 | 10 | 9.5 | 19 | 10 |
+| 2021-05-25 07:15:00 | st113 | 9 | 14.666… | 44 | 19 |
+| 2021-05-25 07:30:00 | st113 | 25 | 18 | 54 | 44 |
+| 2021-05-25 07:45:00 | st113 | 20 | 22.5 | 45 | 64 |
+| 2021-05-25 07:00:00 | xh458 | 0 | 5 | 10 | 0 |
+| 2021-05-25 07:15:00 | xh458 | 10 | 5 | 15 | 10 |
+| 2021-05-25 07:30:00 | xh458 | 5 | 15 | 45 | 15 |
+| 2021-05-25 07:45:00 | xh458 | 30 | 20 | 60 | 45 |
+| 2021-05-25 08:00:00 | xh458 | 25 | 27.5 | 55 | 70 |
+
+## Range-based Aggregation
+
+The second syntax form of `WINDOW` allows aggregating over all documents
+within a value range. Offsets are differences in attribute values from the
+current document.
+
+Attribute values have to be numeric. The offset calculations are performed by
+adding or subtracting the numeric offsets specified in the `following` and
+`preceding` attributes. The offset numbers have to be positive and have to be
+determined at query compile time. The default offset is `0`.
+
+The range-based window syntax requires the input rows to be sorted by the row
+value. To ensure correctness of the result, the AQL optimizer automatically
+inserts a `SORT` statement into the query in front of the `WINDOW`
+statement. The optimizer may be able to optimize away that `SORT` statement
+later if a sorted index is present on the group criteria.
+
+The following query demonstrates the use of window frames to compute totals as
+well as averages computed from the current document and the documents that have
+attribute values in `t.val` in the range of `[-10, +5]` (inclusive), preceding
+and following:
+
+```aql
+---
+name: windowAggregationRangeValue
+description: ''
+dataset: observationsSampleDataset
+---
+FOR t IN observations
+  WINDOW t.val WITH { preceding: 10, following: 5 }
+  AGGREGATE rollingAverage = AVG(t.val), rollingSum = SUM(t.val)
+  RETURN {
+    time: t.time,
+    subject: t.subject,
+    val: t.val,
+    rollingAverage,
+    rollingSum
+  }
+```
+
+The value range of the first row is `[-10, 5]` since `val` is `0`, thus the
+values from the first and second row are added up to `5`, with the average being
+`2.5`. The value range of the last row is `[20, 35]` as `val` is `30`, which
+means that the last four rows get aggregated to a sum of `100` and an average
+of `25` (the range is inclusive, i.e. `val` falls within the range with a value
+of `20`).
+
+| time | subject | val | rollingAverage | rollingSum |
+|---------------------|---------|----:|---------------:|-----------:|
+| 2021-05-25 07:00:00 | xh458 | 0 | 2.5 | 5 |
+| 2021-05-25 07:30:00 | xh458 | 5 | 6.8 | 34 |
+| 2021-05-25 07:15:00 | st113 | 9 | 6.8 | 34 |
+| 2021-05-25 07:00:00 | st113 | 10 | 6.8 | 34 |
+| 2021-05-25 07:15:00 | xh458 | 10 | 6.8 | 34 |
+| 2021-05-25 07:45:00 | st113 | 20 | 18 | 90 |
+| 2021-05-25 07:30:00 | st113 | 25 | 25 | 100 |
+| 2021-05-25 08:00:00 | xh458 | 25 | 25 | 100 |
+| 2021-05-25 07:45:00 | xh458 | 30 | 25 | 100 |
+
+## Duration-based Aggregation
+
+Aggregating by time intervals is a subtype of range-based aggregation that
+uses the second syntax form of `WINDOW`, but with ISO durations.
+
+To support `WINDOW` frames over time-series data, the `WINDOW` operation can
+calculate timestamp offsets using positive ISO 8601 duration strings, like
+`P1Y6M` (1 year and 6 months) or `PT12H30M` (12 hours and 30 minutes). Also see
+[Date functions](../functions/date.md#comparison-and-calculation).
+In contrast to the ISO 8601 standard, week components may be freely combined
+with other components. For example, `P1WT1H` and `P1M1W` are both valid.
+Fractional values are only supported for seconds, and only with up to three
+decimals after the separator, i.e. millisecond precision. For example,
+`PT0.123S` is a valid duration while `PT0.5H` and `PT0.1234S` are not.
+
+Durations can be specified separately in `following` and `preceding`.
+If such a duration is used, then the attribute value of the current document
+must be a number and is treated as a numeric **timestamp in milliseconds**.
+The range is inclusive. If either bound is not specified, it is treated as an
+empty duration (i.e., `P0D`).
+
+The following query demonstrates the use of window frames to compute rolling
+sums and averages over the observations of the last 30 minutes (inclusive), based
+on the document attribute `time` that is converted from a datetime string to a
+numeric timestamp:
+
+```aql
+---
+name: windowAggregationRangeDuration
+description: ''
+dataset: observationsSampleDataset
+---
+FOR t IN observations
+  WINDOW DATE_TIMESTAMP(t.time) WITH { preceding: "PT30M" }
+  AGGREGATE rollingAverage = AVG(t.val), rollingSum = SUM(t.val)
+  RETURN {
+    time: t.time,
+    subject: t.subject,
+    val: t.val,
+    rollingAverage,
+    rollingSum
+  }
+```
+
+With a time of `07:30:00`, everything from `07:00:00` to `07:30:00` on the same
+day falls within the duration range with `preceding: "PT30M"`, thus aggregating
+the top six rows to a sum of `59` and an average of `9.8333…`.
+
+| time | subject | val | rollingAverage | rollingSum |
+|---------------------|---------|----:|---------------:|-----------:|
+| 2021-05-25 07:00:00 | st113 | 10 | 5 | 10 |
+| 2021-05-25 07:00:00 | xh458 | 0 | 5 | 10 |
+| 2021-05-25 07:15:00 | st113 | 9 | 7.25 | 29 |
+| 2021-05-25 07:15:00 | xh458 | 10 | 7.25 | 29 |
+| 2021-05-25 07:30:00 | st113 | 25 | 9.8333… | 59 |
+| 2021-05-25 07:30:00 | xh458 | 5 | 9.8333… | 59 |
+| 2021-05-25 07:45:00 | st113 | 20 | 16.5 | 99 |
+| 2021-05-25 07:45:00 | xh458 | 30 | 16.5 | 99 |
+| 2021-05-25 08:00:00 | xh458 | 25 | 21 | 105 |
diff --git a/site/content/arangodb/oem/aql/high-level-operations/with.md b/site/content/arangodb/oem/aql/high-level-operations/with.md
new file mode 100644
index 0000000000..d14c1e67b0
--- /dev/null
+++ b/site/content/arangodb/oem/aql/high-level-operations/with.md
@@ -0,0 +1,71 @@
+---
+title: '`WITH` operation in AQL'
+menuTitle: WITH
+weight: 75
+description: >-
+  An AQL query can start with a `WITH` operation, listing collections that a
+  query implicitly reads from
+---
+Reading implicitly from collections means that the collections are not
+specified explicitly in language constructs like the following:
+
+- `FOR ... IN collection`
+- `INSERT ... INTO collection`
+- `UPDATE ... IN collection`
+- `GRAPH "graph-name"` (via the graph definition)
+
+Instead, the collections are only known at runtime of the query. Such dynamic
+collection access is invisible to the AQL query parser at query compile time.
+Dynamic access is possible via the `DOCUMENT()` function as well as with
+graph traversals (in particular the variant using collection sets), because
+edges may point to arbitrary vertex collections. Additionally, if you specify
+the start vertex of a traversal using a string, its collection needs to be
+declared as well.
+
+Collections that are explicitly used in a query are automatically detected by
+the AQL query parser. Any additional collections that are involved in the
+query but cannot be detected automatically by the query parser can be manually
+specified using a `WITH` statement. It is recommended to declare all collections
+that the `DOCUMENT()` function or graph traversals using collection sets might
+possibly access, to avoid occasional query failures.
+
+## Syntax
+
+```aql
+WITH collection1 [, collection2 [, ... collectionN ] ]
+```
+
+`WITH` is also a keyword that is used in other contexts, for example in `UPDATE`
+statements. To declare additional collections, you must place the `WITH` keyword
+at the very start of the query.
+
+## Usage
+
+The `WITH` operation is only required if you use a cluster deployment and only
+for AQL queries that dynamically read from vertex collections as part of
+graph traversals.
+
+You can enable the `--query.require-with` startup option to make single server
+instances require `WITH` declarations in the same way as cluster deployments to
+ease development; see
+[Requiring `WITH` statements](../../components/arangodb-server/options.md#--queryrequire-with).
+
+Dynamic access via the `DOCUMENT()` function does not require you to list the
+involved collections. Using named graphs in traversals (`GRAPH "graph-name"`)
+does not require it either, assuming that all vertices are in collections that
+are part of the graph, as enforced by the [Graph API](../../develop/http-api/graphs/named-graphs.md).
+That means it is only necessary for traversals using anonymous graphs /
+[collection sets](../graphs/traversals.md#working-with-collection-sets).
+
+The following example query specifies an edge collection `usersHaveManagers`
+to perform a graph traversal. It is the only explicitly specified collection in
+the query. It does not need to be declared using the `WITH` operation.
+
+However, the involved vertex collections need to be declared. In this example,
+the start vertex is specified as a string and it is stored in the `users`
+collection. Furthermore, the edges of the edge collection reference vertices of
+a collection called `managers`. Both collections are declared at the beginning
+of the query using the `WITH` operation:
+
+```aql
+WITH users, managers
+FOR v, e, p IN 1..2 OUTBOUND 'users/1' usersHaveManagers
+  RETURN { v, e, p }
+```
diff --git a/site/content/arangodb/oem/aql/how-to-invoke-aql/_index.md b/site/content/arangodb/oem/aql/how-to-invoke-aql/_index.md
new file mode 100644
index 0000000000..d8c7d28e57
--- /dev/null
+++ b/site/content/arangodb/oem/aql/how-to-invoke-aql/_index.md
@@ -0,0 +1,30 @@
+---
+title: How to execute AQL queries
+menuTitle: How to invoke AQL
+weight: 5
+description: ''
+---
+AQL queries can be invoked in the following ways:
+
+- Via the web interface
+- Using the `db` object of the JavaScript API, for example, in arangosh or in a Foxx service
+- Via the raw REST HTTP API
+
+All of these methods ultimately make calls to the server's HTTP API, but the web interface
+and the `db` object abstract away the low-level communication details and are
+thus easier to use.
+
+The ArangoDB web interface has a specific section for [**QUERIES**](with-the-web-interface.md).
+
+You can run [AQL queries from the ArangoDB Shell](with-arangosh.md)
+with the [`db._query()`](with-arangosh.md#with-db_query) and
+[`db._createStatement()`](with-arangosh.md#with-db_createstatement-arangostatement)
+methods of the [`db` object](../../develop/javascript-api/@arangodb/db-object.md). This chapter
+also describes how to use bind parameters, statistics, counting, and cursors with
+arangosh.
+
+If you use Foxx microservices, see [how to write database queries](../../develop/foxx-microservices/getting-started.md#writing-database-queries)
+for examples including tagged template strings.
+
+If you want to run AQL queries from your application via the HTTP REST API,
+see the full API description at [HTTP interface for AQL queries](../../develop/http-api/queries/aql-queries.md).
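+
+For a first impression, a minimal invocation in arangosh (covered in detail in
+the next section) might look like this sketch:
+
+```js
+db._query("RETURN 1 + 1").toArray(); // [ 2 ]
+```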
diff --git a/site/content/arangodb/oem/aql/how-to-invoke-aql/with-arangosh.md b/site/content/arangodb/oem/aql/how-to-invoke-aql/with-arangosh.md
new file mode 100644
index 0000000000..c430c0efce
--- /dev/null
+++ b/site/content/arangodb/oem/aql/how-to-invoke-aql/with-arangosh.md
@@ -0,0 +1,786 @@
+---
+title: Executing AQL queries from _arangosh_
+menuTitle: with arangosh
+weight: 5
+description: >-
+  How to run queries, set bind parameters, and obtain the results and
+  additional information using the JavaScript API
+# Undocumented on purpose:
+# db._query(, , , { forceOneShardAttributeValue: "..."} )
+---
+In the ArangoDB shell, you can use the `db._query()` and `db._createStatement()`
+methods to execute AQL queries. This chapter also describes
+how to use bind parameters, counting, statistics and cursors.
+
+## With `db._query()`
+
+`db._query(<queryString>) → cursor`
+
+You can execute queries with the `_query()` method of the `db` object.
+This runs the specified query in the context of the currently
+selected database and returns the query results in a cursor.
+You can print the results of the cursor using its `toArray()` method:
+
+```js
+---
+name: 01_workWithAQL_all
+description: ''
+---
+~addIgnoreCollection("mycollection")
+var coll = db._create("mycollection")
+var doc = db.mycollection.save({ _key: "testKey", Hello : "World" })
+db._query('FOR my IN mycollection RETURN my._key').toArray()
+```
+
+### `db._query()` bind parameters
+
+`db._query(<queryString>, <bindVars>) → cursor`
+
+To pass bind parameters into a query, you can specify a second argument when
+calling the `_query()` method:
+
+```js
+---
+name: 02_workWithAQL_bindValues
+description: ''
+---
+db._query('FOR c IN @@collection FILTER c._key == @key RETURN c._key', {
+  '@collection': 'mycollection',
+  'key': 'testKey'
+}).toArray();
+```
+
+### ES6 template strings
+
+`` aql`<template string>` ``
+
+It is also possible to use ES6 template strings for generating AQL queries. There is
+a template string generator function named `aql`.
+
+The following example demonstrates what the template string function generates:
+
+```js
+---
+name: 02_workWithAQL_aqlTemplateString
+description: ''
+---
+var key = 'testKey';
+aql`FOR c IN mycollection FILTER c._key == ${key} RETURN c._key`
+```
+
+The next example directly uses the generated result to execute a query:
+
+```js
+---
+name: 02_workWithAQL_aqlQuery
+description: ''
+---
+var key = 'testKey';
+db._query(
+  aql`FOR c IN mycollection FILTER c._key == ${key} RETURN c._key`
+).toArray();
+```
+
+Arbitrary JavaScript expressions can be used in queries that are generated with the
+`aql` template string generator. Collection objects are handled automatically:
+
+```js
+---
+name: 02_workWithAQL_aqlCollectionQuery
+description: ''
+---
+var key = 'testKey';
+db._query(aql`FOR doc IN ${ db.mycollection } RETURN doc`).toArray();
+```
+
+Note: data-modification AQL queries normally do not return a result unless the
+AQL query contains a `RETURN` operation at the top-level. Without a `RETURN`
+operation, the `toArray()` method returns an empty array.
+
+### Statistics and extra Information
+
+`cursor.getExtra() → queryInfo`
+
+It is always possible to retrieve statistics for a query with the `getExtra()` method:
+
+```js
+---
+name: 03_workWithAQL_getExtra
+description: ''
+---
+db._query(`
+  FOR i IN 1..100
+    INSERT { _key: CONCAT('test', TO_STRING(i)) } INTO mycollection
+`).getExtra();
+```
+
+The meaning of the statistics values is described in
+[Query statistics](../execution-and-performance/query-statistics.md).
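+
+As a quick sketch of how to read individual values from the returned object
+(`executionTime` is one of the documented statistics attributes):
+
+```js
+var extra = db._query("FOR i IN 1..100 RETURN i").getExtra();
+
+// Individual statistics are plain attributes of the returned object:
+extra.stats.executionTime; // query runtime in seconds
+```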
+
+Query warnings are also reported here. If you design queries on the shell,
+be sure to check for warnings.
+
+### Main query options
+
+`db._query(<queryString>, <bindVars>, <mainOptions>, <subOptions>) → cursor`
+
+You can pass the main options as the third argument to `db._query()` if you
+also pass a fourth argument with the sub options (can be an empty object `{}`).
+
+#### `count`
+
+Whether the number of documents in the result set should be calculated on the
+server side and returned in the `count` attribute of the result. Calculating the
+`count` attribute might have a performance impact for some queries, so this
+option is turned off by default, and the count is only returned when requested.
+
+If enabled, you can get the count by calling the `count()` method of the cursor.
+You can also count the number of results on the client side, for example, using
+`cursor.toArray().length`.
+
+```js
+---
+name: 02_workWithAQL_count
+description: ''
+---
+var cursor = db._query(
+  'FOR i IN 1..42 RETURN i',
+  {},
+  { count: true },
+  {}
+);
+cursor.count();
+cursor.toArray().length;
+```
+
+#### `batchSize`
+
+The maximum number of result documents to be transferred from the server to the
+client in one roundtrip. If this attribute is not set, a server-controlled
+default value is used. A `batchSize` value of `0` is disallowed.
+
+```js
+---
+name: 02_workWithAQL_batchSize
+description: ''
+---
+db._query(
+  'FOR i IN 1..3 RETURN i',
+  {},
+  { batchSize: 2 },
+  {}
+).toArray(); // full result retrieved in two batches
+```
+
+#### `ttl`
+
+The time-to-live for the cursor (in seconds). If the result set is small enough
+(less than or equal to `batchSize`), then results are returned right away.
+Otherwise, they are stored in memory and are accessible via the cursor with
+respect to the `ttl`. The cursor is removed on the server automatically after
+the specified amount of time. This is useful to ensure garbage collection of
+cursors that are not fully fetched by clients. If not set, a server-defined
+value is used (default: 30 seconds).
+
+```js
+---
+name: 02_workWithAQL_ttl
+description: ''
+---
+db._query(
+  'FOR i IN 1..20 RETURN i',
+  {},
+  { ttl: 5, batchSize: 10 },
+  {}
+).toArray(); // Each batch needs to be fetched within 5 seconds
+```
+
+#### `memoryLimit`
+
+To set a memory limit for the query, pass `options` to the `_query()` method.
+The memory limit specifies the maximum number of bytes that the query is
+allowed to use. When a single AQL query reaches the specified limit value,
+the query will be aborted with a *resource limit exceeded* exception. In a
+cluster, the memory accounting is done per shard, so the limit value is
+effectively a memory limit per query per shard.
+
+```js
+---
+name: 02_workWithAQL_memoryLimit
+description: ''
+---
+db._query(
+  'FOR i IN 1..100000 SORT i RETURN i',
+  {},
+  { memoryLimit: 100000 }
+).toArray(); // xpError(ERROR_RESOURCE_LIMIT)
+```
+
+If no memory limit is specified, then the server default value (controlled by
+the `--query.memory-limit` startup option) is used for restricting the maximum amount
+of memory the query can use. A memory limit value of `0` means that the maximum
+amount of memory for the query is not restricted.
+
+### Query sub options
+
+`db._query(<queryString>, <bindVars>, <subOptions>) → cursor`
+
+`db._query(<queryString>, <bindVars>, <mainOptions>, <subOptions>) → cursor`
+
+You can pass the sub options as the third argument to `db._query()` if you don't
+provide main options, or as the fourth argument if you do.
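+
+To illustrate the two call forms, the following sketch combines options from
+this page (`count` is a main option, `failOnWarning` a sub option):
+
+```js
+// Sub options as the third argument (no main options):
+db._query("FOR i IN 1..3 RETURN i", {}, { failOnWarning: true });
+
+// Main options as the third argument, sub options as the fourth:
+db._query("FOR i IN 1..3 RETURN i", {}, { count: true }, { failOnWarning: true });
+```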
+
+#### `fullCount`
+
+If you set `fullCount` to `true` and if the query contains a `LIMIT` operation, then the
+result has an extra attribute with the sub-attributes `stats` and `fullCount`, like
+`{ ... , "extra": { "stats": { "fullCount": 123 } } }`. The `fullCount` attribute
+contains the number of documents in the result before the last top-level `LIMIT` in the
+query was applied. It can be used to count the number of documents that match certain
+filter criteria, but only return a subset of them, in one go. It is thus similar to
+MySQL's `SQL_CALC_FOUND_ROWS` hint. Note that setting the option disables a few
+`LIMIT` optimizations and may lead to more documents being processed, and thus make
+queries run longer. Note that the `fullCount` attribute may only be present in the
+result if the query has a top-level `LIMIT` operation and the `LIMIT` operation
+is actually used in the query.
+
+#### `failOnWarning`
+
+If you set `failOnWarning` to `true`, this makes the query throw an exception and
+abort in case a warning occurs. You should use this option in development to catch
+errors early. If set to `false`, warnings don't propagate to exceptions and are
+returned with the query results. There is also a `--query.fail-on-warning`
+startup option for setting the default value for `failOnWarning`, so that you
+don't need to set it on a per-query level.
+
+#### `cache`
+
+Whether the [AQL query results cache](../execution-and-performance/caching-query-results.md)
+shall be used for adding as well as for retrieving results.
+
+If the query cache mode is set to `demand` and you set the `cache` query option
+to `true` for a query, then its query result is cached if it's eligible for
+caching. If the query cache mode is set to `on`, query results are automatically
+cached if they are eligible for caching unless you set the `cache` option to `false`.
+
+If you set the `cache` option to `false`, then any query cache lookup is skipped
+for the query. If you set it to `true`, the query cache is checked for a cached
+result **if** the query cache mode is either set to `on` or `demand`.
+
+```js
+---
+name: 02_workWithAQL_cache
+description: ''
+---
+var resultCache = require("@arangodb/aql/cache");
+resultCache.properties({ mode: "demand" });
+~resultCache.clear();
+db._query("FOR i IN 1..5 RETURN i", {}, { cache: true }); // Adds result to cache
+db._query("FOR i IN 1..5 RETURN i", {}, { cache: true }); // Retrieves result from cache
+db._query("FOR i IN 1..5 RETURN i", {}, { cache: false }); // Bypasses the cache
+```
+
+#### `fillBlockCache`
+
+If you set `fillBlockCache` to `true` or do not specify it, this makes the query store
+the data it reads via the RocksDB storage engine in the RocksDB block cache. This is
+usually the desired behavior. You can set the option to `false` for queries that are
+known to either read a lot of data that would thrash the block cache, or for queries
+that read data known to be outside of the hot set. By setting the option
+to `false`, data read by the query does not make it into the RocksDB block cache if
+it is not already in there, thus leaving more room for the actual hot set.
+
+#### `profile`
+
+If you set `profile` to `true` or `1`, extra timing information is returned for the query.
+The timing information is accessible via the `getExtra()` method of the query
+result. If set to `2`, the query includes execution statistics per query plan
+execution node in the `stats.nodes` sub-attribute of the `extra` return attribute.
+Additionally, the query plan is returned in the `extra.plan` sub-attribute.
+
+#### `maxWarningCount`
+
+The `maxWarningCount` option limits the number of warnings that are returned by the query if
+`failOnWarning` is not set to `true`. The default value is `10`.
+
+#### `maxNumberOfPlans`
+
+The `maxNumberOfPlans` option limits the number of query execution plans the optimizer
+creates at most. Reducing the number of query execution plans may speed up query plan
+creation and optimization for complex queries, but normally there is no need to adjust
+this value.
+
+#### `optimizer`
+
+Options related to the query optimizer.
+
+- `rules`: A list of to-be-included or to-be-excluded optimizer rules can be put into
+  this attribute, telling the optimizer to include or exclude specific rules. To disable
+  a rule, prefix its name with a `-`, to enable a rule, prefix it with a `+`. There is also
+  a pseudo-rule `all`, which matches all optimizer rules. `-all` disables all rules.
+
+#### `allowRetry`
+
+Set this option to `true` to make it possible to retry fetching the latest batch
+from a cursor.
+
+{{< info >}}
+This feature cannot be used on the server-side, like in [Foxx](../../develop/foxx-microservices/_index.md), as
+there is no client connection and no batching.
+{{< /info >}}
+
+If retrieving a result batch fails because of a connection issue, you can ask
+for that batch again using the `POST /_api/cursor/<cursor-id>/<batch-id>`
+endpoint. The first batch has an ID of `1` and the value is incremented by 1
+with every batch. Every result response except the last one also includes a
+`nextBatchId` attribute, indicating the ID of the batch after the current one.
+You can remember and use this batch ID should retrieving the next batch fail.
+
+You can only request the latest batch again (or the next batch).
+Earlier batches are not kept on the server-side.
+Requesting a batch again does not advance the cursor.
+
+You can also call this endpoint with the next batch identifier, i.e. the value
+returned in the `nextBatchId` attribute of a previous request. This advances the
+cursor and returns the results of the next batch. This is only supported if there
+are more results in the cursor (i.e. `hasMore` is `true` in the latest batch).
+
+From v3.11.1 onward, you may use the `POST /_api/cursor/<cursor-id>/<batch-id>`
+endpoint even if the `allowRetry` attribute is `false` to fetch the next batch,
+but you cannot request a batch again unless you set it to `true`.
+
+To allow refetching of the last batch of the query, the server cannot
+automatically delete the cursor. After the first attempt of fetching the last
+batch, the server would normally delete the cursor to free up resources. As you
+might need to reattempt the fetch, it needs to keep the final batch when the
+`allowRetry` option is enabled. Once you successfully received the last batch,
+you should call the `DELETE /_api/cursor/<cursor-id>` endpoint so that the
+server doesn't unnecessarily keep the batch until the cursor times out
+(`ttl` query option).
+
+#### `stream`
+
+Set `stream` to `true` to execute the query in a **streaming** fashion.
+The query result is not stored on the server, but calculated on the fly.
+
+{{< warning >}}
+Long-running queries need to hold the collection locks for as long as the query
+cursor exists. It is advisable to **only** use this option on short-running
+queries **or** without exclusive locks.
+{{< /warning >}}
+
+If set to `false`, the query is executed right away in its entirety.
+In that case, the query results are either returned right away (if the result
+set is small enough), or stored on the arangod instance and can be accessed
+via the cursor API.
+
+The default value is `false`.
+
+{{< info >}}
+The query options `cache`, `count` and `fullCount` don't work on streaming
+queries. Additionally, query statistics, profiling data, and warnings are only
+available after the query has finished and are delivered as part of the last batch.
+{{< /info >}}
+
+#### `maxRuntime`
+
+The query has to be executed within the given runtime or it is killed.
+The value is specified in seconds. The default value is `0.0` (no timeout).
+
+#### `maxDNFConditionMembers`
+
+Introduced in: v3.11.0
+
+A threshold for the maximum number of `OR` sub-nodes in the internal
+representation of an AQL `FILTER` condition.
+
+You can use this option to limit the computation time and memory usage when
+converting complex AQL `FILTER` conditions into the internal DNF
+(disjunctive normal form) format. `FILTER` conditions with a lot of logical
+branches (`AND`, `OR`, `NOT`) can take a large amount of processing time and
+memory. This query option limits the computation time and memory usage for
+such conditions.
+
+Once the threshold value is reached during the DNF conversion of a `FILTER`
+condition, the conversion is aborted, and the query continues with a simplified
+internal representation of the condition, which **cannot be used for index lookups**.
+
+You can also set the threshold globally instead of per query with the
+[`--query.max-dnf-condition-members` startup option](../../components/arangodb-server/options.md#--querymax-dnf-condition-members).
+
+#### `maxNodesPerCallstack`
+
+The number of execution nodes in the query plan after which stack splitting
+is performed to avoid a potential stack overflow.
+Defaults to the configured value of the startup option
+`--query.max-nodes-per-callstack`.
+
+This option is only useful for testing and debugging and normally does not need
+any adjustment.
+
+#### `maxTransactionSize`
+
+The transaction size limit in bytes.
+
+#### `intermediateCommitSize`
+
+The maximum total size of operations after which an intermediate
+commit is performed automatically.
+
+#### `intermediateCommitCount`
+
+The maximum number of operations after which an intermediate
+commit is performed automatically.
+
+#### `spillOverThresholdMemoryUsage`
+
+Introduced in: v3.10.0
+
+This option allows queries to store intermediate and final results temporarily
+on disk if the amount of memory used (in bytes) exceeds the specified value.
+This is used for decreasing the memory usage during the query execution.
+
+This option only has an effect on queries that use the `SORT` operation but
+without a `LIMIT`, and if you enable the spillover feature by setting a path
+for the directory to store the temporary data in with the
+[`--temp.intermediate-results-path` startup option](../../components/arangodb-server/options.md#--tempintermediate-results-path).
+
+Default value: 128MB.
+
+{{< info >}}
+Spilling data from RAM onto disk is an experimental feature and is turned off
+by default. The query results are still built up entirely in RAM on Coordinators
+and single servers for non-streaming queries. To avoid the buildup of
+the entire query result in RAM, use a streaming query (see the
+[`stream`](#stream) option).
+{{< /info >}}
+
+#### `spillOverThresholdNumRows`
+
+Introduced in: v3.10.0
+
+This option allows queries to store intermediate and final results temporarily
+on disk if the number of rows produced by the query exceeds the specified value.
+This is used for decreasing the memory usage during the query execution. In a
+query that iterates over a collection that contains documents, each row is a
+document, and in a query that iterates over temporary values
+(i.e. `FOR i IN 1..100`), each row is one of such temporary values.
+
+This option only has an effect on queries that use the `SORT` operation but
+without a `LIMIT`, and if you enable the spillover feature by setting a path
+for the directory to store the temporary data in with the
+[`--temp.intermediate-results-path` startup option](../../components/arangodb-server/options.md#--tempintermediate-results-path).
+
+Default value: `5000000` rows.
+
+{{< info >}}
+Spilling data from RAM onto disk is an experimental feature and is turned off
+by default. The query results are still built up entirely in RAM on Coordinators
+and single servers for non-streaming queries. To avoid the buildup of
+the entire query result in RAM, use a streaming query (see the
+[`stream`](#stream) option).
+{{< /info >}}
+
+#### `allowDirtyReads`
+
+{{< tag "ArangoDB Enterprise Edition" "AMP" >}}
+
+Introduced in: v3.10.0
+
+If you set this option to `true` and execute the query against a cluster
+deployment, then the Coordinator is allowed to read from any shard replica and
+not only from the leader. See [Read from followers](../../develop/http-api/documents.md#read-from-followers)
+for details.
+
+#### `skipInaccessibleCollections`
+
+{{< tag "ArangoDB Enterprise Edition" "AMP" >}}
+
+Let AQL queries (especially graph traversals) treat collections to which a
+user has **no access** rights as if these collections were empty.
+Instead of returning a *forbidden access* error, your queries execute normally.
+This is intended to help with certain use-cases: A graph contains several collections
+and different users execute AQL queries on that graph. You can naturally limit the
+accessible results by changing the access rights of users on collections.
+
+#### `satelliteSyncWait`
+
+{{< tag "ArangoDB Enterprise Edition" "AMP" >}}
+
+Configure how much time a DB-Server has to bring the SatelliteCollections
+involved in the query into sync. The default value is `60.0` seconds.
+When the maximum time is reached, the query is stopped.
+
+## With `db._createStatement()` (ArangoStatement)
+
+The `_query()` method is a shorthand for creating an `ArangoStatement` object,
+executing it and iterating over the resulting cursor. If more control over the
+result set iteration is needed, it is recommended to first create an
+`ArangoStatement` object as follows:
+
+```js
+---
+name: 04_workWithAQL_statements1
+description: ''
+---
+stmt = db._createStatement( { "query": "FOR i IN [ 1, 2 ] RETURN i * 2" } );
+```
+
+To execute the query, use the `execute()` method of the _statement_ object:
+
+```js
+---
+name: 05_workWithAQL_statements2
+description: ''
+---
+~var stmt = db._createStatement( { "query": "FOR i IN [ 1, 2 ] RETURN i * 2" } );
+cursor = stmt.execute();
+```
+
+You can pass a number to the `execute()` method to specify a batch size value.
+The server returns at most this many results in one roundtrip.
+The batch size cannot be adjusted after the query is first executed.
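+
+For example, the following sketch requests the results in batches of at most
+five documents per roundtrip:
+
+```js
+var stmt = db._createStatement( { "query": "FOR i IN 1..10 RETURN i" } );
+var cursor = stmt.execute(5); // at most 5 results per roundtrip
+```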
+
+**Note**: There is no need to explicitly call the execute method if another
+means of fetching the query results is chosen. The following two approaches
+lead to the same result:
+
+```js
+---
+name: executeQueryNoBatchSize
+description: ''
+---
+~db._create("users");
+~db.users.save({ name: "Gerhard" });
+~db.users.save({ name: "Helmut" });
+~db.users.save({ name: "Angela" });
+var result = db.users.all().toArray();
+print(result);
+
+var q = db._query("FOR x IN users RETURN x");
+result = [ ];
+while (q.hasNext()) {
+  result.push(q.next());
+}
+print(result);
+~db._drop("users")
+```
+
+The following two alternatives both use a batch size and return the same
+result:
+
+```js
+---
+name: executeQueryBatchSize
+description: ''
+---
+~db._create("users");
+~db.users.save({ name: "Gerhard" });
+~db.users.save({ name: "Helmut" });
+~db.users.save({ name: "Angela" });
+var result = [ ];
+var q = db.users.all();
+q.execute(1);
+while(q.hasNext()) {
+  result.push(q.next());
+}
+print(result);
+
+result = [ ];
+q = db._query("FOR x IN users RETURN x", {}, { batchSize: 1 });
+while (q.hasNext()) {
+  result.push(q.next());
+}
+print(result);
+~db._drop("users")
+```
+
+### Cursors
+
+Once the query is executed, the query results are available in a cursor.
+The cursor can return all its results at once using the `toArray()` method.
+This is a shortcut that you can use if you want to access the full result
+set without iterating over it yourself.
+
+```js
+---
+name: 05_workWithAQL_statements3
+description: ''
+---
+~var stmt = db._createStatement( { "query": "FOR i IN [ 1, 2 ] RETURN i * 2" } );
+~var cursor = stmt.execute();
+cursor.toArray();
+```
+
+Cursors can also be used to iterate over the result set document-by-document.
+To do so, use the `hasNext()` and `next()` methods of the cursor:
+
+```js
+---
+name: 05_workWithAQL_statements4
+description: ''
+---
+~var stmt = db._createStatement( { "query": "FOR i IN [ 1, 2 ] RETURN i * 2" } );
+~var c = stmt.execute();
+while (c.hasNext()) {
+  require("@arangodb").print(c.next());
+}
+```
+
+Please note that you can iterate over the results of a cursor only once, and that
+the cursor will be empty when you have fully iterated over it. To iterate over
+the results again, the query needs to be re-executed.
+
+Additionally, the iteration can be done in a forward-only fashion. There is no
+backwards iteration or random access to elements in a cursor.
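+
+As a sketch, re-executing the statement is the way to obtain a fresh cursor
+once the previous one is exhausted:
+
+```js
+var stmt = db._createStatement( { "query": "FOR i IN [ 1, 2 ] RETURN i * 2" } );
+var cursor = stmt.execute();
+cursor.toArray(); // [ 2, 4 ], consumes the cursor
+cursor.hasNext(); // false, the cursor is now empty
+cursor = stmt.execute(); // re-execute the query for a fresh cursor
+```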
+
+### ArangoStatement parameters binding
+
+To execute an AQL query using bind parameters, you need to create a statement first
+and then bind the parameters to it before execution:
+
+```js
+---
+name: 05_workWithAQL_statements5
+description: ''
+---
+var stmt = db._createStatement( { "query": "FOR i IN [ @one, @two ] RETURN i * 2" } );
+stmt.bind("one", 1);
+stmt.bind("two", 2);
+cursor = stmt.execute();
+```
+
+The cursor results can then be dumped or iterated over as usual, e.g.:
+
+```js
+---
+name: 05_workWithAQL_statements6
+description: ''
+---
+~var stmt = db._createStatement( { "query": "FOR i IN [ @one, @two ] RETURN i * 2" } );
+~stmt.bind("one", 1);
+~stmt.bind("two", 2);
+~var cursor = stmt.execute();
+cursor.toArray();
+```
+
+or
+
+```js
+---
+name: 05_workWithAQL_statements7
+description: ''
+---
+~var stmt = db._createStatement( { "query": "FOR i IN [ @one, @two ] RETURN i * 2" } );
+~stmt.bind("one", 1);
+~stmt.bind("two", 2);
+~var cursor = stmt.execute();
+while (cursor.hasNext()) {
+  require("@arangodb").print(cursor.next());
+}
+```
+
+Please note that bind parameters can also be passed into the `_createStatement()`
+method directly, making it a bit more convenient:
+
+```js
+---
+name: 05_workWithAQL_statements8
+description: ''
+---
+stmt = db._createStatement({
+  "query": "FOR i IN [ @one, @two ] RETURN i * 2",
+  "bindVars": {
+    "one": 1,
+    "two": 2
+  }
+});
+```
+
+### Counting with a cursor
+
+Cursors also optionally provide the total number of results. By default, they do not.
+To make the server return the total number of results, you may set the `count` attribute to
+`true` when creating a statement:
+
+```js
+---
+name: 05_workWithAQL_statements9
+description: ''
+---
+stmt = db._createStatement( {
+  "query": "FOR i IN [ 1, 2, 3, 4 ] RETURN i",
+  "count": true } );
+```
+
+After executing this query, you can use the `count` method of the cursor to get the
+number of total results from the result set:
+
+```js
+---
+name: 05_workWithAQL_statements10
+description: ''
+---
+~var stmt = db._createStatement( { "query": "FOR i IN [ 1, 2, 3, 4 ] RETURN i", "count": true } );
+var cursor = stmt.execute();
+cursor.count();
+```
+
+Please note that the `count` method returns nothing if you did not specify the `count`
+attribute when creating the query.
+
+This is intentional so that the server may apply optimizations when executing the query and
+construct the result set incrementally. Incremental creation of the result sets is not
+possible if all of the results need to be shipped to the client anyway. Therefore, the client
+has the choice to specify `count` and retrieve the total number of results for a query (and
+disable potential incremental result set creation on the server), or to not retrieve the total
+number of results and allow the server to apply optimizations.
+
+Please note that at the moment the server will always create the full result set for each query, so
+specifying or omitting the `count` attribute currently does not have any impact on query execution.
+This may change in the future. Future versions of ArangoDB may create result sets incrementally
+on the server-side and may be able to apply optimizations if a result set is not fully fetched by
+a client.
+
+### Using cursors to obtain additional information on internal timings
+
+Cursors can also optionally provide statistics of the internal execution phases. By default, they do not.
+To get to know how long parsing, optimization, instantiation and execution took,
+make the server return this information by setting the `profile` attribute to
+`true` when creating a statement:
+
+```js
+---
+name: 06_workWithAQL_statements11
+description: ''
+---
+stmt = db._createStatement({
+  query: "FOR i IN [ 1, 2, 3, 4 ] RETURN i",
+  options: {"profile": true}});
+```
+
+After executing this query, you can use the `getExtra()` method of the cursor to get the
+produced statistics:
+
+```js
+---
+name: 06_workWithAQL_statements12
+description: ''
+---
+~var stmt = db._createStatement( { "query": "FOR i IN [ 1, 2, 3, 4 ] RETURN i", options: {"profile": true}} );
+var cursor = stmt.execute();
+cursor.getExtra();
+```
+
+## Query validation with `db._parse()`
+
+The `_parse()` method of the `db` object can be used to parse and validate a
+query syntactically, without actually executing it.
+
+```js
+---
+name: 06_workWithAQL_statements13
+description: ''
+---
+db._parse( "FOR i IN [ 1, 2 ] RETURN i" );
+```
diff --git a/site/content/arangodb/oem/aql/how-to-invoke-aql/with-the-web-interface.md b/site/content/arangodb/oem/aql/how-to-invoke-aql/with-the-web-interface.md
new file mode 100644
index 0000000000..abb1e651e2
--- /dev/null
+++ b/site/content/arangodb/oem/aql/how-to-invoke-aql/with-the-web-interface.md
@@ -0,0 +1,50 @@
+---
+title: Executing AQL queries in the ArangoDB web interface
+menuTitle: with the Web Interface
+weight: 10
+description: >-
+  You can run ad-hoc AQL queries using the query editor in the web interface
+---
+In the **QUERIES** section of the web interface, type in a query in the main box
+and execute it by clicking the **Execute** button. The query result is displayed
+below the editor.
+
+The editor provides a few example queries that you can use as templates.
+It also provides a feature to explain a query and inspect its execution plan
+by clicking the **Explain** button.
+
+Bind parameters can be defined in the right-hand side pane. The format is the
+same as used for bind parameters in the HTTP REST API and in (JavaScript)
+application code.
+
+Here is an example:
+
+```aql
+FOR doc IN @@collection
+  FILTER CONTAINS(LOWER(doc.author), @search, false)
+  RETURN { "name": doc.name, "descr": doc.description, "author": doc.author }
+```
+
+Bind parameters (table view mode):
+
+| Key         | Value  |
+|-------------|--------|
+| @collection | _apps  |
+| search      | arango |
+
+Bind parameters (JSON view mode):
+
+```json
+{
+  "@collection": "_apps",
+  "search": "arango"
+}
+```
+
+An explanation of how bind parameters work can be found in
+[AQL Fundamentals](../fundamentals/bind-parameters.md).
+
+Queries can also be saved in the AQL editor along with their bind parameter values
+for later reuse. This data is stored in the user profile in the current database
+(in the `_users` system collection).
+
+Also see the detailed description of the [Web Interface](../../components/web-interface/_index.md).
diff --git a/site/content/arangodb/oem/aql/operators.md b/site/content/arangodb/oem/aql/operators.md
new file mode 100644
index 0000000000..f3bef216f7
--- /dev/null
+++ b/site/content/arangodb/oem/aql/operators.md
@@ -0,0 +1,816 @@
+---
+title: Operators
+menuTitle: Operators
+weight: 15
+description: >-
+  AQL supports a number of operators that can be used in expressions,
+  such as for arithmetic, comparing values, and logically combining conditions
+---
+## Comparison operators
+
+Comparison (or relational) operators compare two operands.
They can be used with +any input data types, and return a boolean result value. + +The following comparison operators are supported: + +| Operator | Description +|:-----------|:----------- +| `==` | equality +| `!=` | inequality +| `<` | less than +| `<=` | less or equal +| `>` | greater than +| `>=` | greater or equal +| `IN` | test if a value is contained in an array +| `NOT IN` | test if a value is not contained in an array +| `LIKE` | tests if a string value matches a pattern +| `NOT LIKE` | tests if a string value does not match a pattern +| `=~` | tests if a string value matches a regular expression +| `!~` | tests if a string value does not match a regular expression + +Each of the comparison operators returns a boolean value if the comparison can +be evaluated and returns *true* if the comparison evaluates to true, and *false* +otherwise. + +The comparison operators accept any data types for the first and second +operands. However, `IN` and `NOT IN` only return a meaningful result if +their right-hand operand is an array. `LIKE` and `NOT LIKE` only execute +if both operands are string values. All four operators do not perform +implicit type casts if the compared operands have different types, i.e. +they test for strict equality or inequality (`0` is different to `"0"`, +`[0]`, `false` and `null` for example). + +```aql + 0 == null // false + 1 > 0 // true + true != null // true + 45 <= "yikes!" // true + 65 != "65" // true + 65 == 65 // true + 1.23 > 1.32 // false + 1.5 IN [ 2, 3, 1.5 ] // true + "foo" IN null // false +42 NOT IN [ 17, 40, 50 ] // true + "abc" == "abc" // true + "abc" == "ABC" // false + "foo" LIKE "f%" // true + "foo" NOT LIKE "f%" // false + "foo" =~ "^f[o].$" // true + "foo" !~ "[a-z]+bar$" // true +``` + +The `LIKE` operator checks whether its left operand matches the pattern specified +in its right operand. The pattern can consist of regular characters and wildcards. +The supported wildcards are `_` to match a single arbitrary character, and `%` to +match any number of arbitrary characters. Literal `%` and `_` need to be escaped +with a backslash. Backslashes need to be escaped themselves, which effectively +means that two reverse solidus characters need to precede a literal percent sign +or underscore. In arangosh, additional escaping is required, making it four +backslashes in total preceding the to-be-escaped character. + +```aql + "abc" LIKE "a%" // true + "abc" LIKE "_bc" // true +"a_b_foo" LIKE "a\\_b\\_foo" // true +``` + +The pattern matching performed by the `LIKE` operator is case-sensitive. + +The `NOT LIKE` operator has the same characteristics as the `LIKE` operator +but with the result negated. It is thus identical to `NOT (… LIKE …)`. Note +the parentheses, which are necessary for certain expressions: + +```aql +FOR doc IN coll + RETURN NOT doc.attr LIKE "…" +``` + +The return expression gets transformed into `LIKE(!doc.attr, "…")`, leading +to unexpected results. `NOT(doc.attr LIKE "…")` gets transformed into the +more reasonable `! LIKE(doc.attr, "…")`. + +The regular expression operators `=~` and `!~` expect their left-hand operands to +be strings, and their right-hand operands to be strings containing valid regular +expressions as specified in the documentation for the AQL function +[`REGEX_TEST()`](functions/string.md#regex_test). + +## Array comparison operators + +Most comparison operators also exist as an *array variant*. 
In the array variant,
+a `==`, `!=`, `>`, `>=`, `<`, `<=`, `IN`, or `NOT IN` operator is prefixed with
+an `ALL`, `ANY`, or `NONE` keyword. This changes the operator's behavior to
+compare the individual array elements of the left-hand argument to the right-hand
+argument. Depending on the quantifying keyword, all, any, or none of these
+comparisons need to be satisfied to evaluate to `true` overall.
+
+You can also combine one of the supported comparison operators with the special
+`AT LEAST (<expression>)` operator to require an arbitrary number of elements
+to satisfy the condition to evaluate to `true`. You can use a static number or
+calculate it dynamically using an expression.
+
+```aql
+[ 1, 2, 3 ] ALL IN [ 2, 3, 4 ] // false
+[ 1, 2, 3 ] ALL IN [ 1, 2, 3 ] // true
+[ 1, 2, 3 ] NONE IN [ 3 ] // false
+[ 1, 2, 3 ] NONE IN [ 23, 42 ] // true
+[ 1, 2, 3 ] ANY IN [ 4, 5, 6 ] // false
+[ 1, 2, 3 ] ANY IN [ 1, 42 ] // true
+[ 1, 2, 3 ] ANY == 2 // true
+[ 1, 2, 3 ] ANY == 4 // false
+[ 1, 2, 3 ] ANY > 0 // true
+[ 1, 2, 3 ] ANY <= 1 // true
+[ 1, 2, 3 ] NONE < 99 // false
+[ 1, 2, 3 ] NONE > 10 // true
+[ 1, 2, 3 ] ALL > 2 // false
+[ 1, 2, 3 ] ALL > 0 // true
+[ 1, 2, 3 ] ALL >= 3 // false
+["foo", "bar"] ALL != "moo" // true
+["foo", "bar"] NONE == "bar" // false
+["foo", "bar"] ANY == "foo" // true
+
+[ 1, 2, 3 ] AT LEAST (2) IN [ 2, 3, 4 ] // true
+["foo", "bar"] AT LEAST (1+1) == "foo" // false
+```
+
+Note that these operators do not utilize indexes in regular queries.
+The operators are also supported in [SEARCH expressions](high-level-operations/search.md),
+where ArangoSearch's indexes can be utilized. The semantics differ however, see
+[AQL `SEARCH` operation](high-level-operations/search.md#array-comparison-operators).
+
+## Logical operators
+
+The following logical operators are supported in AQL:
+
+- `&&` logical and operator
+- `||` logical or operator
+- `!` logical not/negation operator
+
+AQL also supports the following alternative forms for the logical operators:
+
+- `AND` logical and operator
+- `OR` logical or operator
+- `NOT` logical not/negation operator
+
+The alternative forms are aliases and functionally equivalent to the regular
+operators.
+
+The two-operand logical operators in AQL are executed with short-circuit
+evaluation (except if one of the operands is or includes a subquery. In this
+case, the subquery is pulled out and evaluated before the logical operator).
+
+The result of the logical operators in AQL is defined as follows:
+
+- `lhs && rhs` returns `lhs` if it is `false` or would be `false` when converted
+  to a boolean. If `lhs` is `true` or would be `true` when converted to a boolean,
+  `rhs` is returned.
+- `lhs || rhs` returns `lhs` if it is `true` or would be `true` when converted
+  to a boolean. If `lhs` is `false` or would be `false` when converted to a boolean,
+  `rhs` is returned.
+- `! value` returns the negated value of `value` converted to a boolean
+
+```aql
+u.age > 15 && u.address.city != ""
+true || false
+NOT u.isInvalid
+1 || ! 0
+```
+
+Passing non-boolean values to a logical operator is allowed. Any non-boolean operands
+are cast to boolean implicitly by the operator, without making the query abort.
+ +The *conversion to a boolean value* works as follows: +- `null` is converted to `false` +- boolean values remain unchanged +- all numbers unequal to zero are `true`, zero is `false` +- an empty string is `false`, all other strings are `true` +- arrays (`[ ]`) and objects / documents (`{ }`) are `true`, regardless of their contents + +The result of *logical and* and *logical or* operations can now have any data +type and is not necessarily a boolean value. + +For example, the following logical operations return boolean values: + +```aql +25 > 1 && 42 != 7 // true +22 IN [ 23, 42 ] || 23 NOT IN [ 22, 7 ] // true +25 != 25 // false +``` + +… whereas the following logical operations do not return boolean values: + +```aql + 1 || 7 // 1 +null || "foo" // "foo" +null && true // null +true && 23 // 23 +``` + +## Arithmetic operators + +Arithmetic operators perform an arithmetic operation on two numeric +operands. The result of an arithmetic operation is again a numeric value. + +AQL supports the following arithmetic operators: + +- `+` addition +- `-` subtraction +- `*` multiplication +- `/` division +- `%` modulus + +Unary plus and unary minus are supported as well: + +```aql +LET x = -5 +LET y = 1 +RETURN [-x, +y] +// [5, 1] +``` + +For exponentiation, there is a [numeric function](functions/numeric.md#pow) `POW()`. +The syntax `base ** exp` is not supported. + +For string concatenation, you must use the [`CONCAT()` string function](functions/string.md#concat). +Combining two strings with a plus operator (`"foo" + "bar"`) does not work! +Also see [Common Errors](common-errors.md). + +```aql +1 + 1 +33 - 99 +12.4 * 4.5 +13.0 / 0.1 +23 % 7 +-15 ++9.99 +``` + +The arithmetic operators accept operands of any type. Passing non-numeric values to an +arithmetic operator casts the operands to numbers using the type casting rules +applied by the [`TO_NUMBER()`](functions/type-check-and-cast.md#to_number) function: + +- `null` is converted to `0` +- `false` is converted to `0`, `true` is converted to `1` +- a valid numeric value remains unchanged, but NaN and Infinity are converted to `0` +- string values are converted to a number if they contain a valid string representation + of a number. Any whitespace at the start or the end of the string is ignored. Strings + with any other contents are converted to the number `0` +- an empty array is converted to `0`, an array with one member is converted to the numeric + representation of its sole member. Arrays with more members are converted to the number + `0`. +- objects / documents are converted to the number `0`. + +An arithmetic operation that produces an invalid value, such as `1 / 0` +(division by zero), produces a result value of `null`. The query is not +aborted, but you may see a warning. + +```aql + 1 + "a" // 1 + 1 + "99" // 100 + 1 + null // 1 +null + 1 // 1 + 3 + [ ] // 3 + 24 + [ 2 ] // 26 + 24 + [ 2, 4 ] // 24 + 25 - null // 25 + 17 - true // 16 + 23 * { } // 0 + 5 * [ 7 ] // 35 + 24 / "12" // 2 + 1 / 0 // null (with a 'division by zero' warning) +``` + +## Ternary operator + +AQL also supports a ternary operator that can be used for conditional +evaluation. The ternary operator expects a boolean condition as its first +operand, and it returns the result of the second operand if the condition +evaluates to true, and the third operand otherwise. +You may use [subqueries](fundamentals/subqueries.md) as operands. + +In the following example, the expression returns `u.userId` if `u.age` is +greater than 15 or if `u.active` is `true`. 
Otherwise it returns `null`: + +```aql +u.age > 15 || u.active == true ? u.userId : null +``` + +There is also a shortcut variant of the ternary operator with just two +operands. This variant can be used if the expression for the boolean +condition and the return value should be the same. + +In the following example, the expression evaluates to `u.value` if `u.value` is +truthy. Otherwise, a fixed string is given back: + +```aql +u.value ? : 'value is null, 0 or not present' +``` + +The condition (here just `u.value`) is only evaluated once if the second +operand between `?` and `:` is omitted, whereas it would be evaluated twice +in case of `u.value ? u.value : 'value is null'`. + +{{< info >}} +Subqueries that are used inside expressions are pulled out of these +expressions and executed beforehand. That means that subqueries do not +participate in lazy evaluation of operands, for example, in the +ternary operator. Also see +[evaluation of subqueries](fundamentals/subqueries.md#evaluation-of-subqueries). +{{< /info >}} + +## Range operator + +AQL supports expressing simple numeric ranges with the `..` operator. +This operator can be used to easily iterate over a sequence of numeric +values. + +The `..` operator produces an array of the integer values in the +defined range, with both bounding values included. + +```aql +2010..2013 +``` + +The above example produces the following result: + +```json +[ 2010, 2011, 2012, 2013 ] +``` + +Using the range operator is equivalent to writing an array with the integer +values in the range specified by the bounds of the range. If the bounds of +the range operator are non-integers, they are converted to integer values first. + +There is also a [`RANGE()` function](functions/numeric.md#range). + +## Array operators + +AQL provides different array operators: + +- `[n]` to [access the array element](#indexed-value-access) at index `n` +- `[*]` for [expanding array variables](#array-expansion) +- `[**]`, `[***]` etc. for [flattening arrays](#array-contraction) +- `[* ...]`, `[** ...]` etc. for filtering, limiting, and projecting arrays using + [inline expressions](#inline-expressions) +- `[? ...]` for nested search, known as the [question mark operator](#question-mark-operator) + +### Indexed value access + +You can access individual array elements by their position using the `[]` accessor. +The position is called the *index* and starts at `0`. + +When specifying an index, use a numeric integer value. You can use negative +index values to access array elements starting from the end of the array. +This is convenient if the length of the array is unknown and you want to access +elements at the end of the array. + +You can also use an expression and calculate the index of an element. + +{{< info >}} +If you try to access an array element with an out-of-bounds index (after the last +element or before the first element), the result is a `null` value without +raising an error or warning. +{{< /info >}} + +```aql +LET friends = [ "tina", "helga", "alfred" ] + +friends[0] // access 1st array element (elements start at index 0) +friends[2] // access 3rd array element + +friends[-1] // access last array element +friends[-2] // access second to last array element + +friends[LENGTH(friends) / 2] // access array element in the middle (floored) +``` + +### Array expansion + +In order to access a named attribute from all elements in an array easily, AQL +offers the shortcut operator `[*]` for array variable expansion. 
+
+Using the `[*]` operator with an array variable will iterate over all elements
+in the array, thus allowing you to access a particular attribute of each element. It is
+required that the expanded variable is an array. The result of the `[*]`
+operator is again an array.
+
+To demonstrate the array expansion operator, let's continue with the following three
+example *users* documents:
+
+```json
+[
+  {
+    "name": "john",
+    "age": 35,
+    "friends": [
+      { "name": "tina", "age": 43 },
+      { "name": "helga", "age": 52 },
+      { "name": "alfred", "age": 34 }
+    ]
+  },
+  {
+    "name": "yves",
+    "age": 24,
+    "friends": [
+      { "name": "sergei", "age": 27 },
+      { "name": "tiffany", "age": 25 }
+    ]
+  },
+  {
+    "name": "sandra",
+    "age": 40,
+    "friends": [
+      { "name": "bob", "age": 32 },
+      { "name": "elena", "age": 48 }
+    ]
+  }
+]
+```
+
+With the `[*]` operator it becomes easy to query just the names of the
+friends for each user:
+
+```aql
+FOR u IN users
+  RETURN { name: u.name, friends: u.friends[*].name }
+```
+
+This will produce:
+
+```json
+[
+  { "name" : "john", "friends" : [ "tina", "helga", "alfred" ] },
+  { "name" : "yves", "friends" : [ "sergei", "tiffany" ] },
+  { "name" : "sandra", "friends" : [ "bob", "elena" ] }
+]
+```
+
+This is a shortcut for the longer, semantically equivalent query:
+
+```aql
+FOR u IN users
+  RETURN { name: u.name, friends: (FOR f IN u.friends RETURN f.name) }
+```
+
+### Array contraction
+
+In order to collapse (or flatten) results in nested arrays, AQL provides the `[**]`
+operator. It works similarly to the `[*]` operator, but additionally collapses nested
+arrays.
+
+How many levels are collapsed is determined by the number of asterisk characters used.
+`[**]` collapses one level of nesting (just like `FLATTEN(array)` or `FLATTEN(array, 1)`
+would do), `[***]` collapses two levels (the equivalent of `FLATTEN(array, 2)`), and
+so on.
+
+Let's compare the array expansion operator with an array contraction operator.
+For example, the following query produces an array of friend names per user:
+
+```aql
+FOR u IN users
+  RETURN u.friends[*].name
+```
+
+As we have multiple users, the overall result is a nested array:
+
+```json
+[
+  [
+    "tina",
+    "helga",
+    "alfred"
+  ],
+  [
+    "sergei",
+    "tiffany"
+  ],
+  [
+    "bob",
+    "elena"
+  ]
+]
+```
+
+If the goal is to get rid of the nested array, we can apply the `[**]` operator on the
+result. But simply appending `[**]` to the query won't help, because *u.friends*
+is not a nested (multi-dimensional) array, but a simple (one-dimensional) array. Still,
+the `[**]` operator can be used if it has access to a multi-dimensional nested result.
+
+We can extend the above query as follows and still create the same nested result:
+
+```aql
+RETURN (
+  FOR u IN users RETURN u.friends[*].name
+)
+```
+
+By appending the `[**]` operator at the end of the query...
+
+```aql
+RETURN (
+  FOR u IN users RETURN u.friends[*].name
+)[**]
+```
+
+... the query result becomes:
+
+```json
+[
+  [
+    "tina",
+    "helga",
+    "alfred",
+    "sergei",
+    "tiffany",
+    "bob",
+    "elena"
+  ]
+]
+```
+
+Note that the elements are not de-duplicated. For a flat array with only unique
+elements, a combination of [`UNIQUE()`](functions/array.md#unique) and
+[`FLATTEN()`](functions/array.md#flatten) is advisable.
+
+### Inline expressions
+
+It is possible to filter elements while iterating over an array, to limit the number
+of returned elements, and to create a projection using the current array element.
+Sorting is not supported by this shorthand form.
+
+These inline expressions can follow array expansion and contraction operators
+`[* ...]`, `[** ...]` etc. The keywords `FILTER`, `LIMIT` and `RETURN`
+must occur in this order if they are used in combination, and can only occur once:
+
+```aql
+anyArray[* FILTER conditions LIMIT skip,limit RETURN projection]
+```
+
+Example with nested numbers and array contraction:
+
+```aql
+LET arr = [ [ 1, 2 ], 3, [ 4, 5 ], 6 ]
+RETURN arr[** FILTER CURRENT % 2 == 0]
+```
+
+All even numbers are returned in a flat array:
+
+```json
+[
+  [ 2, 4, 6 ]
+]
+```
+
+Complex example with multiple conditions, limit and projection:
+
+```aql
+FOR u IN users
+  RETURN {
+    name: u.name,
+    friends: u.friends[* FILTER CONTAINS(CURRENT.name, "a") AND CURRENT.age > 40
+      LIMIT 2
+      RETURN CONCAT(CURRENT.name, " is ", CURRENT.age)
+    ]
+  }
+```
+
+No more than two computed strings based on *friends* with an `a` in their name and
+older than 40 years are returned per user:
+
+```json
+[
+  {
+    "name": "john",
+    "friends": [
+      "tina is 43",
+      "helga is 52"
+    ]
+  },
+  {
+    "name": "sandra",
+    "friends": [
+      "elena is 48"
+    ]
+  },
+  {
+    "name": "yves",
+    "friends": []
+  }
+]
```

+
+#### Inline filter
+
+To return only the names of friends that have an *age* value
+higher than the user herself, an inline `FILTER` can be used:
+
+```aql
+FOR u IN users
+  RETURN { name: u.name, friends: u.friends[* FILTER CURRENT.age > u.age].name }
+```
+
+The pseudo-variable *CURRENT* can be used to access the current array element.
+The `FILTER` condition can refer to *CURRENT* or any variables valid in the
+outer scope.
+
+#### Inline limit
+
+The number of elements returned can be restricted with `LIMIT`. It works the same
+as the [limit operation](high-level-operations/limit.md). `LIMIT` must come after `FILTER`
+and before `RETURN`, if they are present.
+
+```aql
+FOR u IN users
+  RETURN { name: u.name, friends: u.friends[* LIMIT 1].name }
+```
+
+The above example returns one friend each:
+
+```json
+[
+  { "name": "john", "friends": [ "tina" ] },
+  { "name": "sandra", "friends": [ "bob" ] },
+  { "name": "yves", "friends": [ "sergei" ] }
+]
+```
+
+A number of elements can also be skipped and up to *n* returned:
+
+```aql
+FOR u IN users
+  RETURN { name: u.name, friends: u.friends[* LIMIT 1,2].name }
+```
+
+The example query skips the first friend and returns two friends at most
+per user:
+
+```json
+[
+  { "name": "john", "friends": [ "helga", "alfred" ] },
+  { "name": "sandra", "friends": [ "elena" ] },
+  { "name": "yves", "friends": [ "tiffany" ] }
+]
+```
+
+#### Inline projection
+
+To return a projection of the current element, use `RETURN`. If a `FILTER` is
+also present, `RETURN` must come later.
+
+```aql
+FOR u IN users
+  RETURN u.friends[* RETURN CONCAT(CURRENT.name, " is a friend of ", u.name)]
+```
+
+The above will return:
+
+```json
+[
+  [
+    "tina is a friend of john",
+    "helga is a friend of john",
+    "alfred is a friend of john"
+  ],
+  [
+    "sergei is a friend of yves",
+    "tiffany is a friend of yves"
+  ],
+  [
+    "bob is a friend of sandra",
+    "elena is a friend of sandra"
+  ]
+]
+```
+
+### Question mark operator
+
+You can use the `[? ... ]` operator on arrays to check whether the elements
+fulfill certain criteria, and you can specify how often they should be satisfied.
+The operator is similar to an inline filter but with an additional length check,
+and it evaluates to `true` or `false`.
+
+The following example shows how to check whether exactly two of the numbers in
+the array are even:
+
+```aql
+LET arr = [ 1, 2, 3, 4 ]
+RETURN arr[? 2 FILTER CURRENT % 2 == 0] // true
+```
+
+The number `2` after the `?` is the quantifier. It is optional and defaults to
+`ANY`. The following quantifiers are supported:
+
+- Integer numbers for exact quantities (e.g. `2`)
+- Number ranges for a quantity between the two values (e.g. `2..3`)
+- `NONE` (equivalent to `0`)
+- `ANY`
+- `ALL`
+- `AT LEAST`
+
+The quantifier needs to be followed by a `FILTER` operation if you want to specify
+conditions. You can refer to the current array element via the `CURRENT`
+pseudo-variable in the filter expression. If you leave out the quantifier and the
+`FILTER` operation (only `arr[?]`), then it is only checked whether `arr` is an
+array and whether it has at least one element.
+
+The question mark operator is a shorthand for an inline filter with a
+surrounding length check. The following table compares both variants:
+
+| Question mark operator | Inline filter with length check |
+|:-----------------------|:--------------------------------|
+| `arr[? <number> FILTER <conditions>]` | `LENGTH(arr[* FILTER <conditions>]) == <number>` |
+| `arr[? <min>..<max> FILTER <conditions>]` | `IN_RANGE(LENGTH(arr[* FILTER <conditions>]), <min>, <max>, true, true)` |
+| `arr[? NONE FILTER <conditions>]` | `LENGTH(arr[* FILTER <conditions>]) == 0` |
+| `arr[? ANY FILTER <conditions>]` | `LENGTH(arr[* FILTER <conditions>]) > 0` |
+| `arr[? ALL FILTER <conditions>]` | `LENGTH(arr[* FILTER <conditions>]) == LENGTH(arr)` |
+| `arr[? AT LEAST (<number>) FILTER <conditions>]` | `LENGTH(arr[* FILTER <conditions>]) >= <number>` |
+| `arr[?]` | `LENGTH(arr[*]) > 0` |
+{.fixed}
+
+The question mark operator can be used for nested search (Enterprise Edition only):
+- [Nested search with ArangoSearch](../index-and-search/arangosearch/nested-search.md) using Views
+- Nested search using [Inverted indexes](../index-and-search/indexing/working-with-indexes/inverted-indexes.md#nested-search-enterprise-edition)
+
+## Object operators
+
+- `.` and `[expr]` for [accessing an object attribute](#attribute-access)
+
+### Attribute access
+
+You can access individual object attributes by their names using the
+dot accessor `.` and the square bracket accessor `[]`.
+
+The dot accessor lets you specify the attribute name as an unquoted string.
+This is only possible if the attribute name would be valid as a
+[variable name](fundamentals/syntax.md#variable-names). Otherwise, you need to
+quote the name with backticks or forward ticks, or use the square bracket accessor.
+
+You can also use the dot accessor together with a [bind parameter](fundamentals/bind-parameters.md)
+to select an attribute or sub-attribute.
+
+```aql
+LET ob = { name: "sandra", "with space": true }
+
+LET unquoted = ob.name
+
+LET quoted_1 = ob.`with space`
+LET quoted_2 = ob.´with space´
+
+LET bindvar = ob.@attr
+```
+
+The square bracket accessor lets you specify an expression to select an attribute.
+This is usually a quoted string literal, but you can also calculate the name
+dynamically using an arbitrary expression.
+
+You can also use the square bracket accessor together with a
+[bind parameter](fundamentals/bind-parameters.md) to select an attribute.
+
+```aql
+LET ob = { name: "sandra", "with 2 spaces": true }
+
+LET literal_1 = ob["name"]
+LET literal_2 = ob["with 2 spaces"]
+
+LET attribute = "name"
+LET variable = ob[attribute]
+
+LET expression = ob[CONCAT_SEPARATOR(" ", "with", 1+1, "spaces")]
+
+LET bindvar = ob[@attr]
+```
+
+{{< info >}}
+If you try to access a non-existing attribute in one way or another, the result
+is a `null` value without raising an error or warning.
+
+The question mark operator can be used for nested search (Enterprise Edition only):
+- [Nested search with ArangoSearch](../index-and-search/arangosearch/nested-search.md) using Views
+- Nested search using [Inverted indexes](../index-and-search/indexing/working-with-indexes/inverted-indexes.md#nested-search-enterprise-edition)
+
+## Object operators
+
+- `.` and `[expr]` for [accessing an object attribute](#attribute-access)
+
+### Attribute access
+
+You can access individual object attributes by their names using the
+dot accessor `.` and the square bracket accessor `[]`.
+
+The dot accessor lets you specify the attribute name as an unquoted string.
+This is only possible if the attribute name would be valid as a
+[variable name](fundamentals/syntax.md#variable-names). Otherwise, you need to
+quote the name with backticks or forward ticks, or use the square bracket accessor.
+
+You can also use the dot accessor together with a [bind parameter](fundamentals/bind-parameters.md)
+to select an attribute or sub-attribute.
+
+```aql
+LET ob = { name: "sandra", "with space": true }
+
+LET unquoted = ob.name
+
+LET quoted_1 = ob.`with space`
+LET quoted_2 = ob.´with space´
+
+LET bindvar = ob.@attr
+```
+
+The square bracket accessor lets you specify an expression to select an attribute.
+This is usually a quoted string literal, but you can also calculate the name
+dynamically using an arbitrary expression.
+
+You can also use the square bracket accessor together with a
+[bind parameter](fundamentals/bind-parameters.md) to select an attribute.
+
+```aql
+LET ob = { name: "sandra", "with 2 spaces": true }
+
+LET literal_1 = ob["name"]
+LET literal_2 = ob["with 2 spaces"]
+
+LET attribute = "name"
+LET variable = ob[attribute]
+
+LET expression = ob[CONCAT_SEPARATOR(" ", "with", 1+1, "spaces")]
+
+LET bindvar = ob[@attr]
+```
+
+{{< info >}}
+If you try to access a non-existing attribute in one way or another, the result
+is a `null` value without raising an error or warning.
+{{< /info >}}
+
+## Operator precedence
+
+The operator precedence in AQL is similar to that of other familiar languages
+(highest precedence first):
+
+| Operator(s) | Description |
+|:---------------------|:------------|
+| `::` | scope (user-defined AQL functions) |
+| `[*]` | array expansion |
+| `[]` | indexed value access (arrays), attribute access (objects) |
+| `.` | attribute access (objects) |
+| `()` | function call |
+| `!`, `NOT`, `+`, `-` | unary not (logical negation), unary plus, unary minus |
+| `*`, `/`, `%` | multiplication, division, modulus |
+| `+`, `-` | addition, subtraction |
+| `..` | range operator |
+| `<`, `<=`, `>=`, `>` | less than, less equal, greater equal, greater than |
+| `IN`, `NOT IN` | in operator, not in operator |
+| `==`, `!=`, `LIKE`, `NOT LIKE`, `=~`, `!~` | equality, inequality, wildcard match, wildcard non-match, regex match, regex non-match |
+| `AT LEAST` | at least modifier (array comparison operator, question mark operator) |
+| `OUTBOUND`, `INBOUND`, `ANY`, `ALL`, `NONE` | graph traversal directions, array comparison operators, question mark operator |
+| `&&`, `AND` | logical and |
+| `\|\|`, `OR` | logical or |
+| `INTO` | into operator (INSERT / UPDATE / REPLACE / REMOVE / COLLECT operations) |
+| `WITH` | with operator (WITH / UPDATE / REPLACE / COLLECT operations) |
+| `=` | variable assignment (LET / COLLECT operations, AGGREGATE / PRUNE clauses) |
+| `?`, `:` | ternary operator, object literals |
+| `DISTINCT` | distinct modifier (RETURN operations) |
+| `,` | comma separator |
+
+The parentheses `(` and `)` can be used to enforce a different operator
+evaluation order.
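+
+As a quick illustration of the precedence rules and parentheses, the following
+sketch uses literal numbers only:
+
+```aql
+// multiplication binds tighter than addition; parentheses override this
+RETURN [ 2 + 3 * 4, (2 + 3) * 4 ] // the query returns [ [ 14, 20 ] ]
+```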
diff --git a/site/content/arangodb/oem/aql/user-defined-functions.md b/site/content/arangodb/oem/aql/user-defined-functions.md
new file mode 100644
index 0000000000..78376f0d24
--- /dev/null
+++ b/site/content/arangodb/oem/aql/user-defined-functions.md
@@ -0,0 +1,405 @@
+---
+title: Extending AQL with user-defined functions
+menuTitle: User-defined Functions
+weight: 45
+description: >-
+  You can write UDFs in JavaScript to extend AQL or to simplify queries
+---
+AQL comes with a [built-in set of functions](functions/_index.md), but it is
+not a fully-featured programming language. To add missing functionality or to
+simplify queries, you may write your own user-defined functions (**UDFs**) in
+JavaScript and make them available in AQL.
+
+## Known Limitations
+
+{{< warning >}}
+UDFs can have serious effects on the performance of your queries and the resource
+usage in ArangoDB. Especially in cluster setups, they should not be used on
+large amounts of data, because the data needs to be sent over the network back
+and forth between _DB-Servers_ and _Coordinators_, potentially adding a lot of
+latency. This can be mitigated by using very selective `FILTER`s before calls
+to UDFs.
+{{< /warning >}}
+
+Since the optimizer doesn't know anything about the nature of your function,
+**the optimizer can't use indexes for UDFs**. So you should never lean on a UDF
+as the primary criterion for a `FILTER` statement to reduce your query result set.
+Instead, put another `FILTER` statement in front of it. You should make sure
+that this [**`FILTER` statement** is effective](execution-and-performance/query-optimization.md)
+in reducing the query result before passing it to your UDF.
+
+The rule of thumb is: the closer the UDF is to your final `RETURN` statement
+(or maybe even inside it), the better.
+
+When used in clusters, UDFs are always executed on a
+[Coordinator](../deploy/cluster/_index.md).
+It is not possible to execute UDFs on DB-Servers, as no JavaScript execution
+engine is available on DB-Servers. Queries that would push UDF execution to
+DB-Servers are aborted with a parse error. This includes using UDFs in traversal
+`PRUNE` conditions, as well as in `FILTER` conditions that can be moved into the
+traversal execution on a DB-Server. These limitations also apply to the
+single server deployment mode, to keep the differences to cluster deployments
+minimal.
+
+As UDFs are written in JavaScript, each query that executes a UDF acquires
+one V8 context to execute the UDFs in. V8 contexts can be re-used across
+subsequent queries, but when UDF-invoking queries run in parallel, they each
+require a dedicated V8 context.
+
+Because UDFs use the V8 JavaScript engine, the engine's default memory limit
+of 512 MB is applied.
+
+Using UDFs in clusters may thus result in a higher resource allocation
+in terms of used V8 contexts and server threads. If you run out
+of these resources, your query may abort with a
+[**cluster backend unavailable**](../develop/error-codes.md) error.
+
+To overcome these limitations, you may want to increase the
+[number of available V8 contexts](../components/arangodb-server/options.md#--javascriptv8-contexts)
+(at the expense of increased memory usage) and the
+[number of available server threads](../components/arangodb-server/options.md#--servermaximal-threads).
+
+In addition, modification of global JavaScript variables from inside UDFs is
+unsupported, as is reading or changing the data of any collection or running
+queries from inside an AQL user function.
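+
+As an illustration of this advice, the following sketch applies a cheap and
+possibly index-backed `FILTER` first, so that a hypothetical UDF
+`MYGROUP::SCORE()` only runs on the reduced result set (the collection and
+attribute names are made up):
+
+```aql
+FOR doc IN collection
+  FILTER doc.status == "active"      // effective filter, can use an index
+  FILTER MYGROUP::SCORE(doc) >= 0.5  // the UDF only sees the remaining documents
+  RETURN doc
+```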
+
+## Naming
+
+AQL functions that are implemented with JavaScript are always in a namespace.
+To register a user-defined AQL function, you need to give it a name with a
+namespace. The `::` symbol is used as the namespace separator, for example,
+`MYGROUP::MYFUNC`. You can use one or multiple levels of namespaces to create
+meaningful function groups.
+
+The names of user-defined functions are case-insensitive, like all function
+names in AQL.
+
+To refer to and call user-defined functions in AQL queries, you need to use the
+fully qualified name with the namespaces:
+
+```aql
+MYGROUP::MYFUNC()
+MYFUNCTIONS::MATH::RANDOM()
+```
+
+ArangoDB's built-in AQL functions are all implemented in C++ and are not in a
+namespace, except for the internal `V8()` function, which resides in the `_aql`
+namespace. It is the default namespace, which means that you can use the
+unqualified name of the function (without `_aql::`) to refer to it. Note that
+you cannot add your own functions to this namespace.
+
+## Variables and side effects
+
+User functions can take any number of input arguments and should
+provide one result via a `return` statement. User functions should be kept
+purely functional and thus free of side effects and state modification.
+
+{{< warning >}}
+Modification of global variables is unsupported, as is reading or changing
+the data of any collection or running queries from inside an AQL user function.
+{{< /warning >}}
+
+User function code is late-bound and may thus not rely on any variables
+that existed at the time of declaration. If user function code requires
+access to any external data, it must take care to set up the data by
+itself.
+
+All variables specific to an AQL user function should be introduced with the
+`var`, `let`, or `const` keywords in order to not accidentally access already
+defined variables from outer scopes. Not using a declaration keyword for your
+own variables may cause side effects when executing the function.
+
+Here is an example that may modify the outer scope variables `i` and `name`,
+making the function **not** side-effect-free:
+
+```js
+function (values) {
+  for (i = 0; i < values.length; ++i) {
+    name = values[i];
+    if (name === "foo") {
+      return i;
+    }
+  }
+  return null;
+}
+```
+
+The above function can be made free of side effects by using the `var`, `let`,
+or `const` keywords, so that the variables become function-local:
+
+```js
+function (values) {
+  for (let i = 0; i < values.length; ++i) {
+    let name = values[i];
+    if (name === "foo") {
+      return i;
+    }
+  }
+  return null;
+}
+```
+
+## Input parameters
+
+In order to return a result, a user function should use a `return` instruction
+rather than modifying its input parameters.
+
+AQL user functions are allowed to modify input parameters that are null,
+boolean, numeric, or string values. Modifying these input parameter types
+inside a user function should be free of side effects. However, user functions
+should not modify input parameters that are arrays or objects, as these are
+passed by reference, and modifying them may change variables and state outside
+of the user function itself.
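+
+As a minimal sketch of this rule, a function that needs to reorder an array
+argument can work on a copy instead of mutating the caller's value:
+
+```js
+function (values) {
+  "use strict";
+  // shallow copy, so the array passed into the function is not modified
+  const copy = values.slice();
+  copy.sort();
+  return copy;
+}
+```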
+
+## Return values
+
+User functions must only return primitive types (i.e. `null`, boolean
+values, numeric values, string values) or aggregate types (arrays or
+objects) composed of these types.
+Returning any other JavaScript object type (Function, Date, RegExp, etc.) from
+a user function may lead to undefined behavior and should be avoided.
+
+## Enforcing strict mode
+
+By default, any user function code is executed in *sloppy mode*. In order to
+make a user function run in strict mode, use `"use strict"` explicitly inside
+the user function:
+
+```js
+function (values) {
+  "use strict"
+
+  for (let i = 0; i < values.length; ++i) {
+    let name = values[i];
+    if (name === "foo") {
+      return i;
+    }
+  }
+  return null;
+}
+```
+
+Any violation of the strict mode triggers a runtime error.
+
+## Registering and unregistering user functions
+
+User-defined functions (UDFs) can be registered in the selected database
+using the `@arangodb/aql/functions` module as follows:
+
+```js
+var aqlfunctions = require("@arangodb/aql/functions");
+```
+
+To register a function, the fully qualified function name plus the
+function code must be specified. This can easily be done in
+[arangosh](../components/tools/arangodb-shell/_index.md). The
+[HTTP interface](../develop/http-api/queries/user-defined-aql-functions.md)
+also offers endpoints for managing user functions.
+
+In a cluster setup, make sure to connect to a Coordinator to manage the UDFs.
+
+Documents in the `_aqlfunctions` collection (or any other system collection)
+should not be accessed directly, but only via the dedicated interfaces.
+Otherwise, you might see caching issues or accidentally break something.
+The interfaces ensure the correct format of the documents and invalidate
+the UDF cache.
+
+### Registering an AQL user function
+
+For testing, it may be sufficient to directly type the function code in the shell.
+To manage more complex code, you may write it in the code editor of your choice
+and save it as a file. For example:
+
+```js
+/* path/to/file.js */
+'use strict';
+
+function greeting(name) {
+  if (name === undefined) {
+    name = "World";
+  }
+  return `Hello ${name}!`;
+}
+
+module.exports = greeting;
+```
+
+Then require it in the shell in order to register a user-defined function:
+
+```js
+arangosh> var func = require("path/to/file.js");
+arangosh> aqlfunctions.register("HUMAN::GREETING", func, true);
+```
+
+Note that a return value of `false` means that the function `HUMAN::GREETING`
+was newly created, not that it failed to register. `true` is returned
+if a function of that name existed before and was just updated.
+
+`aqlfunctions.register(name, code, isDeterministic)`
+
+Registers an AQL user function, identified by a fully qualified function
+name. The function code in `code` must be specified as a JavaScript
+function or a string representation of a JavaScript function.
+If the function code in `code` is passed as a string, it is required that
+the string evaluates to a JavaScript function definition.
+
+If a function identified by `name` already exists, the previous function
+definition is updated. Please also make sure that the function code
+does not violate the conventions for AQL functions, in particular with regard
+to the [naming](#naming) and [side effects](#variables-and-side-effects).
+
+The `isDeterministic` attribute can be used to specify whether the
+function results are fully deterministic (i.e. they depend solely on the input
+and are the same for repeated calls with the same input values). It is not
+used at the moment but may be used for optimizations later.
+
+The registered function is stored in the selected database's system
+collection `_aqlfunctions`.
+
+The function returns `true` when it updates/replaces an existing AQL
+function of the same name, and `false` otherwise. It throws an exception
+if it detects syntactically invalid function code.
+
+**Examples**
+
+```js
+require("@arangodb/aql/functions").register("MYFUNCTIONS::TEMPERATURE::CELSIUSTOFAHRENHEIT",
+function (celsius) {
+  return celsius * 1.8 + 32;
+});
+```
+
+The function code is not executed in *strict mode* or *strong mode* by
+default. In order to make a user function run in strict mode, use
+`"use strict"` explicitly, e.g.:
+
+```js
+require("@arangodb/aql/functions").register("MYFUNCTIONS::TEMPERATURE::CELSIUSTOFAHRENHEIT",
+function (celsius) {
+  "use strict";
+  return celsius * 1.8 + 32;
+});
+```
+
+You can access the name under which the AQL function is registered by accessing
+the `name` property of `this` inside the JavaScript code:
+
+```js
+require("@arangodb/aql/functions").register("MYFUNCTIONS::TEMPERATURE::CELSIUSTOFAHRENHEIT",
+function (celsius) {
+  "use strict";
+  if (typeof celsius === "undefined") {
+    const error = require("@arangodb").errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH;
+    AQL_WARNING(error.code, require("util").format(error.message, this.name, 1, 1));
+  }
+  return celsius * 1.8 + 32;
+});
+```
+
+`AQL_WARNING()` is automatically available to the code of user-defined
+functions. The error code and message are retrieved via the `@arangodb` module.
+The *argument number mismatch* message has placeholders, which we can substitute
+using [format()](http://nodejs.org/api/util.html):
+
+```
+invalid number of arguments for function '%s()', expected number of arguments: minimum: %d, maximum: %d
+```
+
+In the example above, `%s` is replaced by `this.name` (the AQL function name),
+and both `%d` placeholders by `1` (the number of expected arguments). If you
+call the function without an argument, you see this:
+
+```js
+arangosh> db._query("RETURN MYFUNCTIONS::TEMPERATURE::CELSIUSTOFAHRENHEIT()")
+[object ArangoQueryCursor, count: 1, hasMore: false, warning: 1541 - invalid
+number of arguments for function 'MYFUNCTIONS::TEMPERATURE::CELSIUSTOFAHRENHEIT()',
+expected number of arguments: minimum: 1, maximum: 1]
+
+[
+  null
+]
```
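+
+Summing up registration, here is a small sketch of capturing the return value
+of `register()` in arangosh (the function name `MYGROUP::DOUBLE` is made up):
+
+```js
+var aqlfunctions = require("@arangodb/aql/functions");
+// true if an existing MYGROUP::DOUBLE was replaced, false if it was newly created
+var existedBefore = aqlfunctions.register("MYGROUP::DOUBLE", function (x) {
+  "use strict";
+  return x * 2;
+});
+```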
+
+### Deleting an existing AQL user function
+
+`aqlfunctions.unregister(name)`
+
+Unregisters an existing AQL user function, identified by the fully qualified
+function name.
+
+Trying to unregister a function that does not exist results in an
+exception.
+
+**Examples**
+
+```js
+require("@arangodb/aql/functions").unregister("MYFUNCTIONS::TEMPERATURE::CELSIUSTOFAHRENHEIT");
+```
+
+### Unregister group
+
+Delete a group of AQL user functions:
+
+`aqlfunctions.unregisterGroup(prefix)`
+
+Unregisters a group of AQL user functions, identified by a common function
+group prefix.
+
+This returns the number of functions unregistered.
+
+**Examples**
+
+```js
+require("@arangodb/aql/functions").unregisterGroup("MYFUNCTIONS::TEMPERATURE");
+
+require("@arangodb/aql/functions").unregisterGroup("MYFUNCTIONS");
+```
+
+### Listing all AQL user functions
+
+`aqlfunctions.toArray()`
+
+Returns all previously registered AQL user functions, with their fully
+qualified names and function code.
+
+---
+
+`aqlfunctions.toArray(prefix)`
+
+Returns all previously registered AQL user functions, restricted to a specified
+group of functions by specifying a group prefix.
+
+**Examples**
+
+To list all available user functions:
+
+```js
+require("@arangodb/aql/functions").toArray();
+```
+
+To list all available user functions in the *MYFUNCTIONS* namespace:
+
+```js
+require("@arangodb/aql/functions").toArray("MYFUNCTIONS");
+```
+
+To list all available user functions in the *MYFUNCTIONS::TEMPERATURE* namespace:
+
+```js
+require("@arangodb/aql/functions").toArray("MYFUNCTIONS::TEMPERATURE");
+```
+
+## Deployment Details
+
+Internally, UDFs are stored in the system collection `_aqlfunctions`
+of the selected database. When an AQL statement refers to such a UDF,
+it is loaded from that collection. The UDFs are exclusively
+available for queries in that particular database.
+
+Since the Coordinator doesn't have its own local collections, the `_aqlfunctions`
+collection is sharded across the cluster. Therefore (as usual), it has to be
+accessed through a Coordinator; you must not talk to the shards directly.
+Once a UDF is in the `_aqlfunctions` collection, it is available on all
+Coordinators without additional effort.
+
+Keep in mind that system collections are excluded from dumps created with
+[arangodump](../components/tools/arangodump/_index.md) by default.
+To include AQL UDFs in a dump, the dump needs to be started with
+the option *--include-system-collections true*.
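+
+As a quick end-to-end check, once the temperature conversion function from the
+examples above is registered, it can be called like any built-in function:
+
+```aql
+RETURN MYFUNCTIONS::TEMPERATURE::CELSIUSTOFAHRENHEIT(25) // returns [ 77 ]
+```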
diff --git a/site/content/arangodb/oem/components/_index.md b/site/content/arangodb/oem/components/_index.md new file mode 100644 index 0000000000..e5da4f23ad --- /dev/null +++ b/site/content/arangodb/oem/components/_index.md @@ -0,0 +1,6 @@ +--- +title: Components +menuTitle: Components +weight: 165 +description: '' +--- diff --git a/site/content/arangodb/oem/components/arangodb-server/_index.md b/site/content/arangodb/oem/components/arangodb-server/_index.md new file mode 100644 index 0000000000..44c9d1040c --- /dev/null +++ b/site/content/arangodb/oem/components/arangodb-server/_index.md @@ -0,0 +1,21 @@ +--- +title: ArangoDB Server +menuTitle: ArangoDB Server +weight: 170 +description: >- + The ArangoDB daemon (arangod) is the central server binary that can run in + different modes for a variety of setups like single server and clusters +--- +The ArangoDB server is the core component of ArangoDB. The executable file to +run it is named `arangod`. The `d` stands for daemon. A daemon is a long-running +background process that answers requests for services. + +The server process serves the various client connections to the server via the +TCP/HTTP protocol. It also provides a [web interface](../web-interface/_index.md). + +_arangod_ can run in different modes for a variety of setups like single server +and clusters. It differs between the [Community Edition](../../about/features/community-edition.md) +and [Enterprise Edition](../../about/features/enterprise-edition.md). + +See [Administration](../../operations/administration/_index.md) for server configuration +and [Deploy](../../deploy/_index.md) for operation mode details. diff --git a/site/content/arangodb/oem/components/arangodb-server/environment-variables.md b/site/content/arangodb/oem/components/arangodb-server/environment-variables.md new file mode 100644 index 0000000000..62f73290c9 --- /dev/null +++ b/site/content/arangodb/oem/components/arangodb-server/environment-variables.md @@ -0,0 +1,108 @@ +--- +title: ArangoDB Server environment variables +menuTitle: Environment variables +weight: 15 +description: >- + Environment variables used by `arangod` +--- +`arangod` inspects the following list of environment variables: + + - `ARANGODB_OVERRIDE_DETECTED_TOTAL_MEMORY` + + This variable can be used to override the automatic detection of the total + amount of RAM present on the system. One can specify a decimal number + (in bytes). Furthermore, numbers can have the following suffixes: + + - `TB`, `T`, `tb`, `t`: the number is multiplied by 1,000,000,000,000 (terabytes). + - `GB`, `G`, `gb`, `g`: the number is multiplied by 1,000,000,000 (gigabytes). + - `MB`, `M`, `mb`, `m`: the number is multiplied by 1,000,000 (megabytes). + - `KB`, `K`, `kb`, `k`: the number is multiplied by 1,000 (kilobytes). + - `TIB`, `TiB`, `tib`: the number is multiplied by 1,099,511,627,776 (tebibytes). + - `GIB`, `GiB`, `gib`: the number is multiplied by 1,073,741,824 (gibibytes). + - `MIB`, `MiB`, `mib`: the number is multiplied by 1,048,576 (mebibytes). + - `KIB`, `KiB`, `kib`: the number is multiplied by 1,024 (kibibytes). + - `B`, `b`: bytes + + The total amount of RAM detected is logged as an INFO message at + server start. If the variable is set, the overridden value is shown. + Various default sizes are calculated based on this value (e.g. the + RocksDB buffer cache size). + + Setting this option can in particular be useful in two cases: + + 1. 
If `arangod` is running in a container and its cgroup has a RAM
+      limitation, then one should specify this limitation in this
+      environment variable, since it is currently not automatically
+      detected.
+   2. If `arangod` is running alongside other services on the same
+      machine and thus shares the RAM with them, one should limit the
+      amount of memory using this environment variable.
+
+   Note that setting this environment variable mainly affects the default
+   values of startup options that relate to memory usage.
+   If the values of these startup options are explicitly set anyway, then
+   setting the environment variable has no effect.
+
+   For example, the default value for the RocksDB block cache size
+   (`--rocksdb.block-cache-size` startup option) depends on the amount of
+   available memory. If you set `ARANGODB_OVERRIDE_DETECTED_TOTAL_MEMORY=32GB`,
+   the default value for the block cache size is `(32GB - 2GB) * 0.3 = 9GB`.
+   However, if you set the `--rocksdb.block-cache-size` startup option explicitly
+   via a configuration file or via the command line, then the latter value is
+   used, and not the option's default value based on the
+   `ARANGODB_OVERRIDE_DETECTED_TOTAL_MEMORY` environment variable.
+
+ - `ARANGODB_OVERRIDE_DETECTED_NUMBER_OF_CORES`
+
+   This variable can be used to override the automatic detection of the
+   number of CPU cores present on the system.
+
+   The number of CPU cores detected is logged as an INFO message at
+   server start. If the variable is set, the overridden value is shown.
+   Various default values for threading are calculated based on this value.
+
+   Setting this option is useful if `arangod` is running in a container
+   or alongside other services on the same machine and shall not use
+   all available CPUs.
+
+ - `ARANGODB_OVERRIDE_CRASH_HANDLER`
+
+   This variable can be used to toggle the built-in crash handler in the
+   Linux builds of `arangod`. The crash handler is turned on by default
+   for Linux builds, and it can be turned off by setting this environment
+   variable to an empty string, `0`, or `off`.
+
+- `CACHE_OBLIVIOUS` _(introduced in v3.9.7, v3.10.3)_
+
+  If set to the string `true`, jemalloc allocates one additional page
+  (4096 bytes) for every allocation of 16384 or more bytes to change the
+  base address if it is not divisible by 4096. This can help the CPU caches if
+  the beginnings of such blocks are accessed a lot.
+
+  On the other hand, it increases the memory usage because of the page alignment.
+  The RocksDB buffer cache does most of its allocations with 16384 bytes,
+  which increases its RAM usage by 25%. Setting the option to `false` disables
+  the optimization, but the performance is expected to be the same for ArangoDB.
+
+  The default is `true` in 3.9 and 3.10 up to v3.10.3. From v3.10.4 onwards,
+  the default is `false`.
+
+  Also see the [jemalloc documentation](http://jemalloc.net/jemalloc.3.html#opt.cache_oblivious).
+
+- `TZ_DATA` _(introduced in v3.8.0)_
+
+  This variable can be used to specify the path to the directory containing
+  the timezone information database for ArangoDB. That directory is normally
+  named `tzdata` and is shipped with ArangoDB releases. It is normally not
+  required to set this environment variable, but it may be necessary in
+  unusual setups with non-conventional directory layouts and paths.
+
+- `IRESEARCH_TEXT_STOPWORD_PATH`
+
+  Path to a directory with stop word files for
+  [ArangoSearch Text Analyzers](../../index-and-search/analyzers.md#text).
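+
+For instance, a container deployment might cap the detected resources with the
+override variables described above before starting the server. This is a
+sketch; the configuration file path is made up for illustration:
+
+```
+export ARANGODB_OVERRIDE_DETECTED_TOTAL_MEMORY=4GiB
+export ARANGODB_OVERRIDE_DETECTED_NUMBER_OF_CORES=2
+arangod --configuration /etc/arangodb3/arangod.conf
+```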
+
+For Docker-specific environment variables, please refer to
+[Docker Hub](https://hub.docker.com/_/arangodb).
diff --git a/site/content/arangodb/oem/components/arangodb-server/ldap.md b/site/content/arangodb/oem/components/arangodb-server/ldap.md
new file mode 100644
index 0000000000..2fde26e69f
--- /dev/null
+++ b/site/content/arangodb/oem/components/arangodb-server/ldap.md
@@ -0,0 +1,563 @@
+---
+title: ArangoDB Server LDAP Options
+menuTitle: LDAP
+weight: 10
+description: >-
+  LDAP authentication options in the ArangoDB server
+---
+{{< tag "ArangoDB Enterprise Edition" "AMP" >}}
+
+## Basic Concepts
+
+The basic idea is that one can keep the user authentication setup for
+an ArangoDB instance (single server or cluster) outside of ArangoDB in an LDAP
+server. A crucial feature of this is that one can add and withdraw users
+and permissions by changing only the LDAP server and, in particular,
+without touching the ArangoDB instance. Changes become effective in
+ArangoDB within a few minutes.
+
+Since there are many different possible LDAP setups, we must support a
+variety of possibilities for authentication and authorization. Here is
+a short overview:
+
+To map ArangoDB user names to LDAP users, there are two authentication
+methods called "simple" and "search". In the "simple" method, the LDAP bind
+user is derived from the ArangoDB user name by prepending a prefix and
+appending a suffix. For example, a user "alice" could be mapped to the
+distinguished name `uid=alice,dc=arangodb,dc=com` to perform the LDAP
+bind and authentication.
+See [Simple authentication method](#simple-authentication-method)
+below for details and configuration options.
+
+In the "search" method, there are two phases. In Phase 1, a generic
+read-only admin LDAP user account is used to bind to the LDAP server
+first and search for an LDAP user matching the ArangoDB user name. In
+Phase 2, the actual authentication is then performed against the LDAP
+user that was found in Phase 1. Both methods are sensible and are
+recommended for production use.
+See [Search authentication method](#search-authentication-method)
+below for details and configuration options.
+
+Once the user is authenticated, there are two methods for
+authorization: (a) "roles attribute" and (b) "roles search".
+
+In method (a), ArangoDB acquires from the LDAP server a list of roles that
+the authenticated LDAP user has. The actual access rights to databases
+and collections for these roles are configured in ArangoDB itself.
+Users effectively have the union of all access rights of all roles
+they have. This method is probably the most common one for production use
+cases. It combines the advantages of managing users and roles outside of
+ArangoDB in the LDAP server with the fine-grained access control within
+ArangoDB for the individual roles. See [Roles attribute](#roles-attribute)
+below for details about method (a) and for the associated configuration
+options.
+
+Method (b) is very similar and only differs from (a) in the way the
+actual list of roles of a user is derived from the LDAP server.
+See [Roles search](#roles-search) below for details about method (b)
+and for the associated configuration options.
+
+## Fundamental options
+
+The fundamental options for specifying how to access the LDAP server are
+the following:
+
+  - `--ldap.enabled` is a boolean option which must be set to
+    `true` to activate the LDAP feature.
+  - `--ldap.server` is a string specifying the host name or IP address
+    of the LDAP server.
+  - `--ldap.port` is an integer specifying the port the LDAP server is
+    running on. The default is `389`.
+  - `--ldap.basedn` specifies the base distinguished name under which
+    the search takes place (can alternatively be set via `--ldap.url`).
+  - `--ldap.binddn` and `--ldap.bindpasswd` are the distinguished name and
+    password for a read-only LDAP user which ArangoDB can bind to in order
+    to search the LDAP server. Note that it is necessary to configure these
+    for both the "simple" and "search" authentication methods, since
+    even in the "simple" method, ArangoDB occasionally has to refresh
+    the authorization information from the LDAP server,
+    even if the user session persists and no new authentication is
+    needed. It is, however, allowed to leave both empty, but then the
+    LDAP server must be readable with anonymous access.
+  - `--ldap.refresh-rate` is a floating point value in seconds. The
+    default is 300, which means that ArangoDB refreshes the
+    authorization information for authenticated users after at most 5
+    minutes. This means that changes in the LDAP server, like removed
+    users or added or removed roles for a user, become effective after
+    at most 5 minutes.
+
+Note that the `--ldap.server` and `--ldap.port` options can
+alternatively be specified in the `--ldap.url` string together with
+other configuration options. For details, see the
+[LDAP URLs](#ldap-urls) section below.
+
+Here is an example of how to configure the connection to the LDAP server,
+with an anonymous bind:
+
+```
+--ldap.enabled=true \
+--ldap.server=ldap.arangodb.com \
+--ldap.basedn=dc=arangodb,dc=com
+```
+
+With this configuration, ArangoDB binds anonymously to the LDAP server
+on host `ldap.arangodb.com` on the default port 389 and executes all searches
+under the base distinguished name `dc=arangodb,dc=com`.
+
+If a dedicated user is needed to read from the LDAP server, it can be
+configured like this:
+
+```
+--ldap.enabled=true \
+--ldap.server=ldap.arangodb.com \
+--ldap.basedn=dc=arangodb,dc=com \
+--ldap.binddn=uid=arangoadmin,dc=arangodb,dc=com \
+--ldap.bindpasswd=supersecretpassword
+```
+
+The connection is identical, but the searches are executed with the
+given distinguished name in `binddn`.
+
+Note that the given user (or the anonymous one) needs at least read access on
+all user objects to find them, and in the case of the "roles search" method,
+also read access on the objects storing the roles.
+
+Up to this point, ArangoDB can connect to a given LDAP server,
+but it is not yet able to authenticate users properly with it.
+For this, pick one of the following two authentication methods.
+
+### LDAP URLs
+
+As an alternative, one can specify the values of multiple LDAP-related
+configuration options by specifying a single LDAP URL. Here is an example:
+
+```
+--ldap.url ldap://ldap.arangodb.com:1234/dc=arangodb,dc=com?uid?sub
+```
+
+This one option has the combined effect of setting the following:
+
+```
+--ldap.server=ldap.arangodb.com \
+--ldap.port=1234 \
+--ldap.basedn=dc=arangodb,dc=com \
+--ldap.searchAttribute=uid \
+--ldap.searchScope=sub
+```
+
+That is, the LDAP URL consists of the LDAP `server` and `port`, a `basedn`, a
+`searchAttribute`, and a `searchScope` which can be one of `base`, `one`, or
+`sub`. There is also the possibility to use the `ldaps` protocol as in:
+
+```
+--ldap.url ldaps://ldap.arangodb.com:636/dc=arangodb,dc=com?uid?sub
+```
+
+This does exactly the same as the example above, except that it uses the
+LDAP over TLS protocol. This is a non-standard method which does not
+involve using the STARTTLS protocol. Note that this does not work in the
+Windows version! We suggest using the `ldap` protocol and STARTTLS
+as described in the next section.
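+
+An LDAP URL can be combined with the other fundamental options from above.
+A sketch, reusing the hypothetical directory from the earlier examples:
+
+```
+--ldap.enabled=true \
+--ldap.url ldap://ldap.arangodb.com:389/dc=arangodb,dc=com?uid?sub \
+--ldap.binddn=uid=arangoadmin,dc=arangodb,dc=com \
+--ldap.bindpasswd=supersecretpassword
+```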
+
+### TLS options
+
+{{< warning >}}
+TLS is not supported in the Windows version of ArangoDB!
+{{< /warning >}}
+
+To configure the usage of encrypted TLS for communicating with the LDAP server,
+the following options are available:
+
+- `--ldap.tls`: the main switch to activate TLS. It can either be
+  `true` (use TLS) or `false` (do not use TLS). It is switched
+  off by default. If you switch this on and do not use the `ldaps`
+  protocol via the [LDAP URL](#ldap-urls), then ArangoDB
+  uses the `STARTTLS` protocol to initiate TLS. This is the
+  recommended approach.
+- `--ldap.tls-version`: the minimal TLS version that ArangoDB should accept.
+  Available versions are `1.0`, `1.1`, and `1.2`. The default is `1.2`. If
+  your LDAP server does not support version 1.2, you have to change
+  this setting.
+- `--ldap.tls-cert-check-strategy`: the strategy to validate the LDAP server
+  certificate. Available strategies are `never`, `hard`,
+  `demand`, `allow`, and `try`. The default is `hard`.
+- `--ldap.tls-cacert-file`: a file path to one or more (concatenated)
+  certificate authority certificates in PEM format.
+  By default, no file path is configured. This certificate
+  is used to validate the server response.
+- `--ldap.tls-cacert-dir`: a directory path to certificate authority certificates in
+  [c_rehash](https://www.openssl.org/docs/man3.0/man1/c_rehash.html)
+  format. By default, no directory path is configured.
+
+Assuming you have the TLS CA certificate file that is given to the server at
+`/path/to/certificate.pem`, here is an example of how to configure TLS:
+
+```
+--ldap.tls true \
+--ldap.tls-cacert-file /path/to/certificate.pem
+```
+
+You can use TLS with any of the following authentication mechanisms.
+
+### Secondary server options (`ldap2`)
+
+The `ldap.*` options configure the primary LDAP server. It is possible to
+configure a secondary server with the `ldap2.*` options to use it as a
+fail-over for the case that the primary server is not reachable, but also to
+let the primary server handle some users and the secondary one others.
+
+Instead of `--ldap.