diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3e70c85
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,105 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+.pytest_cache/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+.static_storage/
+.media/
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..53120b3
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (C) 2017 Ines Montani
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..90b976f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,79 @@
+# spacycaKE: Keyphrase Extraction for spaCy
+[spaCy v2.0](https://spacy.io/usage/v2) extension and pipeline component for keyphrase extraction, which attaches the extracted keyphrases as metadata to `Doc` objects.
+
+## Installation
+`spacycaKE` requires `spacy` v2.2.1 or higher (below v3.0.0) and `spacybert` v1.0.0 or higher.
+
+## Usage
+### Extracting keyphrases from a document
+```
+import spacy
+from spacycake import BertKeyphraseExtraction as bake
+nlp = spacy.load('en')
+```
+
+Then add the keyphrase extraction component to the pipeline,
+```
+cake = bake(nlp, from_pretrained='path/to/pretrained_bert_weights_dir')
+nlp.add_pipe(cake, last=True)
+```
+
+Extract the keyphrases,
+```
+doc = nlp("This is a test but obviously you need a larger document to extract meaningful keyphrases")
+print(doc._.extracted_phrases)  # <-- list of the top-k keyphrase spans
+```
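+
+The component can be tuned through its constructor; the parameters below come from `BertKeyphraseExtraction.__init__`, the values shown are only illustrative, and any extra keyword arguments (such as `pooling_strategy`) are forwarded to `spacybert.BertInference`, which computes the underlying embeddings.
+```
+cake = bake(
+    nlp,
+    from_pretrained='path/to/pretrained_bert_weights_dir',
+    attr_names=('bert_repr', 'noun_phrases', 'extracted_phrases'),  # default attribute names
+    top_k=10,                        # keep the 10 best keyphrases instead of the default 5
+    mmr_lambda=0.7,                  # weigh relevance to the document higher than diversity
+    pooling_strategy='REDUCE_MEAN')  # passed through to spacybert.BertInference
+```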
+
+## Available attributes
+The extension sets attributes on the `Doc`; the BERT embedding is also available on `Span` and `Token`. You can change the attribute names via `attr_names` when initializing the extension.
+| | | |
+|-|-|-|
+| `Doc._.bert_repr` | `torch.Tensor` | Document BERT embedding (computed by `spacybert`) |
+| `Doc._.noun_phrases` | `List[Span]` | Candidate phrases matching the `JJ*NN+` part-of-speech pattern |
+| `Doc._.extracted_phrases` | `List[Span]` | Top-k keyphrases after MMR re-ranking |
+
+## Settings
+On initialization of `BertKeyphraseExtraction`, you can define the following:
+
+| name | type | default | description |
+|-|-|-|-|
+| `from_pretrained` | `str` | (required) | Path to Bert model directory or name of HuggingFace transformers pre-trained Bert weights, e.g., `bert-base-uncased`. If falsy, the component assumes `Doc._.bert_repr` is already set by another pipeline component |
+| `attr_names` | `Tuple[str]` | `('bert_repr', 'noun_phrases', 'extracted_phrases')` | Names of the document embedding, candidate phrases and extracted keyphrases attributes, in that order |
+| `force_extension` | `bool` | `True` | A boolean value to create the same 'Extension Attribute' upon being executed again |
+| `top_k` | `int` | 5 | Number of keyphrases to keep after MMR re-ranking |
+| `mmr_lambda` | `float` | 0.5 | Maximum Marginal Relevance trade-off: 1.0 favours relevance to the document, 0.0 favours diversity among the selected phrases |
+
+Additional keyword arguments (e.g., `max_seq_len`, `pooling_strategy`) are passed on to `spacybert.BertInference`.
+
+## Pooling strategies
+| strategy | description |
+|-|-|
+| `REDUCE_MEAN` | Element-wise average of the word embeddings |
+| `REDUCE_MAX` | Element-wise maximum of the word embeddings |
+| `REDUCE_MEAN_MAX` | Apply both `'REDUCE_MEAN'` and `'REDUCE_MAX'` and concatenate. So if the original word embedding is of dimensions `(768,)`, then the output will have shape `(1536,)` |
+| `CLS_TOKEN`, `FIRST_TOKEN` | Take the embedding of only the first `[CLS]` token |
+| `SEP_TOKEN`, `LAST_TOKEN` | Take the embedding of only the last `[SEP]` token |
+| `None` | No reduction is applied and a matrix of embeddings per word in the sentence is returned |
+
+## Roadmap
+This extension is still experimental. Possible future updates include:
+* Getting document representation from state-of-the-art NLP models other than Google's BERT.
+* Method for computing similarity between `Doc`, `Span` and `Token` objects using the `bert_repr` tensor.
+* Getting representation from multiple / other layers in the models. \ No newline at end of file diff --git a/notebooks/simple_keyphrase_extraction.ipynb b/notebooks/simple_keyphrase_extraction.ipynb new file mode 100644 index 0000000..9e89e17 --- /dev/null +++ b/notebooks/simple_keyphrase_extraction.ipynb @@ -0,0 +1,635 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "simple_keyphrase_extraction.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "QgvayAEpwgqL", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 768 + }, + "outputId": "7dee1918-6452-4d19-af9e-8e536addf1aa" + }, + "source": [ + "!pip install torch spacybert" + ], + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (1.6.0+cu101)\n", + "Requirement already satisfied: spacybert in /usr/local/lib/python3.6/dist-packages (1.0.0)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torch) (1.18.5)\n", + "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch) (0.16.0)\n", + "Requirement already satisfied: transformers>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from spacybert) (3.0.2)\n", + "Requirement already satisfied: spacy-langdetect>=0.1.2 in /usr/local/lib/python3.6/dist-packages (from spacybert) (0.1.2)\n", + "Requirement already satisfied: spacy<3.0.0,>=2.2.1 in /usr/local/lib/python3.6/dist-packages (from spacybert) (2.2.4)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers>=3.0.0->spacybert) (20.4)\n", + "Requirement already satisfied: tokenizers==0.8.1.rc1 in /usr/local/lib/python3.6/dist-packages (from transformers>=3.0.0->spacybert) (0.8.1rc1)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers>=3.0.0->spacybert) (4.41.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers>=3.0.0->spacybert) (2019.12.20)\n", + "Requirement already satisfied: sacremoses in /usr/local/lib/python3.6/dist-packages (from transformers>=3.0.0->spacybert) (0.0.43)\n", + "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers>=3.0.0->spacybert) (0.7)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers>=3.0.0->spacybert) (3.0.12)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers>=3.0.0->spacybert) (2.23.0)\n", + "Requirement already satisfied: sentencepiece!=0.1.92 in /usr/local/lib/python3.6/dist-packages (from transformers>=3.0.0->spacybert) (0.1.91)\n", + "Requirement already satisfied: pytest in /usr/local/lib/python3.6/dist-packages (from spacy-langdetect>=0.1.2->spacybert) (3.6.4)\n", + "Requirement already satisfied: langdetect==1.0.7 in /usr/local/lib/python3.6/dist-packages (from spacy-langdetect>=0.1.2->spacybert) (1.0.7)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from spacy<3.0.0,>=2.2.1->spacybert) (49.2.0)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages 
(from spacy<3.0.0,>=2.2.1->spacybert) (0.7.1)\n", + "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy<3.0.0,>=2.2.1->spacybert) (1.0.2)\n", + "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy<3.0.0,>=2.2.1->spacybert) (7.4.0)\n", + "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy<3.0.0,>=2.2.1->spacybert) (0.4.1)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.6/dist-packages (from spacy<3.0.0,>=2.2.1->spacybert) (1.0.2)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy<3.0.0,>=2.2.1->spacybert) (2.0.3)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy<3.0.0,>=2.2.1->spacybert) (3.0.2)\n", + "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.6/dist-packages (from spacy<3.0.0,>=2.2.1->spacybert) (1.0.0)\n", + "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from spacy<3.0.0,>=2.2.1->spacybert) (1.1.3)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from packaging->transformers>=3.0.0->spacybert) (1.15.0)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers>=3.0.0->spacybert) (2.4.7)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers>=3.0.0->spacybert) (0.16.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers>=3.0.0->spacybert) (7.1.2)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers>=3.0.0->spacybert) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers>=3.0.0->spacybert) (2020.6.20)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers>=3.0.0->spacybert) (1.24.3)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers>=3.0.0->spacybert) (2.10)\n", + "Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from pytest->spacy-langdetect>=0.1.2->spacybert) (19.3.0)\n", + "Requirement already satisfied: more-itertools>=4.0.0 in /usr/local/lib/python3.6/dist-packages (from pytest->spacy-langdetect>=0.1.2->spacybert) (8.4.0)\n", + "Requirement already satisfied: py>=1.5.0 in /usr/local/lib/python3.6/dist-packages (from pytest->spacy-langdetect>=0.1.2->spacybert) (1.9.0)\n", + "Requirement already satisfied: pluggy<0.8,>=0.5 in /usr/local/lib/python3.6/dist-packages (from pytest->spacy-langdetect>=0.1.2->spacybert) (0.7.1)\n", + "Requirement already satisfied: atomicwrites>=1.0 in /usr/local/lib/python3.6/dist-packages (from pytest->spacy-langdetect>=0.1.2->spacybert) (1.4.0)\n", + "Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy<3.0.0,>=2.2.1->spacybert) (1.7.0)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata>=0.20; python_version < 
\"3.8\"->catalogue<1.1.0,>=0.0.7->spacy<3.0.0,>=2.2.1->spacybert) (3.1.0)\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5XXuaLgRxXef", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 85 + }, + "outputId": "6b44cbff-4341-49b1-dad7-999f3e2aaf2e" + }, + "source": [ + "!unzip spacycake.zip" + ], + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Archive: spacycake.zip\n", + "replace spacycake/about.py? [y]es, [n]o, [A]ll, [N]one, [r]ename: A\n", + " inflating: spacycake/about.py \n", + " inflating: spacycake/__init__.py \n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YovBrmW9wksy", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# Based on https://prakhartechviz.blogspot.com/2020/07/unsupervised-keyword-extraction-using-sentence-embeddings.html\n", + "# Reference: http://www.cs.bilkent.edu.tr/~canf/CS533/hwSpring14/eightMinPresentations/handoutMMR.pdf\n", + "from spacybert import BertInference\n", + "from spacy.tokens import Doc\n", + "from spacy.matcher import Matcher\n", + "from spacy.util import filter_spans\n", + "from spacycake import BertKeyphraseExtraction as bake\n", + "import spacy\n", + "import torch\n", + "from typing import Tuple" + ], + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "U5qYy2WNxgqz", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 275 + }, + "outputId": "4b07bfba-d10f-42d4-bcc8-772e273be6e4" + }, + "source": [ + "nlp = spacy.load('en')\n", + "doc = nlp(\"\"\"Keywords/Keyphrase extraction is the task of extracting relevant and representative words that best describe the underlying document. Keywords extraction has many use-cases, some of which are using it as meta-data for indexing documents and later using in IR systems, it also plays as a crucial component when gleaning real-time insights.\n", + "\n", + "Although, in literature, this problem has been tried in both supervised and unsupervised settings. This paper proposes EmbedRank - An embedding based unsupervised keyphrase extraction technique from a single document followed by an extra step of re-ranking the candidate phrases for selecting the final set. The proposed technique takes care of explicitly increasing the diversity yet preserving the coverage of the selected candidate keywords. Authors talk about the limitations of existing keyphrase extractor systems which is of generating morphologically and semantic redundant phrases, for example - \"molecular equivalence numbers\" and \"molecular equivalence indices\" mean the same thing and represent the same concept. Getting both of these keywords in output does not necessarily convey anything extra and might hamper user experience. Authors employ Maximal Marginal Relevance (MMR) ranking strategy as trade-off between diversity and relevance.\n", + "\n", + "In short, they propose the technique of representing both document and candidates phrases in a common high-dimensional vector space using existing embeddings model and then later computing meaningful distances between them to ensure informativeness as well as the distance between candidates as the measure of diversity in the final selection.\n", + "\n", + "The method consists of 4 main steps -\n", + "Selecting the first set of candidate phrases from the document based on certain Part-of-Speech patterns. 
(They keep only phrases that follows the pattern JJ*NN+ (Zero of more adjectives followed by one or multiple nouns)) (Ensures Noise Reduction)\n", + "Vectorization of the document and every candidate phrases extracted from Step.1 into a common high-dimensional vector space. \n", + "Selecting top-k candidates by calculating the cosine similarity between phrases and the document (Ensures Informativeness)\n", + "Re-ranking the top-k selected candidates using MMR (Maximal Marginal Relevance) and then selecting top-n phrases. (Ensures Diversity)\n", + "Authors talk about experimenting with pre-trained Doc2Vec and Sent2Vec models as the choice for embedding documents and candidate phrases in a common embedding space of 700 dimensions. After experimenting with the embeddings they found both the methods to be providing comparable vectors no matter if the input document is a word, sentence, long document. They also mention about Sent2Vec being faster compared to Doc2Vec during inference.\n", + "\n", + "Authors compared their results with existing keyword algorithms on various datasets such as Inspec, DUC 2001, and NUS and found their results to be surpassing current SOTA on Inspec and DUC 2001.\"\"\")\n", + "doc" + ], + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Keywords/Keyphrase extraction is the task of extracting relevant and representative words that best describe the underlying document. Keywords extraction has many use-cases, some of which are using it as meta-data for indexing documents and later using in IR systems, it also plays as a crucial component when gleaning real-time insights.\n", + "\n", + "Although, in literature, this problem has been tried in both supervised and unsupervised settings. This paper proposes EmbedRank - An embedding based unsupervised keyphrase extraction technique from a single document followed by an extra step of re-ranking the candidate phrases for selecting the final set. The proposed technique takes care of explicitly increasing the diversity yet preserving the coverage of the selected candidate keywords. Authors talk about the limitations of existing keyphrase extractor systems which is of generating morphologically and semantic redundant phrases, for example - \"molecular equivalence numbers\" and \"molecular equivalence indices\" mean the same thing and represent the same concept. Getting both of these keywords in output does not necessarily convey anything extra and might hamper user experience. Authors employ Maximal Marginal Relevance (MMR) ranking strategy as trade-off between diversity and relevance.\n", + "\n", + "In short, they propose the technique of representing both document and candidates phrases in a common high-dimensional vector space using existing embeddings model and then later computing meaningful distances between them to ensure informativeness as well as the distance between candidates as the measure of diversity in the final selection.\n", + "\n", + "The method consists of 4 main steps -\n", + "Selecting the first set of candidate phrases from the document based on certain Part-of-Speech patterns. (They keep only phrases that follows the pattern JJ*NN+ (Zero of more adjectives followed by one or multiple nouns)) (Ensures Noise Reduction)\n", + "Vectorization of the document and every candidate phrases extracted from Step.1 into a common high-dimensional vector space. 
\n", + "Selecting top-k candidates by calculating the cosine similarity between phrases and the document (Ensures Informativeness)\n", + "Re-ranking the top-k selected candidates using MMR (Maximal Marginal Relevance) and then selecting top-n phrases. (Ensures Diversity)\n", + "Authors talk about experimenting with pre-trained Doc2Vec and Sent2Vec models as the choice for embedding documents and candidate phrases in a common embedding space of 700 dimensions. After experimenting with the embeddings they found both the methods to be providing comparable vectors no matter if the input document is a word, sentence, long document. They also mention about Sent2Vec being faster compared to Doc2Vec during inference.\n", + "\n", + "Authors compared their results with existing keyword algorithms on various datasets such as Inspec, DUC 2001, and NUS and found their results to be surpassing current SOTA on Inspec and DUC 2001." + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 4 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "teAcTA0qxlA0", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 632 + }, + "outputId": "1b1bb598-a349-442d-a156-86119f21e152" + }, + "source": [ + "cake = bake(nlp, from_pretrained='DeepPavlov/bert-base-cased-conversational')\n", + "nlp.add_pipe(cake)\n", + "print(nlp.pipe_names)" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:transformers.tokenization_utils_base:Model name 'DeepPavlov/bert-base-cased-conversational' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, TurkuNLP/bert-base-finnish-cased-v1, TurkuNLP/bert-base-finnish-uncased-v1, wietsedv/bert-base-dutch-cased). 
Assuming 'DeepPavlov/bert-base-cased-conversational' is a path, a model identifier, or url to a directory containing tokenizer files.\n", + "INFO:transformers.tokenization_utils_base:loading file https://s3.amazonaws.com/models.huggingface.co/bert/DeepPavlov/bert-base-cased-conversational/vocab.txt from cache at /root/.cache/torch/transformers/9a100bf9373875ba9ccb58a277e4e36abcddc75e7d72ec4a0317f436e7881f75.28340c830b588aabe5afd040d5f6f7cec62b34f2a461700d3fd013a18169adf4\n", + "INFO:transformers.tokenization_utils_base:loading file https://s3.amazonaws.com/models.huggingface.co/bert/DeepPavlov/bert-base-cased-conversational/added_tokens.json from cache at None\n", + "INFO:transformers.tokenization_utils_base:loading file https://s3.amazonaws.com/models.huggingface.co/bert/DeepPavlov/bert-base-cased-conversational/special_tokens_map.json from cache at /root/.cache/torch/transformers/65a06bcb4cea6b3252b1f666aae5434be7541361d3b863d2c391565748aa23e4.275045728fbf41c11d3dae08b8742c054377e18d92cc7b72b6351152a99b64e4\n", + "INFO:transformers.tokenization_utils_base:loading file https://s3.amazonaws.com/models.huggingface.co/bert/DeepPavlov/bert-base-cased-conversational/tokenizer_config.json from cache at /root/.cache/torch/transformers/617841836a7fb5c9d77429b250c91a6b9cffe6f9be915b497bcb36b0de03e8f9.23dbcd12a881c5aa23ed8b7502b47eedde8257f0130e23919733d1ed9e4ec20d\n", + "INFO:transformers.tokenization_utils_base:loading file https://s3.amazonaws.com/models.huggingface.co/bert/DeepPavlov/bert-base-cased-conversational/tokenizer.json from cache at None\n", + "INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/DeepPavlov/bert-base-cased-conversational/config.json from cache at /root/.cache/torch/transformers/bbf31f9e9b4164f37fcaf3bff67c159b7e81e80802f4be348346c6bc6de4c58b.8645504c7103f857a49254705845eff03e211fee5a6738748a069c14765a6092\n", + "INFO:transformers.configuration_utils:Model config BertConfig {\n", + " \"architectures\": [\n", + " \"BertModel\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"gradient_checkpointing\": false,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"layer_norm_eps\": 1e-12,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"bert\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"type_vocab_size\": 2,\n", + " \"vocab_size\": 28996\n", + "}\n", + "\n", + "INFO:transformers.modeling_utils:loading weights file https://cdn.huggingface.co/DeepPavlov/bert-base-cased-conversational/pytorch_model.bin from cache at /root/.cache/torch/transformers/ee9b2524bc1c3741ef2718ac22b5cf96b590e170f754e360f34447d6b65cf6a7.60b71d9156f8b45ee1d70635c4edcf5f29770b77f050955464e5cdc172717d02\n", + "INFO:transformers.modeling_utils:All model checkpoint weights were used when initializing BertModel.\n", + "\n", + "INFO:transformers.modeling_utils:All the weights of BertModel were initialized from the model checkpoint at DeepPavlov/bert-base-cased-conversational.\n", + "If your task is similar to the task the model of the ckeckpoint was trained on, you can already use BertModel for predictions without further training.\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "['tagger', 'parser', 'ner', 'bert_keyphrase_extraction']\n" + ], + "name": 
"stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "w4eOpImnx7WS", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 54 + }, + "outputId": "8c99d76b-8aee-425e-dc09-40c921bfc895" + }, + "source": [ + "phrases = cake._get_candidate_phrases(doc)\n", + "print(phrases)" + ], + "execution_count": 71, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[extraction, task, representative words, document, Keywords extraction, many use, cases, meta, data, indexing documents, systems, crucial component, time insights, literature, problem, unsupervised settings, paper, embedding, unsupervised keyphrase extraction technique, single document, extra step, re-, candidate phrases, final set, technique, care, diversity, coverage, candidate keywords, Authors, limitations, extractor systems, semantic redundant phrases, example, molecular equivalence numbers, molecular equivalence indices, same thing, same concept, keywords, output, user experience, Authors, strategy, trade, off, diversity, relevance, technique, document, candidates phrases, dimensional vector space, embeddings model, meaningful distances, informativeness, distance, candidates, measure, diversity, final selection, method, main steps, first set, candidate phrases, document, certain Part, Speech patterns, phrases, pattern JJ*NN+, more adjectives, multiple nouns, Vectorization, document, candidate phrases, dimensional vector space, k candidates, cosine similarity, phrases, document, Re, k, candidates, phrases, Authors, trained Doc2Vec, Sent2Vec models, choice, documents, candidate phrases, space, dimensions, embeddings, methods, comparable vectors, input document, word, sentence, long document, Sent2Vec, Doc2Vec, inference, Authors, results, keyword algorithms, various datasets, results, current SOTA]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Z7e6Gntnx9Jn", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# Get document embedding\n", + "doc_embedding = getattr(doc._, cake.attr_names[0])" + ], + "execution_count": 72, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "tDECNbuJFoJ0", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from __future__ import print_function\n", + "from sys import getsizeof, stderr\n", + "from itertools import chain\n", + "from collections import deque\n", + "try:\n", + " from reprlib import repr\n", + "except ImportError:\n", + " pass\n", + "\n", + "def total_size(o, handlers={}, verbose=False):\n", + " \"\"\" Returns the approximate memory footprint an object and all of its contents.\n", + "\n", + " Automatically finds the contents of the following builtin containers and\n", + " their subclasses: tuple, list, deque, dict, set and frozenset.\n", + " To search other containers, add handlers to iterate over their contents:\n", + "\n", + " handlers = {SomeContainerClass: iter,\n", + " OtherContainerClass: OtherContainerClass.get_elements}\n", + "\n", + " \"\"\"\n", + " dict_handler = lambda d: chain.from_iterable(d.items())\n", + " all_handlers = {tuple: iter,\n", + " list: iter,\n", + " deque: iter,\n", + " dict: dict_handler,\n", + " set: iter,\n", + " frozenset: iter,\n", + " }\n", + " all_handlers.update(handlers) # user handlers take precedence\n", + " seen = set() # track which object id's have already been seen\n", + " default_size = getsizeof(0) # estimate sizeof object without __sizeof__\n", + "\n", + " def sizeof(o):\n", + " if id(o) in seen: # do not 
double count the same object\n", + " return 0\n", + " seen.add(id(o))\n", + " s = getsizeof(o, default_size)\n", + "\n", + " if verbose:\n", + " print(s, type(o), repr(o), file=stderr)\n", + "\n", + " for typ, handler in all_handlers.items():\n", + " if isinstance(o, typ):\n", + " s += sum(map(sizeof, handler(o)))\n", + " break\n", + " return s\n", + "\n", + " return sizeof(o)" + ], + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "BlMXWFwIx_tQ", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "08b7f1f3-f46c-43ab-9d28-b060da43ac19" + }, + "source": [ + "import sys\n", + "import time\n", + "phrases_embeddings = []\n", + "for i, p in enumerate(phrases):\n", + " start = time.time()\n", + " tensor = getattr(p._, cake.attr_names[0])\n", + " phrases_embeddings.append(tensor.tolist())\n", + " print(i, p, '|| time to run:', time.time() - start, '|| size:', total_size(tensor), \"|| size of phrases_embeddings list:\", total_size(phrases_embeddings))" + ], + "execution_count": 73, + "outputs": [ + { + "output_type": "stream", + "text": [ + "0 extraction || time to run: 2.1029794216156006 || size: 72 || size of phrases_embeddings list: 24736\n", + "1 task || time to run: 2.043354034423828 || size: 72 || size of phrases_embeddings list: 49376\n", + "2 representative words || time to run: 2.0594043731689453 || size: 72 || size of phrases_embeddings list: 74016\n", + "3 document || time to run: 2.053642749786377 || size: 72 || size of phrases_embeddings list: 98656\n", + "4 Keywords extraction || time to run: 2.095147132873535 || size: 72 || size of phrases_embeddings list: 123328\n", + "5 many use || time to run: 2.074693202972412 || size: 72 || size of phrases_embeddings list: 147968\n", + "6 cases || time to run: 2.0458600521087646 || size: 72 || size of phrases_embeddings list: 172608\n", + "7 meta || time to run: 2.0508527755737305 || size: 72 || size of phrases_embeddings list: 197248\n", + "8 data || time to run: 2.0721216201782227 || size: 72 || size of phrases_embeddings list: 221952\n", + "9 indexing documents || time to run: 2.0377750396728516 || size: 72 || size of phrases_embeddings list: 246592\n", + "10 systems || time to run: 2.049778461456299 || size: 72 || size of phrases_embeddings list: 271232\n", + "11 crucial component || time to run: 2.0339925289154053 || size: 72 || size of phrases_embeddings list: 295872\n", + "12 time insights || time to run: 2.041782855987549 || size: 72 || size of phrases_embeddings list: 320512\n", + "13 literature || time to run: 2.0482177734375 || size: 72 || size of phrases_embeddings list: 345152\n", + "14 problem || time to run: 2.035208225250244 || size: 72 || size of phrases_embeddings list: 369792\n", + "15 unsupervised settings || time to run: 2.042509078979492 || size: 72 || size of phrases_embeddings list: 394432\n", + "16 paper || time to run: 2.027287721633911 || size: 72 || size of phrases_embeddings list: 419144\n", + "17 embedding || time to run: 2.0407421588897705 || size: 72 || size of phrases_embeddings list: 443784\n", + "18 unsupervised keyphrase extraction technique || time to run: 2.0307812690734863 || size: 72 || size of phrases_embeddings list: 468424\n", + "19 single document || time to run: 2.027956008911133 || size: 72 || size of phrases_embeddings list: 493064\n", + "20 extra step || time to run: 2.058079481124878 || size: 72 || size of phrases_embeddings list: 517704\n", + "21 re- || time to run: 2.0350964069366455 || 
size: 72 || size of phrases_embeddings list: 542344\n", + "22 candidate phrases || time to run: 2.0270326137542725 || size: 72 || size of phrases_embeddings list: 566984\n", + "23 final set || time to run: 2.0224411487579346 || size: 72 || size of phrases_embeddings list: 591624\n", + "24 technique || time to run: 2.0286247730255127 || size: 72 || size of phrases_embeddings list: 616264\n", + "25 care || time to run: 2.042250633239746 || size: 72 || size of phrases_embeddings list: 640984\n", + "26 diversity || time to run: 2.0252740383148193 || size: 72 || size of phrases_embeddings list: 665624\n", + "27 coverage || time to run: 2.025484085083008 || size: 72 || size of phrases_embeddings list: 690264\n", + "28 candidate keywords || time to run: 2.0284337997436523 || size: 72 || size of phrases_embeddings list: 714904\n", + "29 Authors || time to run: 2.033186197280884 || size: 72 || size of phrases_embeddings list: 739544\n", + "30 limitations || time to run: 2.047975778579712 || size: 72 || size of phrases_embeddings list: 764184\n", + "31 extractor systems || time to run: 2.0386385917663574 || size: 72 || size of phrases_embeddings list: 788824\n", + "32 semantic redundant phrases || time to run: 2.0281944274902344 || size: 72 || size of phrases_embeddings list: 813464\n", + "33 example || time to run: 2.019888401031494 || size: 72 || size of phrases_embeddings list: 838104\n", + "34 molecular equivalence numbers || time to run: 2.0353269577026367 || size: 72 || size of phrases_embeddings list: 862744\n", + "35 molecular equivalence indices || time to run: 2.0525407791137695 || size: 72 || size of phrases_embeddings list: 887472\n", + "36 same thing || time to run: 2.023737668991089 || size: 72 || size of phrases_embeddings list: 912112\n", + "37 same concept || time to run: 2.0303380489349365 || size: 72 || size of phrases_embeddings list: 936752\n", + "38 keywords || time to run: 2.0131826400756836 || size: 72 || size of phrases_embeddings list: 961392\n", + "39 output || time to run: 2.028717279434204 || size: 72 || size of phrases_embeddings list: 986032\n", + "40 user experience || time to run: 2.0523715019226074 || size: 72 || size of phrases_embeddings list: 1010672\n", + "41 Authors || time to run: 2.029707670211792 || size: 72 || size of phrases_embeddings list: 1035312\n", + "42 strategy || time to run: 2.0231292247772217 || size: 72 || size of phrases_embeddings list: 1059952\n", + "43 trade || time to run: 2.029510498046875 || size: 72 || size of phrases_embeddings list: 1084592\n", + "44 off || time to run: 2.033133029937744 || size: 72 || size of phrases_embeddings list: 1109232\n", + "45 diversity || time to run: 2.0688295364379883 || size: 72 || size of phrases_embeddings list: 1133872\n", + "46 relevance || time to run: 2.024880886077881 || size: 72 || size of phrases_embeddings list: 1158608\n", + "47 technique || time to run: 2.012984037399292 || size: 72 || size of phrases_embeddings list: 1183248\n", + "48 document || time to run: 2.015984296798706 || size: 72 || size of phrases_embeddings list: 1207888\n", + "49 candidates phrases || time to run: 2.0442659854888916 || size: 72 || size of phrases_embeddings list: 1232528\n", + "50 dimensional vector space || time to run: 2.0491726398468018 || size: 72 || size of phrases_embeddings list: 1257168\n", + "51 embeddings model || time to run: 2.02207612991333 || size: 72 || size of phrases_embeddings list: 1281808\n", + "52 meaningful distances || time to run: 2.0256924629211426 || size: 72 || size of phrases_embeddings 
list: 1306448\n", + "53 informativeness || time to run: 2.0309414863586426 || size: 72 || size of phrases_embeddings list: 1331088\n", + "54 distance || time to run: 2.0170273780822754 || size: 72 || size of phrases_embeddings list: 1355728\n", + "55 candidates || time to run: 2.039979934692383 || size: 72 || size of phrases_embeddings list: 1380368\n", + "56 measure || time to run: 2.01228928565979 || size: 72 || size of phrases_embeddings list: 1405008\n", + "57 diversity || time to run: 2.0248279571533203 || size: 72 || size of phrases_embeddings list: 1429648\n", + "58 final selection || time to run: 2.0109784603118896 || size: 72 || size of phrases_embeddings list: 1454400\n", + "59 method || time to run: 2.020981550216675 || size: 72 || size of phrases_embeddings list: 1479040\n", + "60 main steps || time to run: 2.0524165630340576 || size: 72 || size of phrases_embeddings list: 1503680\n", + "61 first set || time to run: 2.0116124153137207 || size: 72 || size of phrases_embeddings list: 1528320\n", + "62 candidate phrases || time to run: 2.028935670852661 || size: 72 || size of phrases_embeddings list: 1552960\n", + "63 document || time to run: 2.027252435684204 || size: 72 || size of phrases_embeddings list: 1577600\n", + "64 certain Part || time to run: 2.0181453227996826 || size: 72 || size of phrases_embeddings list: 1602240\n", + "65 Speech patterns || time to run: 2.04180908203125 || size: 72 || size of phrases_embeddings list: 1626880\n", + "66 phrases || time to run: 2.0329012870788574 || size: 72 || size of phrases_embeddings list: 1651520\n", + "67 pattern JJ*NN+ || time to run: 2.0198657512664795 || size: 72 || size of phrases_embeddings list: 1676160\n", + "68 more adjectives || time to run: 2.0166454315185547 || size: 72 || size of phrases_embeddings list: 1700800\n", + "69 multiple nouns || time to run: 2.0208804607391357 || size: 72 || size of phrases_embeddings list: 1725440\n", + "70 Vectorization || time to run: 2.0295088291168213 || size: 72 || size of phrases_embeddings list: 1750080\n", + "71 document || time to run: 2.0224626064300537 || size: 72 || size of phrases_embeddings list: 1774720\n", + "72 candidate phrases || time to run: 2.0318312644958496 || size: 72 || size of phrases_embeddings list: 1799488\n", + "73 dimensional vector space || time to run: 2.030790090560913 || size: 72 || size of phrases_embeddings list: 1824128\n", + "74 k candidates || time to run: 2.028165102005005 || size: 72 || size of phrases_embeddings list: 1848768\n", + "75 cosine similarity || time to run: 2.0319132804870605 || size: 72 || size of phrases_embeddings list: 1873408\n", + "76 phrases || time to run: 2.01129150390625 || size: 72 || size of phrases_embeddings list: 1898048\n", + "77 document || time to run: 2.1176273822784424 || size: 72 || size of phrases_embeddings list: 1922688\n", + "78 Re || time to run: 2.0762245655059814 || size: 72 || size of phrases_embeddings list: 1947328\n", + "79 k || time to run: 2.1105899810791016 || size: 72 || size of phrases_embeddings list: 1971968\n", + "80 candidates || time to run: 2.0711684226989746 || size: 72 || size of phrases_embeddings list: 1996608\n", + "81 phrases || time to run: 2.0817739963531494 || size: 72 || size of phrases_embeddings list: 2021248\n", + "82 Authors || time to run: 2.0285391807556152 || size: 72 || size of phrases_embeddings list: 2045888\n", + "83 trained Doc2Vec || time to run: 2.0247905254364014 || size: 72 || size of phrases_embeddings list: 2070528\n", + "84 Sent2Vec models || time to run: 
2.0404205322265625 || size: 72 || size of phrases_embeddings list: 2095168\n", + "85 choice || time to run: 2.0214293003082275 || size: 72 || size of phrases_embeddings list: 2119808\n", + "86 documents || time to run: 2.0228817462921143 || size: 72 || size of phrases_embeddings list: 2144448\n", + "87 candidate phrases || time to run: 2.0283544063568115 || size: 72 || size of phrases_embeddings list: 2169088\n", + "88 space || time to run: 2.117525815963745 || size: 72 || size of phrases_embeddings list: 2193872\n", + "89 dimensions || time to run: 2.172813653945923 || size: 72 || size of phrases_embeddings list: 2218512\n", + "90 embeddings || time to run: 2.1973555088043213 || size: 72 || size of phrases_embeddings list: 2243152\n", + "91 methods || time to run: 2.035961866378784 || size: 72 || size of phrases_embeddings list: 2267792\n", + "92 comparable vectors || time to run: 2.0142269134521484 || size: 72 || size of phrases_embeddings list: 2292432\n", + "93 input document || time to run: 2.0219504833221436 || size: 72 || size of phrases_embeddings list: 2317072\n", + "94 word || time to run: 2.053673028945923 || size: 72 || size of phrases_embeddings list: 2341712\n", + "95 sentence || time to run: 2.024005651473999 || size: 72 || size of phrases_embeddings list: 2366352\n", + "96 long document || time to run: 2.016026496887207 || size: 72 || size of phrases_embeddings list: 2390992\n", + "97 Sent2Vec || time to run: 2.0236282348632812 || size: 72 || size of phrases_embeddings list: 2415632\n", + "98 Doc2Vec || time to run: 2.0214104652404785 || size: 72 || size of phrases_embeddings list: 2440272\n", + "99 inference || time to run: 2.062432289123535 || size: 72 || size of phrases_embeddings list: 2464912\n", + "100 Authors || time to run: 2.152684211730957 || size: 72 || size of phrases_embeddings list: 2489552\n", + "101 results || time to run: 2.032606601715088 || size: 72 || size of phrases_embeddings list: 2514192\n", + "102 keyword algorithms || time to run: 2.026545763015747 || size: 72 || size of phrases_embeddings list: 2538832\n", + "103 various datasets || time to run: 2.03802752494812 || size: 72 || size of phrases_embeddings list: 2563472\n", + "104 results || time to run: 2.0378637313842773 || size: 72 || size of phrases_embeddings list: 2588112\n", + "105 current SOTA || time to run: 2.028780460357666 || size: 72 || size of phrases_embeddings list: 2612752\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Ux2dUm5n9KfN", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "72d394ff-34fe-4c98-aecc-7dc864e79599" + }, + "source": [ + "phrases_embeddings = torch.Tensor(phrases_embeddings)\n", + "start = time.time()\n", + "doc_phrase_similarity = torch.matmul(doc_embedding, phrases_embeddings.transpose(0, 1))\n", + "print(time.time() - start)" + ], + "execution_count": 75, + "outputs": [ + { + "output_type": "stream", + "text": [ + "0.0011990070343017578\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "y7ogzZr39Vcu", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# Rank phrases based on similarity ascending\n", + "# and limit to top-k phrases\n", + "indices = doc_phrase_similarity.argsort(descending=True)\n", + "phrases_embeddings = phrases_embeddings[indices]\n", + "phrases = [phrases[i] for i in indices.tolist()]\n", + "R = list(range(len(phrases)))\n", + "S = []" + ], + "execution_count": 76, + "outputs": [] + }, + 
{ + "cell_type": "code", + "metadata": { + "id": "KvKSOvpXKdpU", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "2df856c4-42fb-4f56-f187-82c0ec2892d2" + }, + "source": [ + "phrases_embeddings.shape" + ], + "execution_count": 77, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "torch.Size([106, 768])" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 77 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "-eZNt54x9WmX", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "e140823f-e8df-4303-f3b9-62dd53d2f91a" + }, + "source": [ + "# Re-rank based on Maximum Marginal Relevance score\n", + "start = time.time()\n", + "while len(R) > 0:\n", + " first_part = torch.matmul(doc_embedding, phrases_embeddings[R].transpose(0, 1))\n", + " second_part = torch.matmul(\n", + " phrases_embeddings[R],\n", + " phrases_embeddings[S].transpose(0, 1)).max(dim=1).values if S else torch.Tensor([0.] * first_part.shape[0])\n", + " scores = (cake.mmr_lambda * first_part) - ((1 - cake.mmr_lambda) * second_part)\n", + " phrase_to_add = R[scores.argmax()]\n", + " R.remove(phrase_to_add)\n", + " S.append(phrase_to_add)\n", + "print(time.time() - start)" + ], + "execution_count": 78, + "outputs": [ + { + "output_type": "stream", + "text": [ + "0.904360294342041\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "aq7xXbFP9dLA", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "4c20e308-0884-4c27-ca13-d5f0ab6ae08d" + }, + "source": [ + "top_k = 5\n", + "print([phrases[i] for i in S[:cake.top_k]])" + ], + "execution_count": 79, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[molecular equivalence indices, unsupervised keyphrase extraction technique, k candidates, single document, dimensional vector space]\n" + ], + "name": "stdout" + } + ] + } + ] +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9d4996a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +spacy>=2.2.1,<3.0.0 +spacybert>=1.0.0 +torch>=1.4.0 -f https://download.pytorch.org/whl/torch_stable.html \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..0c60dc7 --- /dev/null +++ b/setup.py @@ -0,0 +1,38 @@ +from pathlib import Path +from setuptools import setup, find_packages + +package_name = 'spacycake' +root = Path(__file__).parent.resolve() + +# Read in package meta from about.py +about_path = root / package_name / 'about.py' +with about_path.open('r', encoding='utf8') as f: + about = {} + exec(f.read(), about) + +# Get readme +readme_path = root / 'README.md' +with readme_path.open('r', encoding='utf8') as f: + readme = f.read() + +install_requires = [ + 'spacy>=2.2.1,<3.0.0', + 'spacybert>=1.0.0', + 'torch>=1.4.0 -f https://download.pytorch.org/whl/torch_stable.html' +] +test_requires = ['pytest'] + +setup( + name=package_name, + description=about['__summary__'], + long_description=readme, + author=about['__author__'], + author_email=about['__email__'], + url=about['__uri__'], + version=about['__version__'], + license=about['__license__'], + packages=find_packages(exclude=('tests*',)), + install_requires=install_requires, + test_requires=test_requires, + zip_safe=False, +) \ No newline at end of file diff --git a/spacycake/__init__.py 
b/spacycake/__init__.py new file mode 100644 index 0000000..5762a1f --- /dev/null +++ b/spacycake/__init__.py @@ -0,0 +1,115 @@ +from .about import __version__ +from spacybert import BertInference +from spacy.tokens import Doc +from spacy.matcher import Matcher +from spacy.util import filter_spans +import torch +from typing import Tuple + + +class BertKeyphraseExtraction: + """ + Based on the paper "Simple Unsupervised Keyphrase Extraction using Sentence Embedding" + """ + + name = 'bert_keyphrase_extraction' + noun_chunk_pattern = [ + {'POS': 'ADJ', 'OP': '*'}, + {'POS': 'NOUN', 'OP': '+'}] + + def __init__( + self, nlp, *, from_pretrained: str, + attr_names: Tuple[str] = ('bert_repr', 'noun_phrases', 'extracted_phrases'), + force_extension: bool = True, top_k: int = 5, mmr_lambda: float = .5, + **kws): + """ + Keyword arguments only after first argument! + + Params + ------ + nlp: spacy Language + Spacy language object + + from_pretrained: str, None + Path to Bert model directory or name of HuggingFace transformers + pre-trained Bert weights, e.g., 'bert-base-uncased' + + attr_names: Tuple[str] + In order: + 1. Name of the BERT embedding attribute, default = '._.bert_repr' + 2. Name of the candidate phrases attribute, default = '._.noun_phrases' + 3. Name of the top-k extracted phrases attribute, default = '._.extracted_phrases' + + force_extension: bool + A boolean value to create the same 'Extension Attribute' upon being + executed again + + top_k: int + Select the top-k candidate phrases + + mmr_lambda: float [0..1] + Lambda parameter for the maximum marginal relevance re-ranking of keyphrases + + kws: + More keywords arguments to supply to spacybert.BertInference() + """ + assert len(attr_names) == 3 + assert kws.get('pooling_strategy', 0) is not None + assert isinstance(top_k, int) and top_k > 0 + assert 0. <= mmr_lambda <= 1. + self.attr_names = attr_names + self.top_k = top_k + self.mmr_lambda = mmr_lambda + + # Load noun chunks parser + self.matcher = Matcher(nlp.vocab) + self.matcher.add("noun_chunk", None, self.noun_chunk_pattern) + + # Load bert inference spacy extensions + if from_pretrained: + BertInference( + from_pretrained=from_pretrained, + attr_name=attr_names[0], + set_extension=True, force_extension=force_extension, **kws) + else: + import warnings + warnings.warn( + 'from_pretrained not supplied. 
Will continue assuming ' +
+                f'Doc._.{attr_names[0]} is available.', RuntimeWarning)
+
+        Doc.set_extension(attr_names[1], getter=self._get_candidate_phrases, force=force_extension)
+        Doc.set_extension(attr_names[2], default=None, force=force_extension)
+
+    def _get_candidate_phrases(self, doc: Doc):
+        return filter_spans([doc[start:end] for _, start, end in self.matcher(doc)])
+
+    def __call__(self, doc: Doc):
+        # Compute similarity between document and phrases
+        phrases = self._get_candidate_phrases(doc)
+        doc_embedding = getattr(doc._, self.attr_names[0])
+        phrases_embeddings = torch.stack(list(
+            map(lambda p: getattr(p._, self.attr_names[0]), phrases)))
+        doc_phrase_similarity = torch.matmul(doc_embedding, phrases_embeddings.transpose(0, 1))
+
+        # Rank phrases by similarity to the document, descending
+        # (the top-k limit is applied after MMR re-ranking)
+        indices = doc_phrase_similarity.argsort(descending=True)
+        phrases_embeddings = phrases_embeddings[indices]
+        phrases = [phrases[i] for i in indices.tolist()]
+        R = list(range(len(phrases)))
+        S = []
+
+        # Re-rank based on Maximum Marginal Relevance score
+        while len(R) > 0:
+            first_part = torch.matmul(doc_embedding, phrases_embeddings[R].transpose(0, 1))
+            second_part = torch.matmul(
+                phrases_embeddings[R],
+                phrases_embeddings[S].transpose(0, 1)).max(dim=1).values if S else torch.Tensor([0.] * first_part.shape[0])
+            scores = (self.mmr_lambda * first_part) - ((1 - self.mmr_lambda) * second_part)
+            phrase_to_add = R[scores.argmax()]
+            R.remove(phrase_to_add)
+            S.append(phrase_to_add)
+
+        doc._.set(self.attr_names[2], [phrases[i] for i in S[:self.top_k]])
+
+        return doc
diff --git a/spacycake/about.py b/spacycake/about.py
new file mode 100644
index 0000000..5cabbb7
--- /dev/null
+++ b/spacycake/about.py
@@ -0,0 +1,7 @@
+__title__ = 'spacycaKE'
+__version__ = '1.0.0'
+__summary__ = 'spaCy pipeline component for adding Keyphrase Extraction.'
+__uri__ = 'https://github.com/surajiyer/spacycaKE'
+__author__ = 'Suraj Iyer'
+__email__ = 'me@surajiyer.com'
+__license__ = 'MIT'
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_ke.py b/tests/test_ke.py
new file mode 100644
index 0000000..5871ed8
--- /dev/null
+++ b/tests/test_ke.py
@@ -0,0 +1 @@
+import pytest
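
A minimal smoke test could later fill the `tests/test_ke.py` stub. The sketch below is illustrative only: it assumes the `en` spaCy model is installed locally and uses `bert-base-cased` purely as an example `from_pretrained` value.
```
import spacy

from spacycake import BertKeyphraseExtraction


def test_extracted_phrases_attribute():
    # Build a pipeline with the keyphrase extraction component.
    nlp = spacy.load('en')
    cake = BertKeyphraseExtraction(nlp, from_pretrained='bert-base-cased')
    nlp.add_pipe(cake, last=True)

    # Running the pipeline should populate the extracted-phrases attribute
    # with at most `top_k` keyphrase spans.
    doc = nlp("Keyphrase extraction finds representative phrases in a document.")
    phrases = doc._.extracted_phrases

    assert phrases is not None
    assert len(phrases) <= cake.top_k
```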