# config.yml

# Rasa configuration for an elementary COVID-19 chatbot.
# Language of the assistant, given as a two-letter language code.
language: "en"

# NLU pipeline: the components below are listed in processing order
pipeline:
    # 0. Language model
    # SpacyNLP loads a pre-trained spaCy language model so that later components can use the spaCy structures
    # and pre-trained word vectors it provides.
    - name: SpacyNLP
      # Medium-sized English (en) spaCy pipeline optimized for CPU
      model: en_core_web_md
      # Not case-sensitive, i.e. "Hello" and "hello" are understood as the same word
      case_sensitive: false
    # 1. Tokenizers
    # (Not) using the WhitespaceTokenizer to separate words at whitespaces
    # - name: WhitespaceTokenizer
    # Using the SpacyTokenizer to separate the utterances into tokens
    - name: SpacyTokenizer
      # Do not split intent names into sub-labels
      intent_tokenization_flag: false
      # Delimiter on which intents would be split (presumably only relevant when the flag above is true)
      intent_split_symbol: " "
      # No additional regular expression for detecting tokens.
      # This has to be YAML `null`: a bare `None` is parsed as the string "None", which would then be handed
      # to the tokenizer as a regular-expression pattern instead of meaning "no pattern".
      token_pattern: null
    # 2. Featurizers
    # SpacyFeaturizer: vector representations of user messages and responses, based on the spaCy model
    - name: SpacyFeaturizer
    # RegexFeaturizer: vector representations of user messages (only), based on regular expressions
    - name: RegexFeaturizer
    # LexicalSyntacticFeaturizer: lexical and syntactic features of user messages (only), supporting entity
    # extraction
    - name: LexicalSyntacticFeaturizer
    # CountVectorsFeaturizer: bag-of-n-grams representation of user messages, intents and responses
    - name: CountVectorsFeaturizer
      # "char_wb" creates character n-grams (char) only from text inside word boundaries (wb)
      analyzer: "char_wb"
      # n-gram sizes range from unigrams (1) up to 4-grams
      min_ngram: 1
      max_ngram: 4
    # 3. Intent classifier & 4. Entity extractors
    # DIETClassifier performs both intent classification and entity extraction
    - name: DIETClassifier
      # Train for 100 epochs, i.e. 100 iterations over the training dataset
      epochs: 100
      # Apply a sigmoid (logistic) function to the similarity terms so that similarity values stay bounded
      constrain_similarities: true
      # Normalize the computed similarities with softmax to determine the confidence
      model_confidence: "softmax"
    # EntitySynonymMapper maps synonymous entity values to a single value (e.g. "Hi" to "Hello")
    - name: EntitySynonymMapper
    # 5. Selectors
    # Every ResponseSelector below predicts a bot response together with its confidence. All of them are
    # trained for 100 epochs (100 iterations over the training dataset) and apply a sigmoid (logistic)
    # function to the similarity terms.
    #
    # Selector without a `retrieval_intent`.
    # NOTE(review): per the Rasa docs, omitting `retrieval_intent` trains the selector on all retrieval
    # intents, so it may overlap with the two intent-specific selectors below - confirm this is intended.
    - name: ResponseSelector
      constrain_similarities: true
      epochs: 100
    # Selector dedicated to the "chitchat" retrieval intent
    - name: ResponseSelector
      retrieval_intent: chitchat
      constrain_similarities: true
      epochs: 100
    # Selector dedicated to the "faq" (frequently asked questions) retrieval intent
    - name: ResponseSelector
      retrieval_intent: faq
      constrain_similarities: true
      epochs: 100
    # 6. FallbackClassifier: assigns the boilerplate "nlu_fallback" intent when the actual classification is
    # ambiguous or its confidence is lower than the threshold
    - name: FallbackClassifier
      # Classification threshold: a top intent confidence below 0.3 results in "nlu_fallback"
      threshold: 0.3
      # Ambiguity threshold: if the two highest-ranked intents score closer than 0.1 to each other,
      # "nlu_fallback" is assigned
      ambiguity_threshold: 0.1

# Rasa Core configuration: dialogue policies
policies:
    # MemoizationPolicy remembers the stories from the training data
    - name: MemoizationPolicy
      # Maximum history context of 3 turns
      max_history: 3
    # RulePolicy handles conversations according to the set of ground rules specified
    - name: RulePolicy
    # TEDPolicy (Transformer Embedding Dialogue): a multi-task approach to action prediction and entity
    # recognition
    - name: TEDPolicy
      # Maximum history context of 5 turns
      max_history: 5
      # Train for 100 epochs, i.e. 100 iterations over the training dataset
      epochs: 100
      # Apply a sigmoid (logistic) function to the similarity terms so that similarity values stay bounded
      constrain_similarities: true