diff --git a/docs/make.jl b/docs/make.jl
index 8f4cf17e4..72fb486c9 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -11,7 +11,7 @@ makedocs(modules = [Metalhead, Artifacts, LazyArtifacts, Images, OneHotArrays, D
                      ],
                  "Developer guide" => "contributing.md",
                  "API reference" => [
-                     "api/models.md",
+                     "api/reference.md",
                  ],
         ],
         format = Documenter.HTML(
diff --git a/docs/src/api/models.md b/docs/src/api/models.md
deleted file mode 100644
index 1b81acba0..000000000
--- a/docs/src/api/models.md
+++ /dev/null
@@ -1,30 +0,0 @@
-```@meta
-CurrentModule = Metalhead
-```
-
-# Models
-
-The API reference for available models in `Metalhead.jl`.
-
-```@docs
-VGG
-ResNet
-WideResNet
-GoogLeNet
-Inception-v3
-Inception-v4
-InceptionResNet-v2
-SqueezeNet
-DenseNet
-ResNeXt
-MobileNetv1
-MobileNetv2
-MobileNetv3
-EfficientNet
-MLPMixer
-ResMLP
-gMLP
-ViT
-ConvNeXt
-ConvMixer
-```
diff --git a/docs/src/api/reference.md b/docs/src/api/reference.md
new file mode 100644
index 000000000..102db29f1
--- /dev/null
+++ b/docs/src/api/reference.md
@@ -0,0 +1,14 @@
+# API Reference
+
+The API reference of `Metalhead.jl`.
+
+**Note**:
+
+```@autodocs
+Modules = [Metalhead]
+```
+
+```@docs
+Metalhead.squeeze_excite
+Metalhead.LayerScale
+```
diff --git a/docs/src/contributing.md b/docs/src/contributing.md
index 75574b033..f126d7bb8 100644
--- a/docs/src/contributing.md
+++ b/docs/src/contributing.md
@@ -16,7 +16,7 @@ To add a new model architecture to Metalhead.jl, you can [open a PR](https://git
 
 - reuse layers from Flux as much as possible (e.g. use `Parallel` before defining a `Bottleneck` struct)
 - adhere as closely as possible to a reference such as a published paper (i.e. the structure of your model should follow intuitively from the paper)
-- use generic functional builders (e.g. [`resnet`](#) is the core function that builds "ResNet-like" models)
+- use generic functional builders (e.g. [`Metalhead.resnet`](@ref) is the core function that builds "ResNet-like" models)
 - use multiple dispatch to add convenience constructors that wrap your functional builder
 
 When in doubt, just open a PR! We are more than happy to help review your code to help it align with the rest of the library. After adding a model, you might consider adding some pre-trained weights (see below).
diff --git a/docs/src/tutorials/quickstart.md b/docs/src/tutorials/quickstart.md
index 460d44a2e..a00854627 100644
--- a/docs/src/tutorials/quickstart.md
+++ b/docs/src/tutorials/quickstart.md
@@ -4,7 +4,7 @@
 using Flux, Metalhead
 ```
 
-Using a model from Metalhead is as simple as selecting a model from the table of [available models](#). For example, below we use the pre-trained ResNet-18 model.
+Using a model from Metalhead is as simple as selecting a model from the table of [available models](@ref API-Reference). For example, below we use the pre-trained ResNet-18 model.
 
 ```julia
 using Flux, Metalhead
diff --git a/src/convnets/alexnet.jl b/src/convnets/alexnet.jl
index 3c713839e..a4d2856ce 100644
--- a/src/convnets/alexnet.jl
+++ b/src/convnets/alexnet.jl
@@ -44,7 +44,7 @@ Create a `AlexNet`.
 
     `AlexNet` does not currently support pretrained weights.
 
-See also [`alexnet`](#).
+See also [`alexnet`](@ref).
 """
 struct AlexNet
     layers::Any
diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl
index 040a409ab..26bcd7589 100644
--- a/src/convnets/convnext.jl
+++ b/src/convnets/convnext.jl
@@ -8,7 +8,7 @@ Creates a single block of ConvNeXt.
 
   - `planes`: number of input channels.
  - `drop_path_rate`: Stochastic depth rate.
-  - `layerscale_init`: Initial value for [`LayerScale`](#)
+  - `layerscale_init`: Initial value for [`Metalhead.LayerScale`](@ref)
 """
 function convnextblock(planes::Integer, drop_path_rate = 0.0, layerscale_init = 1.0f-6)
     layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3),
@@ -34,7 +34,7 @@ Creates the layers for a ConvNeXt model.
   - `depths`: list with configuration for depth of each block
   - `planes`: list with configuration for number of output channels in each block
   - `drop_path_rate`: Stochastic depth rate.
-  - `layerscale_init`: Initial value for [`LayerScale`](#)
+  - `layerscale_init`: Initial value for [`Metalhead.LayerScale`](@ref)
     ([reference](https://arxiv.org/abs/2103.17239))
   - `inchannels`: number of input channels.
   - `nclasses`: number of output classes
@@ -87,7 +87,7 @@ Creates a ConvNeXt model.
   - `inchannels`: The number of channels in the input.
   - `nclasses`: number of output classes
 
-See also [`Metalhead.convnext`](#).
+See also [`Metalhead.convnext`](@ref).
 """
 struct ConvNeXt
     layers::Any
diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl
index eb29c4966..f6072f23c 100644
--- a/src/convnets/densenet.jl
+++ b/src/convnets/densenet.jl
@@ -64,7 +64,7 @@ Create a DenseNet model
 
   - `inplanes`: the number of input feature maps to the first dense block
  - `growth_rates`: the growth rates of output feature maps within each
-    [`dense_block`](#) (a vector of vectors)
+    [`dense_block`](@ref) (a vector of vectors)
   - `reduction`: the factor by which the number of feature maps is scaled across each transition
   - `nclasses`: the number of output classes
 
@@ -122,7 +122,7 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet.
 
     `DenseNet` does not currently support pretrained weights.
 
-See also [`Metalhead.densenet`](#).
+See also [`Metalhead.densenet`](@ref).
 """
 struct DenseNet
     layers::Any
diff --git a/src/convnets/efficientnet.jl b/src/convnets/efficientnet.jl
index 91986fb92..2abd9dc92 100644
--- a/src/convnets/efficientnet.jl
+++ b/src/convnets/efficientnet.jl
@@ -88,7 +88,7 @@ const EFFICIENTNET_GLOBAL_CONFIGS = Dict(:b0 => (224, (1.0, 1.0)),
     EfficientNet(config::Symbol; pretrain::Bool = false)
 
 Create an EfficientNet model ([reference](https://arxiv.org/abs/1905.11946v5)).
-See also [`efficientnet`](#).
+See also [`efficientnet`](@ref).
 
 # Arguments
 
diff --git a/src/convnets/inception/googlenet.jl b/src/convnets/inception/googlenet.jl
index 54f814479..19880e53b 100644
--- a/src/convnets/inception/googlenet.jl
+++ b/src/convnets/inception/googlenet.jl
@@ -71,7 +71,7 @@ Create an Inception-v1 model (commonly referred to as `GoogLeNet`)
 
    `GoogLeNet` does not currently support pretrained weights.
 
-See also [`googlenet`](#).
+See also [`googlenet`](@ref).
 """
 struct GoogLeNet
     layers::Any
diff --git a/src/convnets/inception/inceptionv3.jl b/src/convnets/inception/inceptionv3.jl
index bc5ec3a2b..d273ebc73 100644
--- a/src/convnets/inception/inceptionv3.jl
+++ b/src/convnets/inception/inceptionv3.jl
@@ -159,7 +159,7 @@ end
     Inceptionv3(; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000)
 
 Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)).
-See also [`inceptionv3`](#).
+See also [`inceptionv3`](@ref).
 
 # Arguments
 
diff --git a/src/convnets/mobilenet/mobilenetv1.jl b/src/convnets/mobilenet/mobilenetv1.jl
index b6d9fe8ee..7e2345758 100644
--- a/src/convnets/mobilenet/mobilenetv1.jl
+++ b/src/convnets/mobilenet/mobilenetv1.jl
@@ -69,7 +69,7 @@ Set `pretrain` to `true` to load the pretrained weights for ImageNet.
   - `pretrain`: Whether to load the pre-trained weights for ImageNet
  - `nclasses`: The number of output classes
 
-See also [`Metalhead.mobilenetv1`](#).
+See also [`Metalhead.mobilenetv1`](@ref).
 """
 struct MobileNetv1
     layers::Any
diff --git a/src/convnets/mobilenet/mobilenetv2.jl b/src/convnets/mobilenet/mobilenetv2.jl
index 84162e985..4069ee570 100644
--- a/src/convnets/mobilenet/mobilenetv2.jl
+++ b/src/convnets/mobilenet/mobilenetv2.jl
@@ -76,7 +76,7 @@ Set `pretrain` to `true` to load the pretrained weights for ImageNet.
   - `inchannels`: The number of input channels.
  - `nclasses`: The number of output classes
 
-See also [`Metalhead.mobilenetv2`](#).
+See also [`Metalhead.mobilenetv2`](@ref).
 """
 struct MobileNetv2
     layers::Any
diff --git a/src/convnets/mobilenet/mobilenetv3.jl b/src/convnets/mobilenet/mobilenetv3.jl
index 7d06ab14d..95dd0658c 100644
--- a/src/convnets/mobilenet/mobilenetv3.jl
+++ b/src/convnets/mobilenet/mobilenetv3.jl
@@ -108,7 +108,7 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet.
   - `inchannels`: The number of channels in the input.
  - `nclasses`: the number of output classes
 
-See also [`Metalhead.mobilenetv3`](#).
+See also [`Metalhead.mobilenetv3`](@ref).
 """
 struct MobileNetv3
     layers::Any
diff --git a/src/convnets/resnets/core.jl b/src/convnets/resnets/core.jl
index 699edcbe8..707aa2512 100644
--- a/src/convnets/resnets/core.jl
+++ b/src/convnets/resnets/core.jl
@@ -19,7 +19,7 @@ Creates a basic residual block (see [reference](https://arxiv.org/abs/1512.03385
   - `revnorm`: set to `true` to place the normalisation layer before the convolution
   - `drop_block`: the drop block layer
   - `drop_path`: the drop path layer
-  - `attn_fn`: the attention function to use. See [`squeeze_excite`](#) for an example.
+  - `attn_fn`: the attention function to use. See [`Metalhead.squeeze_excite`](@ref) for an example.
 """
 function basicblock(inplanes::Integer, planes::Integer; stride::Integer = 1,
                     reduction_factor::Integer = 1, activation = relu,
@@ -61,7 +61,7 @@ Creates a bottleneck residual block (see [reference](https://arxiv.org/abs/1512.
   - `revnorm`: set to `true` to place the normalisation layer before the convolution
   - `drop_block`: the drop block layer
   - `drop_path`: the drop path layer
-  - `attn_fn`: the attention function to use. See [`squeeze_excite`](#) for an example.
+  - `attn_fn`: the attention function to use. See [`Metalhead.squeeze_excite`](@ref) for an example.
 """
 function bottleneck(inplanes::Integer, planes::Integer; stride::Integer,
                     cardinality::Integer = 1, base_width::Integer = 64,
@@ -139,7 +139,7 @@ end
     resnet_stem(; stem_type = :default, inchannels::Integer = 3, replace_stem_pool = false,
                 norm_layer = BatchNorm, activation = relu)
 
-Builds a stem to be used in a ResNet model. See the `stem` argument of [`resnet`](#) for details
+Builds a stem to be used in a ResNet model. See the `stem` argument of [`Metalhead.resnet`](@ref) for details
 on how to use this function.
 
 # Arguments
diff --git a/src/convnets/resnets/res2net.jl b/src/convnets/resnets/res2net.jl
index 8e054da82..5d15053f0 100644
--- a/src/convnets/resnets/res2net.jl
+++ b/src/convnets/resnets/res2net.jl
@@ -18,7 +18,7 @@ Creates a bottleneck block as described in the Res2Net paper.
   - `activation`: the activation function to use.
   - `norm_layer`: the normalization layer to use.
   - `revnorm`: set to `true` to place the batch norm before the convolution
-  - `attn_fn`: the attention function to use. See [`squeeze_excite`](#) for an example.
+  - `attn_fn`: the attention function to use. See [`Metalhead.squeeze_excite`](@ref) for an example.
 """
 function bottle2neck(inplanes::Integer, planes::Integer; stride::Integer = 1,
                      cardinality::Integer = 1, base_width::Integer = 26,
diff --git a/src/convnets/resnets/resnet.jl b/src/convnets/resnets/resnet.jl
index cdccddd4b..ef517e133 100644
--- a/src/convnets/resnets/resnet.jl
+++ b/src/convnets/resnets/resnet.jl
@@ -11,7 +11,7 @@ Creates a ResNet model with the specified depth.
   - `inchannels`: The number of input channels.
  - `nclasses`: the number of output classes
 
-Advanced users who want more configuration options will be better served by using [`resnet`](#).
+Advanced users who want more configuration options will be better served by using [`Metalhead.resnet`](@ref).
 """
 struct ResNet
     layers::Any
@@ -48,7 +48,7 @@ The number of channels in outer 1x1 convolutions is the same.
   - `inchannels`: The number of input channels.
  - `nclasses`: the number of output classes
 
-Advanced users who want more configuration options will be better served by using [`resnet`](#).
+Advanced users who want more configuration options will be better served by using [`Metalhead.resnet`](@ref).
 """
 struct WideResNet
     layers::Any
diff --git a/src/convnets/resnets/resnext.jl b/src/convnets/resnets/resnext.jl
index 8c43d2f62..364807ff0 100644
--- a/src/convnets/resnets/resnext.jl
+++ b/src/convnets/resnets/resnext.jl
@@ -18,7 +18,7 @@ Creates a ResNeXt model with the specified depth, cardinality, and base width.
   - `inchannels`: the number of input channels.
  - `nclasses`: the number of output classes
 
-Advanced users who want more configuration options will be better served by using [`resnet`](#).
+Advanced users who want more configuration options will be better served by using [`Metalhead.resnet`](@ref).
 """
 struct ResNeXt
     layers::Any
diff --git a/src/convnets/resnets/seresnet.jl b/src/convnets/resnets/seresnet.jl
index da074e57d..611675bd5 100644
--- a/src/convnets/resnets/seresnet.jl
+++ b/src/convnets/resnets/seresnet.jl
@@ -15,7 +15,7 @@ Creates a SEResNet model with the specified depth.
 
    `SEResNet` does not currently support pretrained weights.
 
-Advanced users who want more configuration options will be better served by using [`resnet`](#).
+Advanced users who want more configuration options will be better served by using [`Metalhead.resnet`](@ref).
 """
 struct SEResNet
     layers::Any
@@ -58,7 +58,7 @@ Creates a SEResNeXt model with the specified depth, cardinality, and base width.
 
    `SEResNeXt` does not currently support pretrained weights.
 
-Advanced users who want more configuration options will be better served by using [`resnet`](#).
+Advanced users who want more configuration options will be better served by using [`Metalhead.resnet`](@ref).
 """
 struct SEResNeXt
     layers::Any
diff --git a/src/convnets/squeezenet.jl b/src/convnets/squeezenet.jl
index 5c688a645..fd17cbc18 100644
--- a/src/convnets/squeezenet.jl
+++ b/src/convnets/squeezenet.jl
@@ -62,7 +62,7 @@ Create a SqueezeNet
   - `inchannels`: number of input channels.
  - `nclasses`: the number of output classes.
 
-See also [`squeezenet`](#).
+See also [`squeezenet`](@ref).
 """
 struct SqueezeNet
     layers::Any
diff --git a/src/convnets/vgg.jl b/src/convnets/vgg.jl
index de232d9a3..0b0cffdfa 100644
--- a/src/convnets/vgg.jl
+++ b/src/convnets/vgg.jl
@@ -36,7 +36,7 @@ Create VGG convolution layers
 # Arguments
 
   - `config`: vector of tuples `(output_channels, num_convolutions)`
-    for each block (see [`Metalhead.vgg_block`](#))
+    for each block (see [`Metalhead.vgg_block`](@ref))
   - `batchnorm`: set to `true` to include batch normalization after each convolution
   - `inchannels`: number of input channels
 """
@@ -61,7 +61,7 @@ Create VGG classifier (fully connected) layers
 # Arguments
 
   - `imsize`: tuple `(width, height, channels)` indicating the size after
-    the convolution layers (see [`Metalhead.vgg_convolutional_layers`](#))
+    the convolution layers (see [`Metalhead.vgg_convolutional_layers`](@ref))
   - `nclasses`: number of output classes
   - `fcsize`: input and output size of the intermediate fully connected layer
   - `dropout_rate`: the dropout level between each fully connected layer
@@ -86,12 +86,12 @@ Create a VGG model
 
   - `imsize`: input image width and height as a tuple
   - `config`: the configuration for the convolution layers
-    (see [`Metalhead.vgg_convolutional_layers`](#))
+    (see [`Metalhead.vgg_convolutional_layers`](@ref))
   - `inchannels`: number of input channels
   - `batchnorm`: set to `true` to use batch normalization after each convolution
   - `nclasses`: number of output classes
   - `fcsize`: intermediate fully connected layer size
-    (see [`Metalhead.vgg_classifier_layers`](#))
+    (see [`Metalhead.vgg_classifier_layers`](@ref))
   - `dropout_rate`: dropout level between fully connected layers
 """
 function vgg(imsize::Dims{2}; config, batchnorm::Bool = false, fcsize::Integer = 4096,
@@ -122,7 +122,7 @@ Construct a VGG model with the specified input image size. Typically, the image
   - `batchnorm`: set to `true` to use batch normalization after each convolution
   - `nclasses`: number of output classes
   - `fcsize`: intermediate fully connected layer size
-    (see [`Metalhead.vgg_classifier_layers`](#))
+    (see [`Metalhead.vgg_classifier_layers`](@ref))
   - `dropout_rate`: dropout level between fully connected layers
 """
 struct VGG
@@ -156,7 +156,7 @@ Create a VGG style model with specified `depth`.
   - `inchannels`: number of input channels
  - `nclasses`: number of output classes
 
-See also [`vgg`](#).
+See also [`vgg`](@ref).
 """
 function VGG(depth::Integer; pretrain::Bool = false, batchnorm::Bool = false,
              inchannels::Integer = 3, nclasses::Integer = 1000)
diff --git a/src/layers/conv.jl b/src/layers/conv.jl
index de214bcbc..d6197c46b 100644
--- a/src/layers/conv.jl
+++ b/src/layers/conv.jl
@@ -25,7 +25,7 @@ Create a convolution + batch normalization pair with activation.
   - `pad`: padding of the convolution kernel
   - `dilation`: dilation of the convolution kernel
   - `groups`: groups for the convolution kernel
-  - `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#))
+  - `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](@ref))
 """
 function conv_norm(kernel_size, inplanes::Integer, outplanes::Integer, activation = relu;
                    norm_layer = BatchNorm, revnorm::Bool = false, preact::Bool = false,
@@ -93,7 +93,7 @@ See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1).
   - `stride`: stride of the first convolution kernel
   - `pad`: padding of the first convolution kernel
   - `dilation`: dilation of the first convolution kernel
-  - `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#))
+  - `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](@ref))
 """
 function depthwise_sep_conv_norm(kernel_size, inplanes::Integer, outplanes::Integer,
                                  activation = relu; norm_layer = BatchNorm,
@@ -122,7 +122,7 @@ Create a basic inverted residual block for MobileNet variants
   - `activation`: The activation function for the first two convolution layer
   - `stride`: The stride of the convolutional kernel, has to be either 1 or 2
   - `reduction`: The reduction factor for the number of hidden feature maps
-    in a squeeze and excite layer (see [`squeeze_excite`](#)).
+    in a squeeze and excite layer (see [`Metalhead.squeeze_excite`](@ref)).
 """
 function invertedresidual(kernel_size, inplanes::Integer, hidden_planes::Integer,
                           outplanes::Integer, activation = relu; stride::Integer,
diff --git a/src/layers/drop.jl b/src/layers/drop.jl
index 31c06c07a..94edcb61f 100644
--- a/src/layers/drop.jl
+++ b/src/layers/drop.jl
@@ -25,7 +25,7 @@ regions of size `block_size` in the input. Otherwise, it simply returns the inpu
   - `gamma_scale`: multiplicative factor for `gamma` used. For the calculations,
     refer to [the paper](https://arxiv.org/abs/1810.12890).
 
-If you are an end-user, you do not want this function. Use [`DropBlock`](#) instead.
+If you are an end-user, you do not want this function. Use [`DropBlock`](@ref) instead.
 """
 # TODO add experimental `DropBlock` options from timm such as gaussian noise and
 # more precise `DropBlock` to deal with edges (#188)
diff --git a/src/mixers/gmlp.jl b/src/mixers/gmlp.jl
index ab89baadc..7e492b182 100644
--- a/src/mixers/gmlp.jl
+++ b/src/mixers/gmlp.jl
@@ -86,7 +86,7 @@ Creates a model with the gMLP architecture.
   - `inchannels`: the number of input channels
  - `nclasses`: number of output classes
 
-See also [`Metalhead.mlpmixer`](#).
+See also [`Metalhead.mlpmixer`](@ref).
 """
 struct gMLP
     layers::Any
diff --git a/src/mixers/mlpmixer.jl b/src/mixers/mlpmixer.jl
index 37cc271fb..143b0494c 100644
--- a/src/mixers/mlpmixer.jl
+++ b/src/mixers/mlpmixer.jl
@@ -49,7 +49,7 @@ Creates a model with the MLPMixer architecture.
   - `inchannels`: the number of input channels
  - `nclasses`: number of output classes
 
-See also [`Metalhead.mlpmixer`](#).
+See also [`Metalhead.mlpmixer`](@ref).
 """
 struct MLPMixer
     layers::Any
diff --git a/src/mixers/resmlp.jl b/src/mixers/resmlp.jl
index 21ad89d65..6032f554b 100644
--- a/src/mixers/resmlp.jl
+++ b/src/mixers/resmlp.jl
@@ -48,7 +48,7 @@ Creates a model with the ResMLP architecture.
   - `inchannels`: the number of input channels
  - `nclasses`: number of output classes
 
-See also [`Metalhead.mlpmixer`](#).
+See also [`Metalhead.mlpmixer`](@ref).
 """
 struct ResMLP
     layers::Any
diff --git a/src/utilities.jl b/src/utilities.jl
index f5737831c..d4e6543d7 100644
--- a/src/utilities.jl
+++ b/src/utilities.jl
@@ -14,9 +14,7 @@ end
 
 Convenience function for applying an activation function to the output after
 summing up the input arrays. Useful as the `connection` argument for the block
-function in [`resnet`](#).
-
-See also [`reluadd`](#).
+function in [`Metalhead.resnet`](@ref).
 """
 addact(activation = relu, xs...) = activation(sum(xs))
 
@@ -25,9 +23,7 @@ addact(activation = relu, xs...) = activation(sum(xs))
 
 Convenience function for adding input arrays after applying an activation function
 to them. Useful as the `connection` argument for the block function in
-[`resnet`](#).
-
-See also [`addrelu`](#).
+[`Metalhead.resnet`](@ref).
 """
 actadd(activation = relu, xs...) = sum(activation.(x) for x in xs)
 
diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl
index 099d00639..206f40594 100644
--- a/src/vit-based/vit.jl
+++ b/src/vit-based/vit.jl
@@ -92,7 +92,7 @@ Creates a Vision Transformer (ViT) model.
   - `pool`: pooling type, either :class or :mean
  - `nclasses`: number of classes in the output
 
-See also [`Metalhead.vit`](#).
+See also [`Metalhead.vit`](@ref).
 """
 struct ViT
     layers::Any