Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions dvc/dependency/repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,10 @@ def _make_fs(
else:
config = Config.load_file(conf)

# Set up the config so that the new DVCFileSystem uses the remote repo, but relies
# on the local cache instead of the remote's cache. This avoids re-streaming data,
# but it interferes with the call to `_get_remote_config()` downstream, which will
# need to ignore cache parameters.
config["cache"] = self.repo.config["cache"]
config["cache"]["dir"] = self.repo.cache.local_cache_dir

Expand Down
22 changes: 19 additions & 3 deletions dvc/repo/open_repo.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import os
import tempfile
import threading
Expand Down Expand Up @@ -50,7 +51,7 @@ def open_repo(url, *args, **kwargs):
if os.path.exists(url):
url = os.path.abspath(url)
try:
config = _get_remote_config(url)
config = _get_remote_config(url, *args, **kwargs)
config.update(kwargs.get("config") or {})
kwargs["config"] = config
return Repo(url, *args, **kwargs)
Expand Down Expand Up @@ -97,9 +98,24 @@ def clean_repos():
_remove(path)


def _get_remote_config(url):
def _get_remote_config(url, *args, **kwargs):
try:
repo = Repo(url)
# Deepcopy to prevent modifying the original `kwargs['config']`
config = copy.deepcopy(kwargs.get("config"))

# Import operations will use this function to get the remote's cache. However,
# while the `url` sent will point to the external repo, the cache information
# in `kwargs["config"]["cache"]["dir"]` will point to the local repo,
# see `dvc/dependency/repo.py:RepoDependency._make_fs()`.
#
# This breaks this function, since we'd be instructing `Repo()` to use the wrong
# cache to begin with. We need to remove the cache info from `kwargs["config"]`
# to read the actual remote repo data.
if config:
_ = config.pop("cache", None)

repo = Repo(url, config=config)

except NotDvcRepoError:
return {}

Expand Down
33 changes: 33 additions & 0 deletions tests/func/api/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from dvc import api
from dvc.exceptions import OutputNotFoundError, PathMissingError
from dvc.scm import CloneError, SCMError
from dvc.testing.api_tests import TestAPI # noqa: F401
from dvc.testing.tmp_dir import make_subrepo
from dvc.utils.fs import remove
Expand Down Expand Up @@ -65,6 +66,38 @@ def test_get_url_from_remote(tmp_dir, erepo_dir, cloud, local_cloud):
)


def test_get_url_ignore_scm(tmp_dir, dvc, cloud, scm):
    """Check `api.get_url` on a local repo with and without its `.git` directory.

    Covers four situations in order: a normal repo with git present, the same
    repo after `.git` is moved away (expected to raise `SCMError`), the
    gitless repo accessed with `core.no_scm` enabled (expected to succeed),
    and the gitless repo addressed through a `file://` URL (expected to raise
    `CloneError` even with `core.no_scm` enabled).
    """
    tmp_dir.add_remote(config=cloud.config)
    tmp_dir.dvc_gen("foo", "foo", commit="add foo")

    local_repo = tmp_dir.as_posix()
    remote_object = cloud / "files" / "md5" / "ac/bd18db4cc2f85cedef654fccc4a4d8"
    url_in_remote = remote_object.url

    # Baseline: git metadata is still in place.
    with_git = api.get_url("foo", repo=local_repo)
    assert with_git == url_in_remote

    # Move `.git` out of the way to mimic a gitless deployment
    # (e.g. a container image that ships without repository metadata).
    git_dir = tmp_dir / ".git"
    git_dir.rename(tmp_dir / "gitless_environment")

    # Without any override, the lookup still goes through git and fails.
    with pytest.raises(SCMError, match="is not a git repository"):
        api.get_url("foo", repo=local_repo)

    # Telling DVC to skip SCM makes the same lookup succeed.
    without_git = api.get_url(
        "foo", repo=local_repo, config={"core": {"no_scm": True}}
    )
    assert without_git == url_in_remote

    # A `file://` address triggers git regardless of `no_scm`, so it cannot
    # work in a gitless environment.
    file_scheme_repo = f"file://{local_repo}"
    with pytest.raises(CloneError, match="SCM error"):
        api.get_url(
            "foo", repo=file_scheme_repo, config={"core": {"no_scm": True}}
        )


def test_open_external(tmp_dir, erepo_dir, cloud):
erepo_dir.add_remote(config=cloud.config)

Expand Down
Loading