Skip to content

Commit

Permalink
add the ability to only download the primary html document (if it exi…
Browse files Browse the repository at this point in the history
…sts)
  • Loading branch information
tylerjthomas9 committed Jan 7, 2025
1 parent f058add commit 4f778c6
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 17 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "ScrapeSEC"
uuid = "856806e7-be2f-4540-8165-3a51303b7af0"
authors = ["tylerjthomas9 <[email protected]>"]
version = "1.1.0"
version = "1.1.1"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Expand Down
84 changes: 73 additions & 11 deletions src/download_filings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,35 @@ Function to return the text. Added this to allow custom text cleaning functions
"""
# Identity cleaner: hands the text back untouched. Serves as the default
# `clean_text` hook so callers can drop in custom cleaning functions.
function _pass_text(text)
    return text
end

"""
    _extract_period_end_date(text::String) -> String

Extract the value of the `CONFORMED PERIOD OF REPORT:` header line from the
raw text of an EDGAR filing.

Returns the date string (e.g. `"20050101"`) with surrounding whitespace
stripped, or `""` when the header is not present.
"""
function _extract_period_end_date(text::String)::String
    header = findfirst("CONFORMED PERIOD OF REPORT:", text)
    header === nothing && return ""
    # The value runs from just past the header label to the end of the line.
    # Fall back to the end of the string when the header sits on the final,
    # newline-less line (previously this case incorrectly returned "").
    line_end = findnext('\n', text, header.stop)
    stop = line_end === nothing ? lastindex(text) : prevind(text, line_end)
    # nextind/prevind keep the slice valid even if surrounding text contains
    # multibyte characters (raw byte arithmetic could hit an invalid index).
    return String(strip(text[nextind(text, header.stop):stop]))
end

"""
    get_primary_document_url(full_url, full_text, index_text) -> String

Build the URL of a filing's primary `.htm` document by scanning the filing's
index page text. `full_url` is the full-text submission URL, `full_text` its
contents (used to pull the report period date), and `index_text` the contents
of the corresponding `-index.html` page. Returns `""` when no matching
document is found.
"""
function get_primary_document_url(
    full_url::String, full_text::String, index_text::String
)::String
    report_date = _extract_period_end_date(full_text)
    # Derive the accession folder from the full-text URL:
    # ".../edgar/data/<cik>/<acc-no>.txt" -> "data/<cik>/<accnowithoutdashes>/"
    accession_path = split(full_url, "/edgar/")[2]
    folder = replace(replace(accession_path, "-" => ""), ".txt" => "") * "/"
    # Primary documents are named like "<basename>-<period>.htm" on the index page.
    doc_pattern = Regex("edgar/$(folder)\\w+-$(report_date)\\.htm")
    hit = match(doc_pattern, index_text)
    return hit === nothing ? "" : "https://www.sec.gov/Archives/" * hit.match
end

"""
```julia
function download_filing(file_name::String,
Expand All @@ -18,20 +47,40 @@ Parameters
* `new_file`: new local file
* `dest`: destination folder
* `clean_text`: function to clean text before writing to file
* `primary_document`: if true only download the primary document (if it exists)
"""
function download_filing(
file_name::String, new_file::String, dest::String; clean_text::Function=_pass_text
file_name::String,
new_file::String,
dest::String;
clean_text::Function=_pass_text,
primary_document::Bool=false,
)
# company_folder = joinpath(dest, split(new_file, "/")[end - 1])
# if !isdir(company_folder)
# mkdir(company_folder)
# end
full_url = "https://www.sec.gov/Archives/" * file_name
text = HTTP.get(full_url).body
text = String(HTTP.get(full_url).body)

company_folder = joinpath(dest, split(new_file, "/")[end - 1])
if !isdir(company_folder)
mkdir(company_folder)
if primary_document
index_url = replace(full_url, ".txt" => "-index.html")
index_text = String(HTTP.get(index_url).body)
primary_doc_url = get_primary_document_url(full_url, text, index_text)
if primary_doc_url != ""
try
text = String(HTTP.get(primary_doc_url).body)
catch e
# println("Failed to download primary document from $primary_doc_url")
# println("Using full text instead")
pass
end
end
end

text = clean_text(text)
open(new_file, "w") do f
write(f, clean_text(text))
write(f, text)
end

return nothing
Expand All @@ -48,7 +97,8 @@ function download_filings(
stop_pbar=true::Bool,
pbar_desc="Downloading Filings"::String,
running_tests=false::Bool,
clean_text=nothing
clean_text=nothing,
primary_document=false,
)
```
Expand All @@ -62,6 +112,7 @@ Parameters
* `pbar_desc`: pbar Description
* `runnings_tests`: If true, only downloads one file
* `clean_text`: function to clean text before writing to file
* `primary_document`: if true only download the primary document (if it exists)
"""
function download_filings(
filenames::AbstractVector;
Expand All @@ -71,6 +122,7 @@ function download_filings(
pbar_desc="Downloading Filings"::String,
running_tests=false::Bool,
clean_text::Function=_pass_text,
primary_document::Bool=false,
)
if download_rate > 10
download_rate = 10
Expand All @@ -89,7 +141,9 @@ function download_filings(
sleep_time = 1 / download_rate

if skip_file
filenames = filter(file -> !isfile(joinpath(dest, replace(file, "edgar/data/" => ""))), filenames)
filenames = filter(
file -> !isfile(joinpath(dest, replace(file, "edgar/data/" => ""))), filenames
)
end

if isempty(filenames)
Expand All @@ -100,7 +154,7 @@ function download_filings(
for file in filenames
full_file = joinpath(dest, replace(file, "edgar/data/" => ""))

@async download_filing(file, full_file, dest; clean_text)
@async download_filing(file, full_file, dest; clean_text, primary_document)

next!(p)
sleep(sleep_time)
Expand All @@ -126,7 +180,8 @@ function download_filings(
stop_pbar=true::Bool,
pbar_desc="Downloading Filings"::String,
running_tests=false::Bool,
clean_text=nothing
clean_text=nothing,
primary_document::Bool=false,
)
```
Expand All @@ -141,6 +196,7 @@ Parameters
* `pbar_desc`: pbar Description
* `runnings_tests`: If true, only downloads one file
* `clean_text`: function to clean text before writing to file
* `primary_document`: if true only download the primary document (if it exists)
"""
function download_filings(
metadata_file::String;
Expand All @@ -151,6 +207,7 @@ function download_filings(
pbar_desc="Downloading Filings"::String,
running_tests=false::Bool,
clean_text::Function=_pass_text,
primary_document::Bool=false,
)
if download_rate > 10
download_rate = 10
Expand All @@ -176,6 +233,7 @@ function download_filings(
pbar_desc=pbar_desc,
running_tests=running_tests,
clean_text,
primary_document,
)

return nothing
Expand All @@ -194,7 +252,8 @@ function download_filings(
skip_file=true::Bool,
skip_metadata_file=true::Bool,
running_tests=false::Bool,
clean_text=nothing
clean_text=nothing,
primary_document::Bool=false,
)
```
Expand All @@ -212,6 +271,7 @@ Parameters
* `skip_metadata_file`: If true, existing metadata files will be skipped
* `runnings_tests`: If true, only downloads one file
* `clean_text`: function to clean text before writing to file
* `primary_document`: if true only download the primary document (if it exists)
"""
function download_filings(
start_year::Int,
Expand All @@ -225,6 +285,7 @@ function download_filings(
skip_metadata_file=true::Bool,
running_tests=false::Bool,
clean_text::Function=_pass_text,
primary_document::Bool=false,
)
current_date = Dates.now() - Dates.Day(1) #https://github.com/tylerjthomas9/ScrapeSEC.jl/issues/24
current_year = Dates.year(current_date)
Expand Down Expand Up @@ -259,6 +320,7 @@ function download_filings(
pbar_desc="Downloading $(t[1]) Q$(t[2]) Filings",
running_tests=running_tests,
clean_text,
primary_document,
)
next!(p)
end
Expand Down
4 changes: 1 addition & 3 deletions src/download_metadata.jl
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,7 @@ function download_metadata_files(
n_files = size(urls, 1)
p = Progress(n_files; desc="Downloading Metadata CSVs...")
for url in urls
ScrapeSEC.download_metadata(
url; dest=dest, skip_file=skip_file, verbose=verbose
)
ScrapeSEC.download_metadata(url; dest=dest, skip_file=skip_file, verbose=verbose)
next!(p)
end
finish!(p)
Expand Down
30 changes: 28 additions & 2 deletions test/download_filings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,20 @@
rm(temp_file)
end

@testset "download_filings()" begin
# Exercises `download_filing` with `primary_document=true` for two accessions.
# NOTE(review): hits the live SEC EDGAR service — network access required, and
# the asserts only check that *a* file was written, not which document variant.
@testset "download_filing() (Download primary document)" begin
    file_name = "edgar/data/880794/9999999997-05-050434.txt"
    temp_file = "./temp_filing.txt"
    dest = "./"
    download_filing(file_name, temp_file, dest; primary_document=true)
    @test isfile(temp_file)
    rm(temp_file)
    # Second accession reuses the same temp path after the first was removed.
    file_name = "edgar/data/775057/0001096906-21-003058.txt"
    download_filing(file_name, temp_file, dest; primary_document=true)
    @test isfile(temp_file)
    rm(temp_file)
end

@testset "download_filings() (only primary document)" begin
download_filings(
1994,
1994;
Expand All @@ -23,5 +36,18 @@ end
download_filings(1994, 1994; filing_types=["40-F"])

rm("./metadata/1994-QTR4.tsv")
    # TODO: Is it safe to clear the temp dir? I don't want to accidentally delete user files
end

# End-to-end check of `download_filings(start_year, end_year; ...)` with
# `primary_document=true`; per the docstring, `running_tests=true` limits the
# run to a single file. NOTE(review): downloads live data from SEC EDGAR —
# network access required; only the metadata TSV's existence is asserted.
@testset "download_filings()" begin
    download_filings(
        2024,
        2024;
        quarters=[1],
        dest="./temp/",
        metadata_dest="./metadata/",
        running_tests=true,
        primary_document=true,
    )
    @test isfile("./metadata/2024-QTR1.tsv")
    rm("./metadata/2024-QTR1.tsv")
end

0 comments on commit 4f778c6

Please sign in to comment.