Add text as html to orig elements chunks #15123
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: CI | |
on: | |
push: | |
branches: [ main ] | |
pull_request: | |
branches: [ main ] | |
merge_group: | |
branches: [ main ] | |
permissions: | |
id-token: write | |
contents: read | |
env: | |
NLTK_DATA: ${{ github.workspace }}/nltk_data | |
jobs: | |
setup: | |
strategy: | |
matrix: | |
python-version: ["3.9","3.10","3.11", "3.12"] | |
runs-on: ubuntu-latest | |
steps: | |
- uses: actions/checkout@v4 | |
- uses: ./.github/actions/base-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
check-only: 'true' | |
check-deps: | |
strategy: | |
matrix: | |
python-version: ["3.9","3.10","3.11", "3.12"] | |
runs-on: ubuntu-latest | |
steps: | |
- uses: actions/checkout@v4 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v5 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Check for dependency conflicts | |
run: make check-deps | |
check-extras: | |
strategy: | |
matrix: | |
python-version: [ "3.9","3.10","3.11","3.12" ] | |
runs-on: ubuntu-latest | |
steps: | |
- uses: actions/checkout@v4 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v5 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Install all extras | |
run: make check-extras | |
check-licenses: | |
strategy: | |
matrix: | |
python-version: [ "3.12" ] | |
runs-on: ubuntu-latest | |
steps: | |
- uses: actions/checkout@v4 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v5 | |
with: | |
python-version: ${{ matrix.python-version }} | |
# NOTE(robinson) - dependencies are installed first because liccheck | |
# produces an error if there is a a mismatch between the dep version | |
# in the requirements file and the dep version in site packages | |
- name: Install all doc and test dependencies | |
run: | | |
make install-ci | |
make check-licenses | |
lint: | |
strategy: | |
matrix: | |
python-version: ["3.9","3.10","3.11"] | |
runs-on: ubuntu-latest | |
needs: [setup, changelog] | |
steps: | |
- uses: actions/checkout@v4 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v5 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Setup virtual environment | |
uses: ./.github/actions/base-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Lint | |
run: | | |
source .venv/bin/activate | |
make install-ci | |
make check | |
shellcheck: | |
runs-on: ubuntu-latest | |
steps: | |
- uses: actions/checkout@v4 | |
- name: ShellCheck | |
uses: ludeeus/action-shellcheck@master | |
shfmt: | |
runs-on: ubuntu-latest | |
steps: | |
- uses: actions/checkout@v4 | |
- name: setup shfmt | |
uses: mfinelli/setup-shfmt@v3 | |
- name: Run shfmt | |
run: shfmt -i 2 -d . | |
test_unit: | |
strategy: | |
matrix: | |
python-version: ["3.9","3.10","3.11", "3.12"] | |
runs-on: ubuntu-latest | |
needs: [setup, lint] | |
steps: | |
- uses: actions/checkout@v4 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v5 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Setup virtual environment | |
uses: ./.github/actions/base-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Test | |
env: | |
UNS_API_KEY: ${{ secrets.UNS_API_KEY }} | |
TESSERACT_VERSION : "5.4.1" | |
run: | | |
source .venv/bin/activate | |
sudo apt-get update | |
sudo apt-get install -y libmagic-dev poppler-utils libreoffice | |
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 | |
sudo apt-get update | |
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor | |
tesseract --version | |
installed_tesseract_version=$(tesseract --version | grep -oP '(?<=tesseract )\d+\.\d+\.\d+') | |
if [ "$installed_tesseract_version" != "${{env.TESSERACT_VERSION}}" ]; then | |
echo "Tesseract version ${{env.TESSERACT_VERSION}} is required but found version $installed_tesseract_version" | |
exit 1 | |
fi | |
# FIXME (yao): sometimes there is cache but we still miss argilla in the env; so we add make install-ci again | |
make install-ci | |
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true | |
make check-coverage | |
test_unit_no_extras: | |
strategy: | |
matrix: | |
python-version: ["3.10"] | |
runs-on: ubuntu-latest | |
needs: [setup, lint] | |
steps: | |
- uses: actions/checkout@v4 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v5 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Setup virtual environment | |
uses: ./.github/actions/base-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Test | |
env: | |
UNS_API_KEY: ${{ secrets.UNS_API_KEY }} | |
run: | | |
source .venv/bin/activate | |
make install-ci | |
make install-nltk-models | |
make test-no-extras CI=true | |
test_unit_dependency_extras: | |
# NOTE(newelh) - Split extras into separate steps in the same pipeline (avoid using matrix) | |
strategy: | |
matrix: | |
python-version: ["3.10"] | |
extra: ["csv", "docx", "odt", "markdown", "pypandoc", "pdf-image", "pptx", "xlsx"] | |
runs-on: ubuntu-latest | |
needs: [setup, lint, test_unit_no_extras] | |
steps: | |
- uses: actions/checkout@v4 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v5 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- uses: actions/cache/restore@v4 | |
id: virtualenv-cache | |
with: | |
path: | | |
nltk_data | |
key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }} | |
- name: Setup virtual environment | |
run: | | |
python${{ matrix.python-version}} -m venv .venv-${{ matrix.extra }} | |
source .venv-${{ matrix.extra }}/bin/activate | |
make install-base-ci | |
make install-${{ matrix.extra }} | |
- name: Test | |
env: | |
UNS_API_KEY: ${{ secrets.UNS_API_KEY }} | |
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
run: | | |
source .venv-${{ matrix.extra }}/bin/activate | |
# NOTE(newelh) - determine what needs to be installed here | |
sudo apt-get update | |
sudo apt-get install -y libmagic-dev poppler-utils libreoffice | |
make install-pandoc | |
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 | |
sudo apt-get update | |
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor | |
tesseract --version | |
make install-${{ matrix.extra }} | |
make test-extra-${{ matrix.extra }} CI=true | |
setup_ingest: | |
strategy: | |
matrix: | |
python-version: [ "3.9","3.10" ] | |
runs-on: ubuntu-latest | |
needs: [setup] | |
steps: | |
- uses: actions/checkout@v4 | |
- uses: ./.github/actions/base-ingest-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
check-only: 'true' | |
test_ingest_src: | |
strategy: | |
matrix: | |
python-version: ["3.9","3.10"] | |
runs-on: ubuntu-latest-m | |
needs: [setup_ingest, lint] | |
steps: | |
# actions/checkout MUST come before auth | |
- uses: 'actions/checkout@v4' | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v5 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Get full Python version | |
id: full-python-version | |
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT | |
- name: Setup virtual environment | |
uses: ./.github/actions/base-ingest-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Setup docker-compose | |
uses: KengoTODA/actions-setup-docker-compose@v1 | |
with: | |
version: '2.22.0' | |
- name: Test (end-to-end) | |
env: | |
AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }} | |
BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }} | |
CONFLUENCE_API_TOKEN: ${{ secrets.CONFLUENCE_API_TOKEN }} | |
CONFLUENCE_USER_EMAIL: ${{ secrets.CONFLUENCE_USER_EMAIL }} | |
DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }} | |
DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }} | |
DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }} | |
DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }} | |
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} | |
GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }} | |
HUBSPOT_API_TOKEN: ${{ secrets.HUBSPOT_API_TOKEN }} | |
JIRA_INGEST_API_TOKEN: ${{ secrets.JIRA_INGEST_API_TOKEN }} | |
JIRA_INGEST_USER_EMAIL: ${{ secrets.JIRA_INGEST_USER_EMAIL }} | |
MONGODB_URI: ${{ secrets.MONGODB_URI }} | |
MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }} | |
MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }} | |
MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }} | |
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }} | |
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }} | |
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }} | |
SALESFORCE_USERNAME: ${{secrets.SALESFORCE_USERNAME}} | |
SALESFORCE_CONSUMER_KEY: ${{secrets.SALESFORCE_CONSUMER_KEY}} | |
SALESFORCE_PRIVATE_KEY: ${{secrets.SALESFORCE_PRIVATE_KEY}} | |
SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}} | |
SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}} | |
SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}} | |
SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}} | |
SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}} | |
SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}} | |
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} | |
UNS_API_KEY: ${{ secrets.UNS_API_KEY }} | |
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }} | |
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} | |
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} | |
AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }} | |
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }} | |
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} | |
OCTOAI_API_KEY: ${{ secrets.OCTOAI_API_KEY }} | |
PINECONE_API_KEY: ${{secrets.PINECONE_API_KEY}} | |
ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}} | |
ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}} | |
MXBAI_API_KEY: ${{secrets.MXBAI_API_KEY}} | |
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" | |
CI: "true" | |
PYTHON: python${{ matrix.python-version }} | |
run: | | |
source .venv/bin/activate | |
sudo apt-get update | |
sudo apt-get install -y libmagic-dev poppler-utils libreoffice | |
make install-pandoc | |
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 | |
sudo apt-get update | |
sudo apt-get install -y tesseract-ocr | |
sudo apt-get install -y tesseract-ocr-kor | |
sudo apt-get install diffstat | |
tesseract --version | |
make install-all-docs | |
make install-ingest | |
./test_unstructured_ingest/test-ingest-src.sh | |
test_unstructured_api_unit: | |
strategy: | |
matrix: | |
# NOTE(yuming): Unstructured API only use Python 3.10 | |
python-version: ["3.10"] | |
runs-on: ubuntu-latest | |
needs: [setup, lint] | |
steps: | |
- uses: actions/checkout@v4 | |
- name: Setup virtual environment | |
uses: ./.github/actions/base-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Set up flag for running Unstructured API unit tests | |
run: | | |
# NOTE: Change env `SKIP_API_UNIT_FOR_BREAKING_CHANGE` to true if there is a breaking change in Unstructured repo that will break unstructured api unit tests | |
# TODO: Change env back to false once API unit tests is in sync with unstructured repo | |
echo "SKIP_API_UNIT_FOR_BREAKING_CHANGE=true" >> $GITHUB_ENV | |
- name: Set up Python ${{ matrix.python-version }} | |
if: env.SKIP_API_UNIT_FOR_BREAKING_CHANGE == 'false' | |
uses: actions/setup-python@v5 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Setup virtual environment (no cache hit) | |
if: steps.virtualenv-cache.outputs.cache-hit != 'true' && env.SKIP_API_UNIT_FOR_BREAKING_CHANGE == 'false' | |
run: | | |
python${{ matrix.python-version}} -m venv .venv | |
source .venv/bin/activate | |
mkdir "$NLTK_DATA" | |
make install-ci | |
- name: Test Unstructured API Unit | |
if: env.SKIP_API_UNIT_FOR_BREAKING_CHANGE == 'false' | |
run: | | |
source .venv/bin/activate | |
# FIXME (yao): sometimes there is cache but we still miss argilla in the env; so we add make install-ci again | |
make install-ci | |
sudo apt-get update && sudo apt-get install --yes poppler-utils libreoffice | |
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 | |
sudo apt-get update | |
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor | |
tesseract --version | |
make install-nltk-models | |
make test-unstructured-api-unit | |
changelog: | |
runs-on: ubuntu-latest | |
steps: | |
# need to checkout otherwise paths-filter will fail on merge-queue trigger | |
- uses: actions/checkout@v4 | |
- if: github.ref != 'refs/heads/main' | |
uses: dorny/paths-filter@v3 | |
id: changes | |
with: | |
filters: | | |
src: | |
- 'unstructured/**' | |
- if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main' | |
uses: dangoslen/changelog-enforcer@v3 | |
# TODO - figure out best practice for caching docker images | |
# (Using the virtualenv to get pytest) | |
test_dockerfile: | |
runs-on: ubuntu-latest-m | |
needs: [ setup, lint ] | |
steps: | |
- uses: actions/checkout@v4 | |
- name: Test Dockerfile | |
run: | | |
echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file | |
make docker-build | |
make docker-test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true | |
- name: Scan image | |
uses: anchore/scan-action@v3 | |
with: | |
image: "unstructured:dev" | |
severity-cutoff: critical | |
only-fixed: true |