Skip to content

downloader

xsdata.utils.downloader

Downloader

Remote recursive resource downloader.

Helper class to download a schema or a definitions document, together with all of its imports, to a local directory. The import paths will be adjusted if necessary.

Parameters:

Name Type Description Default
output Path

The output path

required

Attributes:

Name Type Description
base_path

The base path for the resources

downloaded dict

A cache of the downloaded resources

Source code in xsdata/utils/downloader.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
class Downloader:
    """Remote recursive resource downloader.

    Helper class to download a schema or a definitions document, together
    with all of its imports, to a local directory. Import locations inside
    the written files are rewritten to relative local paths when they point
    to a resource that has already been written.

    Args:
        output: The output path

    Attributes:
        output: The directory the downloaded files are written under
        base_path: The base path for the resources. Initialised to ``None``;
            no method of this class reads or updates it.
        downloaded: A cache of the downloaded resources, keyed by uri. The
            value is ``None`` while a download is in progress and the path
            of the written file afterwards.
    """

    __slots__ = ("base_path", "downloaded", "output")

    def __init__(self, output: Path):
        """Initialize the downloader."""
        self.output = output
        # ``base_path`` is declared in ``__slots__`` and documented on the
        # class; assign it so that reading it never raises AttributeError.
        self.base_path: Path | None = None
        self.downloaded: dict = {}

    def wget(self, uri: str, location: str | None = None) -> None:
        """Download handler for any uri input with circular protection.

        Args:
            uri: The resource URI to fetch
            location: The import location of the resource, forwarded to
                `write_file`
        """
        if uri in self.downloaded:
            return

        # Register the uri before recursing so that circular imports
        # terminate instead of fetching the same resource again.
        self.downloaded[uri] = None
        logger.info("Fetching %s", uri)

        input_stream = opener.open(uri).read()  # nosec
        # Nested imports are processed first, so their local paths are
        # already known when this resource's imports are adjusted.
        if uri.endswith("wsdl"):
            self.parse_definitions(uri, input_stream)
        else:
            self.parse_schema(uri, input_stream)

        self.write_file(uri, location, input_stream.decode())

    def parse_schema(self, uri: str, content: bytes) -> None:
        """Convert content to a schema instance and process all sub imports.

        Args:
            uri: The resource URI, used as the parser location
            content: The raw resource bytes
        """
        parser = SchemaParser(location=uri)
        schema = parser.from_bytes(content, Schema)
        self.wget_included(schema)

    def parse_definitions(self, uri: str, content: bytes) -> None:
        """Convert content to a definitions instance and process all sub imports.

        The imports of the definitions itself are processed first, followed
        by the imports of every schema listed in ``definitions.schemas``.

        Args:
            uri: The resource URI, used as the parser location
            content: The raw resource bytes
        """
        parser = DefinitionsParser(location=uri)
        definitions = parser.from_bytes(content, Definitions)
        self.wget_included(definitions)

        for schema in definitions.schemas:
            self.wget_included(schema)

    def wget_included(self, definition: Schema | Definitions) -> None:
        """Download the resources included by a schema or definitions.

        Entries without a location are skipped.

        Args:
            definition: The schema or definitions instance
        """
        for included in definition.included():
            if included.location:
                # Not every included entry has a ``schema_location``
                # attribute, hence the getattr with a default.
                schema_location = getattr(included, "schema_location", None)
                self.wget(included.location, schema_location)

    def write_file(self, uri: str, location: str | None, content: str) -> None:
        """Write the downloaded uri to a local file.

        Keep track of all the written file paths, in case we have to
        modify the location attribute in an upcoming schema/definition
        import. The local path mirrors the host and path of the source
        url under the output directory.

        Args:
            uri: The resource URI
            location: The import location of the resource
            content: The raw content string

        Raises:
            ValueError: If the uri is a ``file:`` uri and the location is
                missing or is a ``file:`` uri as well.
        """
        if uri.startswith("file:"):
            # This happens for bundled schemas (xlink.xsd, xml.xsd)
            # which are intercepted by opener and returned as file://
            if not location or location.startswith("file:"):
                raise ValueError(
                    f"Cannot download local file without HTTP location: {uri}\n"
                )

            parsed = urlparse(location)
        else:
            parsed = urlparse(uri)

        rel_path = parsed.netloc + "/" + parsed.path.lstrip("/")
        file_path = self.output.joinpath(rel_path)

        content = self.adjust_imports(file_path.parent, content)
        file_path.parent.mkdir(parents=True, exist_ok=True)
        file_path.write_text(content, encoding="utf-8")

        logger.info("Writing %s", file_path)
        self.downloaded[uri] = file_path

    def adjust_imports(self, path: Path, content: str) -> str:
        """Update the location of the imports to point to the downloaded files.

        Every ``...ocation="value"`` attribute whose value is the uri of an
        already written resource is replaced with the path of that file
        relative to ``path``, using forward slashes.

        Args:
            path: The directory of the file the content is written to
            content: The raw content string

        Returns:
            The content with the adjusted locations.
        """

        def relocate(found: re.Match) -> str:
            target = self.downloaded.get(found.group(1))
            # Unknown or still pending (``None``) resources stay untouched.
            if not isinstance(target, Path):
                return found.group(0)

            relative = os.path.relpath(target, path).replace("\\", "/")
            return f'ocation="{relative}"'

        # ``[^"]*`` stops at the closing quote of the attribute. A greedy
        # ``.*`` would swallow any attributes that follow on the same line
        # and the lookup in ``downloaded`` would never succeed.
        return re.sub(r"ocation=\"([^\"]*)\"", relocate, content)

__init__(output)

Initialize the downloader.

Source code in xsdata/utils/downloader.py
29
30
31
32
def __init__(self, output: Path):
    """Initialize the downloader.

    Args:
        output: The output path
    """
    self.output = output
    # ``base_path`` is declared in the class ``__slots__``; assign it so
    # that reading it never raises AttributeError.
    self.base_path: Path | None = None
    self.downloaded: dict = {}

wget(uri, location=None)

Download handler for any uri input with circular protection.

Source code in xsdata/utils/downloader.py
34
35
36
37
38
39
40
41
42
43
44
45
46
def wget(self, uri: str, location: str | None = None) -> None:
    """Download handler for any uri input with circular protection."""
    if uri not in self.downloaded:
        self.downloaded[uri] = None
        logger.info("Fetching %s", uri)

        input_stream = opener.open(uri).read()  # nosec
        if uri.endswith("wsdl"):
            self.parse_definitions(uri, input_stream)
        else:
            self.parse_schema(uri, input_stream)

        self.write_file(uri, location, input_stream.decode())

parse_schema(uri, content)

Convert content to a schema instance and process all sub imports.

Source code in xsdata/utils/downloader.py
48
49
50
51
52
def parse_schema(self, uri: str, content: bytes) -> None:
    """Parse the content as a schema and download everything it includes.

    Args:
        uri: The resource URI, used as the parser location
        content: The raw resource bytes
    """
    parsed = SchemaParser(location=uri).from_bytes(content, Schema)
    self.wget_included(parsed)

parse_definitions(uri, content)

Convert content to a definitions instance and process all sub imports.

Source code in xsdata/utils/downloader.py
54
55
56
57
58
59
60
61
def parse_definitions(self, uri: str, content: bytes) -> None:
    """Parse the content as definitions and download everything they include.

    The imports of the definitions itself are handled first, then the
    imports of each schema listed in ``definitions.schemas``.

    Args:
        uri: The resource URI, used as the parser location
        content: The raw resource bytes
    """
    document = DefinitionsParser(location=uri).from_bytes(content, Definitions)
    self.wget_included(document)

    for embedded in document.schemas:
        self.wget_included(embedded)

wget_included(definition)

Download the resources included by the given schema or definitions.

Source code in xsdata/utils/downloader.py
63
64
65
66
67
68
def wget_included(self, definition: Schema | Definitions) -> None:
    """Download every located resource included by a schema or definitions.

    Entries without a location are skipped.

    Args:
        definition: The schema or definitions instance
    """
    for entry in definition.included():
        if not entry.location:
            continue

        # Not every included entry has a ``schema_location`` attribute.
        hint = getattr(entry, "schema_location", None)
        self.wget(entry.location, hint)

write_file(uri, location, content)

Write the downloaded uri to a local file.

Keep track of all the written file paths, in case we have to modify the location attribute in an upcoming schema/definition import.

Parameters:

Name Type Description Default
uri str

The resource URI

required
location str | None

The import location of the resource

required
content str

The raw content string

required
Source code in xsdata/utils/downloader.py
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def write_file(self, uri: str, location: str | None, content: str) -> None:
    """Write the downloaded uri to a local file.

    Keep track of all the written file paths, in case we have to
    modify the location attribute in an upcoming schema/definition
    import.

    Args:
        uri: The resource URI
        location: The import location of the resource
        content: The raw content string
    """
    if uri.startswith("file:"):
        # This happens for bundled schemas (xlink.xsd, xml.xsd)
        # which are intercepted by opener and returned as file://
        if not location or location.startswith("file:"):
            raise ValueError(
                f"Cannot download local file without HTTP location: {uri}\n"
            )

        parsed = urlparse(location)
    else:
        parsed = urlparse(uri)

    rel_path = parsed.netloc + "/" + parsed.path.lstrip("/")
    file_path = self.output.joinpath(rel_path)

    content = self.adjust_imports(file_path.parent, content)
    file_path.parent.mkdir(parents=True, exist_ok=True)
    file_path.write_text(content, encoding="utf-8")

    logger.info("Writing %s", file_path)
    self.downloaded[uri] = file_path

adjust_imports(path, content)

Update the location of the imports to point to the downloaded files.

Source code in xsdata/utils/downloader.py
104
105
106
107
108
109
110
111
112
113
def adjust_imports(self, path: Path, content: str) -> str:
    """Update the location of the imports to point to the downloaded files.

    Every ``...ocation="value"`` attribute whose value is the uri of an
    already written resource is replaced with the path of that file
    relative to ``path``, using forward slashes.

    Args:
        path: The directory of the file the content is written to
        content: The raw content string

    Returns:
        The content with the adjusted locations.
    """

    def relocate(found: re.Match) -> str:
        target = self.downloaded.get(found.group(1))
        # Unknown or still pending (``None``) resources stay untouched.
        if not isinstance(target, Path):
            return found.group(0)

        relative = os.path.relpath(target, path).replace("\\", "/")
        return f'ocation="{relative}"'

    # ``[^"]*`` stops at the closing quote of the attribute. A greedy
    # ``.*`` would swallow any attributes that follow on the same line
    # and the lookup in ``downloaded`` would never succeed.
    return re.sub(r"ocation=\"([^\"]*)\"", relocate, content)