Skip to content

Commit

Permalink
add URL download functionality and rebuild package
Browse files Browse the repository at this point in the history
  • Loading branch information
thartl-diw committed Jan 10, 2025
1 parent 1afd0a3 commit 8cc024f
Show file tree
Hide file tree
Showing 8 changed files with 49 additions and 25 deletions.
Binary file added build/lib/opendataformat/data/example_data.zip
Binary file not shown.
18 changes: 9 additions & 9 deletions build/lib/opendataformat/docu_odf.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,38 +57,38 @@ def docu_odf(x, metadata = "all", languages = "all"):
Examples
--------
Extract all metadata from a DataFrame:
>>> import opendataformat as odf
>>> df = pd.DataFrame()
>>> df.attrs = {"label_en": "English Label", "label_fr": "French Label", "url": "https://example.com"}
>>> docu_odf(df)
>>> odf.docu_odf(df)
label_en: English Label
label_fr: French Label
url: https://example.com
Extract specific metadata type:
>>> docu_odf(df, metadata="label")
>>> odf.docu_odf(df, metadata="label")
label_en: English Label
label_fr: French Label
Extract metadata filtered by language:
>>> label = docu_odf(df, metadata="label", languages="en")
>>> label = odf.docu_odf(df, metadata="label", languages="en")
label_en: English Label
>>> print(label)
English Label
Extract dataset level metadata from a DataFrame:
>>> df = read_odf("example_dataset.zip")
>>> df = odf.read_odf("example_dataset.zip")
>>> df.attrs = {'study': 'study name',
'dataset': 'dataset name',
'label_en': 'label in english',
'label_de': 'label in german',
'description_en': 'details in english',
'description_de': 'details in german',
'url': 'https://example.url'}
>>> docu_odf(df)
>>> odf.docu_odf(df)
study: study name
dataset: dataset name
label_en: label in english
Expand All @@ -99,21 +99,21 @@ def docu_odf(x, metadata = "all", languages = "all"):
Extract specific variable metadata:
>>> docu_odf(df['variable_name'])
>>> odf.docu_odf(df['variable_name'])
name:variable
label_en: english label
label_de: german label
url: https://example.url
Extract specific metadata type:
>>> docu_odf(df, metadata="label")
>>> odf.docu_odf(df, metadata="label")
label_en: English label
label_de: German label
Extract metadata filtered by language:
>>> label = docu_odf(df, metadata="label", languages="en")
>>> label = odf.docu_odf(df, metadata="label", languages="en")
label_en: English Label
>>> print(label)
English Label
Expand Down
38 changes: 32 additions & 6 deletions build/lib/opendataformat/read_odf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@
import zipfile
import xml.etree.ElementTree as ET
import os


from tempfile import gettempdir
from urllib.request import urlretrieve
from urllib.parse import urlparse



Expand Down Expand Up @@ -53,12 +54,12 @@ def read_odf(path, languages = "all", usecols = None, skiprows=None, nrows=None,
Examples
--------
Read an ODF file and load all columns:
>>> df = read_odf("example_dataset.zip")
>>> import opendataformat as odf
>>> df = odf.read_odf("example_dataset.zip")
Read an ODF zipfile, selecting specific language:
>>> df = read_odf("example.zip", language="en")
>>> df = odf.read_odf("example.zip", languages="en")
"""

Expand All @@ -67,9 +68,29 @@ def read_odf(path, languages = "all", usecols = None, skiprows=None, nrows=None,
if (not path.endswith(".zip") and not os.path.exists(path)) or (not path.endswith(".zip") and os.path.exists(path + ".zip")) :
path = path + ".zip"

if not os.path.exists(path):
if not os.path.exists(path) and not is_url(path):
raise FileNotFoundError(f"The file {path} was not found.")

# Download file to tempdir if path is URL
if is_url(path):
# Get the system's temporary directory
temp_dir = gettempdir()

fname = path.split("/")[-1]
# Define the full path where the file will be saved
file_path = os.path.join(temp_dir, fname)

# Download the file using urllib
try:
urlretrieve(path, file_path)
except Exception:
raise Exception("Downloading file from URL failed.")

path = file_path

if not os.path.exists(path):
FileNotFoundError(f"The file {path} was not found.")

if not path.endswith(".zip") and (not os.path.exists(path + "/data.csv") or not os.path.exists(path + "/metadata.xml")):
raise FileNotFoundError(f"A file {path + '.zip'} was not found and in the folder {path} expected metadata.xml and data.csv.")

Expand Down Expand Up @@ -310,3 +331,8 @@ def make_variables_dic(root, variables):
dictionaries[varname] = dictionary

return dictionaries

def is_url(path):
    """
    Check whether ``path`` looks like a URL rather than a local file path.

    A value is treated as a URL only when it parses with both a scheme
    (e.g. "http", "https") and a network location (e.g. "example.com").
    Plain file names, relative/absolute paths, Windows drive paths such as
    ``C:\\data\\file.zip`` (scheme but no network location) and ``file:///``
    URLs (empty network location) are therefore not considered URLs.

    Parameters
    ----------
    path : str or bytes
        The path or URL to inspect. Any other type is never a URL.

    Returns
    -------
    bool
        True if ``path`` has both a scheme and a network location,
        False otherwise (including for malformed or non-string input).

    Examples
    --------
    >>> is_url("https://example.com/data.zip")
    True
    >>> is_url("example_dataset.zip")
    False
    """
    # urlparse only understands str/bytes; other objects (e.g. pathlib.Path)
    # would fail inside urlparse with an unrelated AttributeError.
    if not isinstance(path, (str, bytes)):
        return False

    try:
        parsed = urlparse(path)
    except ValueError:
        # urlparse raises ValueError for malformed authorities (for example
        # an unbalanced "[" in the host). A predicate should answer False
        # here instead of propagating the parsing error to the caller.
        return False

    # A URL typically has a scheme (e.g., "http", "https") and a network location (netloc)
    return bool(parsed.scheme) and bool(parsed.netloc)
8 changes: 4 additions & 4 deletions build/lib/opendataformat/write_odf.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,18 +59,18 @@ def write_odf(x, path, languages = "all"):
Examples
--------
Write a DataFrame to an ODF file, including all metadata:
>>> import opendataformat as odf
>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
>>> df.attrs = {"label_en": "English Label", "label_de": "German Label", "description_en": "Example dataset"}
>>> write_odf(df, "output.zip")
>>> odf.write_odf(df, "output.zip")
Write a DataFrame to an ODF file, filtering metadata by language:
>>> write_odf(df, "output.zip", languages="en")
>>> odf.write_odf(df, "output.zip", languages="en")
Write a DataFrame to an ODF file, including metadata for multiple languages:
>>> write_odf(df, "output.zip", languages="all")
>>> odf.write_odf(df, "output.zip", languages="all")
"""

if (not isinstance(x, pd.DataFrame)):
Expand Down
Binary file added dist/opendataformat-0.2-py3-none-any.whl
Binary file not shown.
Binary file added dist/opendataformat-0.2.tar.gz
Binary file not shown.
6 changes: 3 additions & 3 deletions opendataformat.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
Metadata-Version: 2.1
Name: opendataformat
Version: 0.1
Version: 0.2
Summary: The Open Data Format (ODF) is a new, non-proprietary, multilingual, metadata enriched, and zip-compressed data format with metadata structured in the Data Documentation Initiative (DDI) Codebook standard. This package allows reading and writing of data files in the Open Data Format (ODF) in Python, and displaying metadata in different languages. For further information on the Open Data Format, see <https://opendataformat.github.io/>.
Home-page: https://github.com/opendataformat/py-package-opendataformat
Author: Your Name
Author-email: [email protected]
Author: Xiaoyao Han
Author-email: [email protected]
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Expand Down
4 changes: 1 addition & 3 deletions opendataformat.egg-info/SOURCES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,4 @@ opendataformat.egg-info/SOURCES.txt
opendataformat.egg-info/dependency_links.txt
opendataformat.egg-info/requires.txt
opendataformat.egg-info/top_level.txt
opendataformat/data/data.zip
opendataformat/data/data_with_default.zip
opendataformat/data/data_with_missings.zip
opendataformat/data/example_data.zip

0 comments on commit 8cc024f

Please sign in to comment.