File and classes that inherit from it. File is a special
DataModel that is generated
automatically when creating a DataChain from files, like in
DataChain.from_storage. File
classes include various metadata fields about the underlying file as well as methods to
read from the files and otherwise work with the file contents.
def export(
    self,
    output: str,
    placement: ExportPlacement = "fullpath",
    use_cache: bool = True,
) -> None:
    """Export file to new location.

    Args:
        output: Base path the file is exported under.
        placement: Strategy passed to `get_destination_path` to derive the
            file's path below `output`.
        use_cache: When true, caching is switched on for this file before
            saving. A false value leaves the current caching flag untouched.
    """
    if use_cache:
        self._caching_enabled = use_cache
    dst = self.get_destination_path(output, placement)
    dst_dir = os.path.dirname(dst)
    # A destination without a directory component (e.g. a bare file name)
    # yields "", and os.makedirs("") raises FileNotFoundError.
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)

    self.save(dst)
def get_destination_path(self, output: str, placement: ExportPlacement) -> str:
    """
    Build the full path a file is exported to below `output`.

    The relative part depends on `placement`:
      * "filename" - the unquoted file name;
      * "etag"     - the etag followed by the file suffix;
      * "fullpath" - the unquoted full name, prefixed with the source's
                     netloc when the source has a scheme other than "file";
      * "checksum" - not implemented, raises NotImplementedError.

    Any other value raises ValueError.
    """
    if placement == "checksum":
        raise NotImplementedError("Checksum placement not implemented yet")

    if placement == "filename":
        relative = unquote(self.name)
    elif placement == "etag":
        relative = f"{self.etag}{self.get_file_suffix()}"
    elif placement == "fullpath":
        relative = unquote(self.get_full_name())
        parsed = urlparse(self.source)
        # Non-local sources keep their netloc as the top-level directory.
        if parsed.scheme not in ("", "file"):
            relative = posixpath.join(parsed.netloc, relative)
    else:
        raise ValueError(f"Unsupported file export placement: {placement}")

    return posixpath.join(output, relative)  # type: ignore[union-attr]
def get_local_path(self) -> Optional[str]:
    """Look up this file's path in the catalog's local cache.

    The result is whatever the cache's `get_path` returns for this file;
    the documented contract is that this is None when the file is not
    cached.

    Raises:
        RuntimeError: if no catalog is attached to the file.
    """
    catalog = self._catalog
    if catalog is not None:
        return catalog.cache.get_path(self)
    raise RuntimeError(
        "cannot resolve local file path because catalog is not setup"
    )
@contextmanager
def open(self, mode: Literal["rb", "r"] = "rb") -> Iterator[Any]:
    """Open the file and yield a file object.

    Files without a `location` are opened through the catalog's client
    (after `ensure_cached()` when caching is enabled); in mode "r" the
    stream is wrapped in an `io.TextIOWrapper`, otherwise it is yielded
    as-is. Files with a `location` are resolved through `VFileRegistry`
    and yielded unchanged, regardless of `mode`.
    """
    if not self.location:
        if self._caching_enabled:
            self.ensure_cached()
        client: Client = self._catalog.get_client(self.source)
        with client.open_object(
            self, use_cache=self._caching_enabled, cb=self._download_cb
        ) as raw:
            if mode == "r":
                yield io.TextIOWrapper(raw)
            else:
                yield raw
        return

    with VFileRegistry.resolve(self, self.location) as vfile:  # type: ignore[arg-type]
        yield vfile
def resolve(self) -> "Self":
    """
    Check the file in its storage and return a copy with refreshed metadata.

    The storage client for `self.source` is asked for the file's info, which
    is converted and used to populate size, etag, version, is_latest and
    last_modified on a new object of the same type. When the lookup fails
    with an OS-level error, a warning is logged and the returned object
    carries placeholder metadata instead (size 0, empty etag and version,
    is_latest True, last_modified TIME_ZERO).

    Returns:
        File: A new object of the same type; `path`, `source` and
        `location` are carried over from this one.

    Raises:
        RuntimeError: if no catalog is set, or if the catalog has no client
            for the source's protocol.
    """
    catalog = self._catalog
    if catalog is None:
        raise RuntimeError("Cannot resolve file: catalog is not set")

    try:
        client = catalog.get_client(self.source)
    except NotImplementedError as e:
        raise RuntimeError(
            f"Unsupported protocol for file source: {self.source}"
        ) from e

    file_cls = type(self)
    try:
        raw_info = client.fs.info(client.get_full_path(self.path))
        meta = client.info_to_file(raw_info, self.source)
        return file_cls(
            path=self.path,
            source=self.source,
            size=meta.size,
            etag=meta.etag,
            version=meta.version,
            is_latest=meta.is_latest,
            last_modified=meta.last_modified,
            location=self.location,
        )
    except OSError as exc:
        # FileNotFoundError and PermissionError are OSError subclasses,
        # so this single clause covers all of them.
        logger.warning(
            "File system error when resolving %s: %s", self.path, str(exc)
        )
        return file_cls(
            path=self.path,
            source=self.source,
            size=0,
            etag="",
            version="",
            is_latest=True,
            last_modified=TIME_ZERO,
            location=self.location,
        )
@classmethod
def open(cls, file: "File", location: list[dict]):
    """Stream file from tar archive based on location in archive.

    Args:
        file: The virtual file being opened. Its catalog and caching flag
            are used to open the parent archive, and its name is passed to
            the resulting slice.
        location: Location entries for the file. Only a single entry is
            supported; it must provide "offset", "size" and "parent"
            (the keyword arguments used to build the archive `File`).

    Returns:
        A `FileSlice` built from the opened archive object, the offset,
        the size and the file's name.

    Raises:
        VFileError: if more than one location is given, or if "offset",
            "size" or "parent" is missing from the location entry.
    """
    # These checks previously only constructed the exception without
    # raising it, so invalid locations slipped through to later failures.
    if len(location) > 1:
        raise VFileError(file, "multiple 'location's are not supported yet")

    loc = location[0]

    if (offset := loc.get("offset", None)) is None:
        raise VFileError(file, "'offset' is not specified")

    if (size := loc.get("size", None)) is None:
        raise VFileError(file, "'size' is not specified")

    if (parent := loc.get("parent", None)) is None:
        raise VFileError(file, "'parent' is not specified")

    tar_file = File(**parent)
    tar_file._set_stream(file._catalog)

    client = file._catalog.get_client(tar_file.source)
    fd = client.open_object(tar_file, use_cache=file._caching_enabled)
    return FileSlice(fd, offset, size, file.name)
@contextmanager
def open(self, mode: Literal["rb", "r"] = "r"):
    """Open the file through the parent class, defaulting to text mode.

    Yields the same object the parent's `open` yields for the given mode.
    """
    parent_cm = super().open(mode=mode)
    with parent_cm as handle:
        yield handle