Skip to content

Commit 258467a

Browse files
hunterhector and hector.liu authored
Reader updates and backward compatibility (#360)
* some error handling * A few fields for backward compatibility. * a few datapack api fix * pylint. * Add data. * fix mypy Co-authored-by: hector.liu <[email protected]>
1 parent b6986d2 commit 258467a

File tree

7 files changed

+113
-42
lines changed

7 files changed

+113
-42
lines changed

forte/data/base_pack.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,10 @@ def __getstate__(self):
9797

9898
def __setstate__(self, state):
9999
super().__setstate__(state)
100-
self.__dict__['_pending_entries'] = {}
101-
self.__control_component: Optional[str] = None
100+
if 'meta' in self.__dict__:
101+
self._meta = self.__dict__.pop('meta')
102+
self.__control_component = None
103+
self._pending_entries = {}
102104

103105
@abstractmethod
104106
def _init_meta(self, pack_name: Optional[str] = None) -> BaseMeta:
@@ -220,8 +222,8 @@ def add_all_remaining_entries(self, component: Optional[str] = None):
220222
def serialize(self, drop_record: Optional[bool] = False) -> str:
221223
r"""Serializes a pack to a string."""
222224
if drop_record:
223-
self.creation_records.clear()
224-
self.field_records.clear()
225+
self._creation_records.clear()
226+
self._field_records.clear()
225227

226228
return jsonpickle.encode(self, unpicklable=True)
227229

@@ -249,9 +251,9 @@ def record_entry(self, entry: Entry, component_name: Optional[str] = None):
249251

250252
if c is not None:
251253
try:
252-
self.creation_records[c].add(entry.tid)
254+
self._creation_records[c].add(entry.tid)
253255
except KeyError:
254-
self.creation_records[c] = {entry.tid}
256+
self._creation_records[c] = {entry.tid}
255257

256258
def record_field(self, entry_id: int, field_name: str):
257259
"""
@@ -269,9 +271,9 @@ def record_field(self, entry_id: int, field_name: str):
269271

270272
if c is not None:
271273
try:
272-
self.field_records[c].add((entry_id, field_name))
274+
self._field_records[c].add((entry_id, field_name))
273275
except KeyError:
274-
self.field_records[c] = {(entry_id, field_name)}
276+
self._field_records[c] = {(entry_id, field_name)}
275277

276278
def on_entry_creation(self, entry: Entry,
277279
component_name: Optional[str] = None):
@@ -346,12 +348,12 @@ def get_single(self, entry_type: Type[EntryType]) -> EntryType:
346348
raise EntryNotFoundError(
347349
f"The entry {entry_type} is not found in the provided pack.")
348350

349-
def get_ids_by_component(self, component: str) -> Set[int]:
351+
def get_ids_by_creator(self, component: str) -> Set[int]:
350352
r"""Look up the component_index with key ``component``."""
351-
entry_set: Set[int] = self.creation_records[component]
353+
entry_set: Set[int] = self._creation_records[component]
352354
return entry_set
353355

354-
def get_entries_by_component(self, component: str) -> Set[EntryType]:
356+
def get_entries_by_creator(self, component: str) -> Set[EntryType]:
355357
"""
356358
Return all entries created by the particular component, an unordered
357359
set.
@@ -363,13 +365,13 @@ def get_entries_by_component(self, component: str) -> Set[EntryType]:
363365
364366
"""
365367
return {self.get_entry(tid)
366-
for tid in self.get_ids_by_component(component)}
368+
for tid in self.get_ids_by_creator(component)}
367369

368-
def get_ids_by_components(self, components: List[str]) -> Set[int]:
370+
def get_ids_by_creators(self, components: List[str]) -> Set[int]:
369371
"""Look up component_index using a list of components."""
370372
valid_component_id: Set[int] = set()
371373
for component in components:
372-
valid_component_id |= self.get_ids_by_component(component)
374+
valid_component_id |= self.get_ids_by_creator(component)
373375
return valid_component_id
374376

375377
def get_ids_by_type(self, entry_type: Type[EntryType]) -> Set[int]:

forte/data/container.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
# pylint: disable=function-redefined,multiple-statements
2020

2121
from abc import abstractmethod
22-
from typing import Dict, Generic, Set, Tuple, TypeVar
22+
from typing import Dict, Generic, Set, Tuple, TypeVar, Iterator
2323

2424
from forte.data.span import Span
2525

@@ -66,11 +66,11 @@ def current_id_counter(self) -> int:
6666
class EntryContainer(Generic[E, L, G]):
6767
def __init__(self):
6868
# Record the set of entries created by some components.
69-
self.creation_records: Dict[str, Set[int]] = {}
69+
self._creation_records: Dict[str, Set[int]] = {}
7070

7171
# Record the set of fields modified by this component. The 2-tuple
7272
# identify the entry field, such as (2, lemma).
73-
self.field_records: Dict[str, Set[Tuple[int, str]]] = {}
73+
self._field_records: Dict[str, Set[Tuple[int, str]]] = {}
7474

7575
# The Id manager controls the ID management in this container
7676
self._id_manager = EntryIdManager()
@@ -93,8 +93,16 @@ def __setstate__(self, state):
9393
- The :class:`IdManager` is recreated from the id count.
9494
"""
9595
self.__dict__.update(state)
96-
self.__dict__.pop('serialization')
97-
self._id_manager = EntryIdManager(state['serialization']['next_id'])
96+
97+
if 'creation_records' in self.__dict__:
98+
self._creation_records = self.__dict__.pop('creation_records')
99+
100+
if 'field_records' in self.__dict__:
101+
self._field_records = self.__dict__.pop('field_records')
102+
103+
if 'serialization' in self.__dict__:
104+
self._id_manager = EntryIdManager(
105+
self.__dict__.pop('serialization')['next_id'])
98106

99107
@abstractmethod
100108
def on_entry_creation(self, entry: E):
@@ -130,5 +138,8 @@ def get_span_text(self, span: Span):
130138
def get_next_id(self):
131139
return self._id_manager.get_id()
132140

141+
def get_all_creator(self) -> Iterator[str]:
142+
yield from self._creation_records.keys()
143+
133144

134145
ContainerType = TypeVar("ContainerType", bound=EntryContainer)

forte/data/data_pack.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,16 @@ def __setstate__(self, state):
105105
"""
106106
super().__setstate__(state)
107107

108+
# For backward compatibility.
109+
if 'replace_back_operations' in self.__dict__:
110+
self.__replace_back_operations = self.__dict__.pop(
111+
'replace_back_operations')
112+
if 'processed_original_spans' in self.__dict__:
113+
self.__processed_original_spans = self.__dict__.pop(
114+
'processed_original_spans')
115+
if 'orig_text_len' in self.__dict__:
116+
self.__orig_text_len = self.__dict__.pop('orig_text_len')
117+
108118
self.annotations = SortedList(self.annotations)
109119
self.links = SortedList(self.links)
110120
self.groups = SortedList(self.groups)
@@ -632,7 +642,7 @@ def get_data(self, context_type: Type[Annotation],
632642
if context_components:
633643
valid_component_id: Set[int] = set()
634644
for component in context_components:
635-
valid_component_id |= self.get_ids_by_component(component)
645+
valid_component_id |= self.get_ids_by_creator(component)
636646
valid_context_ids &= valid_component_id
637647

638648
skipped = 0
@@ -855,7 +865,8 @@ def get(self, entry_type: Type[EntryType], # type: ignore
855865
range_annotation: Optional[Annotation] = None,
856866
components: Optional[Union[str, List[str]]] = None
857867
) -> Iterable[EntryType]:
858-
r"""This is a shorthand alias to :func:`get_entries`
868+
r"""This function is used to get data from a data pack with various
869+
methods.
859870
860871
Example:
861872
@@ -875,9 +886,9 @@ def get(self, entry_type: Type[EntryType], # type: ignore
875886
range_annotation (Annotation, optional): The range of entries
876887
requested. If `None`, will return valid entries in the range of
877888
whole data_pack.
878-
components (str or list, optional): The component generating the
879-
entries requested. If `None`, will return valid entries
880-
generated by any component.
889+
components (str or list, optional): The component (creator)
890+
generating the entries requested. If `None`, will return valid
891+
entries generated by any component.
881892
"""
882893
# If we don't have any annotations, then we yield an empty list.
883894
# Note that generics do not work with annotations.
@@ -891,7 +902,7 @@ def get(self, entry_type: Type[EntryType], # type: ignore
891902
if components is not None:
892903
if isinstance(components, str):
893904
components = [components]
894-
valid_id &= self.get_ids_by_components(components)
905+
valid_id &= self.get_ids_by_creators(components)
895906

896907
# Generics do not work with range_annotation.
897908
if issubclass(entry_type, Generics):
@@ -968,7 +979,7 @@ class DataIndex(BaseIndex):
968979
def __init__(self):
969980
super().__init__()
970981
self._coverage_index: Dict[Tuple[Type[Annotation], Type[EntryType]],
971-
Dict[int, Set[int]]] = dict()
982+
Dict[int, Set[int]]] = dict()
972983
self._coverage_index_valid = True
973984

974985
@property

forte/data/multi_pack.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -534,7 +534,7 @@ def get(self, entry_type: Type[EntryType], # type: ignore
534534
if components is not None:
535535
if isinstance(components, str):
536536
components = [components]
537-
valid_id &= self.get_ids_by_components(components)
537+
valid_id &= self.get_ids_by_creators(components)
538538

539539
for entry_id in valid_id:
540540
yield self.get_entry(entry_id)

forte/data/ontology/top.py

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717

1818
import numpy as np
1919

20-
from forte.common.exception import IncompleteEntryError
2120
from forte.data.base_pack import PackType
2221
from forte.data.ontology.core import Entry, BaseLink, BaseGroup, MultiEntry
2322
from forte.data.span import Span
@@ -282,15 +281,51 @@ def __init__(
282281
@property
283282
def parent(self) -> Tuple[int, int]:
284283
if self._parent is None:
285-
raise IncompleteEntryError("Parent is not set for this link.")
284+
raise ValueError("Parent is not set for this link.")
286285
return self._parent
287286

288287
@property
289288
def child(self) -> Tuple[int, int]:
290289
if self._child is None:
291-
raise IncompleteEntryError("Child is not set for this link.")
290+
raise ValueError("Child is not set for this link.")
292291
return self._child
293292

293+
def parent_id(self) -> int:
294+
"""
295+
Return the `tid` of the parent entry.
296+
297+
Returns: The `tid` of the parent entry.
298+
"""
299+
return self.parent[1]
300+
301+
def child_id(self) -> int:
302+
"""
303+
Return the `tid` of the child entry.
304+
305+
Returns: The `tid` of the child entry.
306+
"""
307+
return self.child[1]
308+
309+
def parent_pack_id(self) -> int:
310+
"""
311+
Return the `pack_id` of the parent pack.
312+
313+
Returns: The `pack_id` of the parent pack.
314+
"""
315+
if self._parent is None:
316+
raise ValueError("Parent is not set for this link.")
317+
return self.pack.packs[self._parent[0]].pack_id
318+
319+
def child_pack_id(self) -> int:
320+
"""
321+
Return the `pack_id` of the child pack.
322+
323+
Returns: The `pack_id` of the child pack.
324+
"""
325+
if self._child is None:
326+
raise ValueError("Child is not set for this link.")
327+
return self.pack.packs[self._child[0]].pack_id
328+
294329
def set_parent(self, parent: Entry):
295330
r"""This will set the `parent` of the current instance with given Entry.
296331
The parent is saved internally as a tuple: ``pack index`` and
@@ -331,7 +366,7 @@ def get_parent(self) -> Entry:
331366
An instance of :class:`Entry` that is the parent of the link.
332367
"""
333368
if self._parent is None:
334-
raise IncompleteEntryError("The parent of this link is not set.")
369+
raise ValueError("The parent of this link is not set.")
335370

336371
pack_idx, parent_tid = self._parent
337372
return self.pack.get_subentry(pack_idx, parent_tid)
@@ -343,7 +378,7 @@ def get_child(self) -> Entry:
343378
An instance of :class:`Entry` that is the child of the link.
344379
"""
345380
if self._child is None:
346-
raise IncompleteEntryError("The parent of this link is not set.")
381+
raise ValueError("The parent of this link is not set.")
347382

348383
pack_idx, child_tid = self._child
349384
return self.pack.get_subentry(pack_idx, child_tid)

forte/data/readers/deserialize_reader.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
import logging
1415
import os
1516
from abc import ABC, abstractmethod
1617

17-
from typing import Iterator, List, Any, Union
18+
from typing import Iterator, List, Any, Union, Optional
1819

1920
from forte.common.exception import ProcessExecutionException
2021
from forte.data.data_pack import DataPack
@@ -126,14 +127,21 @@ def _parse_pack(self, multi_pack_str: str) -> Iterator[MultiPack]:
126127

127128
for pid in m_pack.pack_ids():
128129
p_content = self._get_pack_content(pid)
130+
if p_content is None:
131+
logging.warning(
132+
"Cannot locate the data pack with pid %d "
133+
"for multi pack %d", pid, m_pack.pack_id)
134+
break
129135
pack: DataPack
130136
if isinstance(p_content, str):
131137
pack = DataPack.deserialize(p_content)
132138
else:
133139
pack = p_content
134140
# Only in deserialization we can do this.
135141
m_pack.packs.append(pack)
136-
yield m_pack
142+
else:
143+
# No multi pack will be yielded if any of its packs cannot be located.
144+
yield m_pack
137145

138146
@abstractmethod
139147
def _get_multipack_content(self, *args: Any, **kwargs: Any
@@ -148,7 +156,7 @@ def _get_multipack_content(self, *args: Any, **kwargs: Any
148156
raise NotImplementedError
149157

150158
@abstractmethod
151-
def _get_pack_content(self, pack_id: int) -> Union[str, DataPack]:
159+
def _get_pack_content(self, pack_id: int) -> Union[None, str, DataPack]:
152160
"""
153161
Implementation of this method should be responsible for returning the
154162
raw string of the data pack from the pack id.
@@ -182,10 +190,14 @@ def _get_multipack_content(self) -> Iterator[str]: # type: ignore
182190
self.configs.multi_pack_dir, f)) as m_data:
183191
yield m_data.read()
184192

185-
def _get_pack_content(self, pack_id: int) -> str:
186-
with open(os.path.join(
187-
self.configs.data_pack_dir, f'{pack_id}.json')) as pack_data:
188-
return pack_data.read()
193+
def _get_pack_content(self, pack_id: int) -> Optional[str]:
194+
pack_path = os.path.join(
195+
self.configs.data_pack_dir, f'{pack_id}.json')
196+
if os.path.exists(pack_path):
197+
with open(pack_path) as pack_data:
198+
return pack_data.read()
199+
else:
200+
return None
189201

190202
@classmethod
191203
def default_configs(cls):

forte/data/readers/stave_readers.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
"""
2020

2121
import sqlite3
22-
from typing import Iterator, Dict
22+
from typing import Iterator, Dict, Optional
2323

2424
from forte.common import Resources, ProcessorConfigError
2525
from forte.common.configuration import Config
@@ -85,8 +85,8 @@ def _get_multipack_content(self) -> Iterator[str]: # type: ignore
8585
f'SELECT textPack FROM {self.configs.multipack_table}'):
8686
yield value[0]
8787

88-
def _get_pack_content(self, pack_id: int) -> DataPack:
89-
return self.data_packs[pack_id]
88+
def _get_pack_content(self, pack_id: int) -> Optional[DataPack]:
89+
return self.data_packs.get(pack_id, None)
9090

9191
@classmethod
9292
def default_configs(cls):

0 commit comments

Comments
 (0)