Changed scripts to not infer field purposes from CSV values

Note there's a bit of subtlety here, field _types_ are still infered,
but the intention of the fields, i.e. if the field contains data vs
row name/other properties, must be unambiguous in the scripts.

There is still a _tiny_ bit of inference. For most scripts only one
of --by or --fields is strictly needed, since this makes the purpose of
the other fields unambiguous.

The reason for this change is so the scripts are a bit more reliable,
but also because this simplifies the data parsing/inference a bit.

Oh, and this also changes field inference to use the csv.DictReader's
fieldnames field instead of only inspecting the returned dicts. This
should also save a bit of O(n) overhead when parsing CSV files.
This commit is contained in:
Christopher Haster
2023-11-04 15:24:18 -05:00
parent 2be3ff57c5
commit d0a6ef0c89
12 changed files with 187 additions and 200 deletions

View File

@ -251,11 +251,15 @@ def openio(path, mode='r', buffering=-1):
def collect(csv_paths, renames=[], defines=[]):
# collect results from CSV files
fields = []
results = []
for path in csv_paths:
try:
with openio(path) as f:
reader = csv.DictReader(f, restval='')
fields.extend(
k for k in reader.fieldnames
if k not in fields)
for r in reader:
# apply any renames
if renames:
@ -274,49 +278,34 @@ def collect(csv_paths, renames=[], defines=[]):
except FileNotFoundError:
pass
return results
return fields, results
def infer(results, *,
def infer(fields_, results,
by=None,
fields=None,
types={},
ops={},
renames=[],
**_):
# if fields not specified, try to guess from data
if fields is None:
fields = co.OrderedDict()
for r in results:
for k, v in r.items():
if (by is None or k not in by) and v.strip():
types_ = []
for t in fields.get(k, TYPES.values()):
try:
t(v)
types_.append(t)
except ValueError:
pass
fields[k] = types_
fields = list(k for k, v in fields.items() if v)
# deduplicate fields
fields = list(co.OrderedDict.fromkeys(fields).keys())
# if by not specified, guess it's anything not in fields and not a
# source of a rename
defines=[]):
# if by not specified, guess it's anything not in fields/renames/defines
if by is None:
by = co.OrderedDict()
for r in results:
# also ignore None keys, these are introduced by csv.DictReader
# when header + row mismatch
by.update((k, True) for k in r.keys()
if k is not None
and k not in fields
and not any(k == old_k for _, old_k in renames))
by = list(by.keys())
by = [
k for k in fields_
if k not in (fields or [])
and not any(k == old_k for _, old_k in renames)
and not any(k == k_ for k_, _ in defines)]
# deduplicate fields
# if fields not specified, guess it's anything not in by/renames/defines
if fields is None:
fields = [
k for k in fields_
if k not in (by or [])
and not any(k == old_k for _, old_k in renames)
and not any(k == k_ for k_, _ in defines)]
# deduplicate by/fields
by = list(co.OrderedDict.fromkeys(by).keys())
fields = list(co.OrderedDict.fromkeys(fields).keys())
# find best type for all fields
types_ = {}
@ -381,10 +370,7 @@ def infer(results, *,
})
def fold(Result, results, *,
by=None,
defines=[],
**_):
def fold(Result, results, by=None, defines=[]):
if by is None:
by = Result._by
@ -634,16 +620,21 @@ def main(csv_paths, *,
ops_[new_k] = ops[old_k]
ops.update(ops_)
if by is None and fields is None:
print("error: needs --by or --fields to figure out fields")
sys.exit(-1)
# find CSV files
results = collect(csv_paths, renames=renames, defines=defines)
fields_, results = collect(csv_paths, renames, defines)
# homogenize
Result = infer(results,
Result = infer(fields_, results,
by=by,
fields=fields,
types=types,
ops=ops,
renames=renames)
renames=renames,
defines=defines)
results_ = []
for r in results:
if not any(k in r and r[k].strip()
@ -682,7 +673,7 @@ def main(csv_paths, *,
# find previous results?
if args.get('diff'):
diff_results = collect([args['diff']], renames=renames, defines=defines)
_, diff_results = collect([args['diff']], renames, defines)
diff_results_ = []
for r in diff_results:
if not any(k in r and r[k].strip()