mirror of
https://github.com/littlefs-project/littlefs.git
synced 2025-12-01 12:20:02 +00:00
Changed scripts to not infer field purposes from CSV values
Note there's a bit of subtlety here, field _types_ are still infered, but the intention of the fields, i.e. if the field contains data vs row name/other properties, must be unambiguous in the scripts. There is still a _tiny_ bit of inference. For most scripts only one of --by or --fields is strictly needed, since this makes the purpose of the other fields unambiguous. The reason for this change is so the scripts are a bit more reliable, but also because this simplifies the data parsing/inference a bit. Oh, and this also changes field inference to use the csv.DictReader's fieldnames field instead of only inspecting the returned dicts. This should also save a bit of O(n) overhead when parsing CSV files.
This commit is contained in:
@ -251,11 +251,15 @@ def openio(path, mode='r', buffering=-1):
|
||||
|
||||
def collect(csv_paths, renames=[], defines=[]):
|
||||
# collect results from CSV files
|
||||
fields = []
|
||||
results = []
|
||||
for path in csv_paths:
|
||||
try:
|
||||
with openio(path) as f:
|
||||
reader = csv.DictReader(f, restval='')
|
||||
fields.extend(
|
||||
k for k in reader.fieldnames
|
||||
if k not in fields)
|
||||
for r in reader:
|
||||
# apply any renames
|
||||
if renames:
|
||||
@ -274,49 +278,34 @@ def collect(csv_paths, renames=[], defines=[]):
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
|
||||
return results
|
||||
return fields, results
|
||||
|
||||
def infer(results, *,
|
||||
def infer(fields_, results,
|
||||
by=None,
|
||||
fields=None,
|
||||
types={},
|
||||
ops={},
|
||||
renames=[],
|
||||
**_):
|
||||
# if fields not specified, try to guess from data
|
||||
if fields is None:
|
||||
fields = co.OrderedDict()
|
||||
for r in results:
|
||||
for k, v in r.items():
|
||||
if (by is None or k not in by) and v.strip():
|
||||
types_ = []
|
||||
for t in fields.get(k, TYPES.values()):
|
||||
try:
|
||||
t(v)
|
||||
types_.append(t)
|
||||
except ValueError:
|
||||
pass
|
||||
fields[k] = types_
|
||||
fields = list(k for k, v in fields.items() if v)
|
||||
|
||||
# deduplicate fields
|
||||
fields = list(co.OrderedDict.fromkeys(fields).keys())
|
||||
|
||||
# if by not specified, guess it's anything not in fields and not a
|
||||
# source of a rename
|
||||
defines=[]):
|
||||
# if by not specified, guess it's anything not in fields/renames/defines
|
||||
if by is None:
|
||||
by = co.OrderedDict()
|
||||
for r in results:
|
||||
# also ignore None keys, these are introduced by csv.DictReader
|
||||
# when header + row mismatch
|
||||
by.update((k, True) for k in r.keys()
|
||||
if k is not None
|
||||
and k not in fields
|
||||
and not any(k == old_k for _, old_k in renames))
|
||||
by = list(by.keys())
|
||||
by = [
|
||||
k for k in fields_
|
||||
if k not in (fields or [])
|
||||
and not any(k == old_k for _, old_k in renames)
|
||||
and not any(k == k_ for k_, _ in defines)]
|
||||
|
||||
# deduplicate fields
|
||||
# if fields not specified, guess it's anything not in by/renames/defines
|
||||
if fields is None:
|
||||
fields = [
|
||||
k for k in fields_
|
||||
if k not in (by or [])
|
||||
and not any(k == old_k for _, old_k in renames)
|
||||
and not any(k == k_ for k_, _ in defines)]
|
||||
|
||||
# deduplicate by/fields
|
||||
by = list(co.OrderedDict.fromkeys(by).keys())
|
||||
fields = list(co.OrderedDict.fromkeys(fields).keys())
|
||||
|
||||
# find best type for all fields
|
||||
types_ = {}
|
||||
@ -381,10 +370,7 @@ def infer(results, *,
|
||||
})
|
||||
|
||||
|
||||
def fold(Result, results, *,
|
||||
by=None,
|
||||
defines=[],
|
||||
**_):
|
||||
def fold(Result, results, by=None, defines=[]):
|
||||
if by is None:
|
||||
by = Result._by
|
||||
|
||||
@ -634,16 +620,21 @@ def main(csv_paths, *,
|
||||
ops_[new_k] = ops[old_k]
|
||||
ops.update(ops_)
|
||||
|
||||
if by is None and fields is None:
|
||||
print("error: needs --by or --fields to figure out fields")
|
||||
sys.exit(-1)
|
||||
|
||||
# find CSV files
|
||||
results = collect(csv_paths, renames=renames, defines=defines)
|
||||
fields_, results = collect(csv_paths, renames, defines)
|
||||
|
||||
# homogenize
|
||||
Result = infer(results,
|
||||
Result = infer(fields_, results,
|
||||
by=by,
|
||||
fields=fields,
|
||||
types=types,
|
||||
ops=ops,
|
||||
renames=renames)
|
||||
renames=renames,
|
||||
defines=defines)
|
||||
results_ = []
|
||||
for r in results:
|
||||
if not any(k in r and r[k].strip()
|
||||
@ -682,7 +673,7 @@ def main(csv_paths, *,
|
||||
|
||||
# find previous results?
|
||||
if args.get('diff'):
|
||||
diff_results = collect([args['diff']], renames=renames, defines=defines)
|
||||
_, diff_results = collect([args['diff']], renames, defines)
|
||||
diff_results_ = []
|
||||
for r in diff_results:
|
||||
if not any(k in r and r[k].strip()
|
||||
|
||||
Reference in New Issue
Block a user