Python
pandas replace nan by ""
df = df.fillna("") # replace all NaNs with "" in the whole dataframe
modify clipboard: for example transform all lines on the clipboard (this example lowercases them)
import pyperclip
text = pyperclip.paste()
updated_text = "\n".join([line.lower() for line in text.splitlines()])
pyperclip.copy(updated_text)
csv.writer to write a LUT (lookup table)
images_by_vh_id = {}
for row in rows: # 'rows' assumed: each row holds a list of image dicts
    for img in row["images"]:
        images_by_vh_id[img["image_id_vh"]] = img["image_name"]
with open("data/images_by_vh_id.csv", mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['image_id_vh', 'image_name'])
    writer.writerows(images_by_vh_id.items())
get inventarisnummer from filename
match = re.search(r'(?<=NL-UtHUA_)(.*)(?=_.*)', filename)
if match:
    nummer = match.group(1)
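# e.g. a (hypothetical) filename "NL-UtHUA_702-123_0001.jpg" yields nummer = "702-123" (greedy: up to the last underscore)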
csv.DictReader from URL
with urllib.request.urlopen(url) as response:
    return csv.DictReader(response.read().decode('utf-8').splitlines())
csv.DictWriter (csv.writer)
with open("output.csv", mode='w', newline='') as file:
writer = csv.DictWriter(file, fieldnames=["x", "y", "w", "h", "conf", "text"])
writer.writeheader()
writer.writerows(data)
download crops with iiif
import csv,os
import urllib
from urllib.request import urlretrieve
from tqdm import tqdm
aktes_per_scan = {row["nummer"]:int(row["aantal aktes"]) for row in csv.DictReader(open("data/aantal-aktes-op-scan-per-inv.csv"),delimiter=",") }
iiif = "https://files.transkribus.eu/iiif/2/"
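# note: the IIIF Image API regions below use pct:x,y,w,h (percentages of the full image); parts outside the image are cropped off by the server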
def download(url, nummer, filename, suffix):
    folder = f"scans/{nummer}"
    filepath = folder + "/" + filename.replace(".jpg", f"_{suffix}.jpg")
    os.makedirs(folder, exist_ok=True)
    if not os.path.exists(filepath):
        print(url, filepath)
        urllib.request.urlretrieve(url, filepath)
    else:
        print(f"exists: {url}")
for row in tqdm(list(csv.DictReader(open("data/alle-scans-met-x-voor-overslaan.csv"), delimiter=",")), leave=True):
    id = row["iiif id"]
    nummer = row["nummer"]
    filename = row["filename"]
    aantal = aktes_per_scan[nummer]
    skip = row["overslaan"] != ""
    if skip or aantal == 0:
        print(f"skip: {iiif}{id}/full/1600,/0/default.jpg")
        continue
    if aantal == 4:
        download(f"{iiif}{id}/pct:0,0,50,50/1000,/0/default.jpg", nummer, filename, 1)
        download(f"{iiif}{id}/pct:0,50,50,50/1000,/0/default.jpg", nummer, filename, 2)
        download(f"{iiif}{id}/pct:50,0,100,50/1000,/0/default.jpg", nummer, filename, 3)
        download(f"{iiif}{id}/pct:50,50,100,50/1000,/0/default.jpg", nummer, filename, 4)
    if aantal == 6:
        download(f"{iiif}{id}/pct:0,3,50,33/1000,/0/default.jpg", nummer, filename, 1)
        download(f"{iiif}{id}/pct:0,33,50,33/1000,/0/default.jpg", nummer, filename, 2)
        download(f"{iiif}{id}/pct:0,66,50,33/1000,/0/default.jpg", nummer, filename, 3)
        download(f"{iiif}{id}/pct:50,3,50,33/1000,/0/default.jpg", nummer, filename, 4)
        download(f"{iiif}{id}/pct:50,33,50,33/1000,/0/default.jpg", nummer, filename, 5)
        download(f"{iiif}{id}/pct:50,66,50,33/1000,/0/default.jpg", nummer, filename, 6)
    tqdm.write("")
nice example of argparse
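a minimal sketch (argument names made up):
#!/usr/bin/env python3
import argparse
parser = argparse.ArgumentParser(description="example")
parser.add_argument("--csv", help="input csv file", required=True) # required named argument
parser.add_argument("--limit", type=int, default=10, help="maximum number of rows")
args = parser.parse_args()
print(args.csv, args.limit)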
mkdir / makedirs
os.makedirs(output_dir, exist_ok=True)
create query for a lookup table from >1000 GUIDs with many OR statements
print ("select guid,id from A where 1=0")
for row in csv.DictReader(open("data/spreadsheet.csv",encoding="utf-8-sig"),delimiter=";"):
guid = row['Guid']
print(f"OR guid='{guid}'")
print(";")
update query from CSV
import csv
with open("result.sql","w") as output_file:
for row in csv.DictReader(open("input.csv",encoding="utf-8-sig"),delimiter=";"):
id = row['id']
beginjaar = row['Beginjaar']
eindjaar = row['Eindjaar']
openbaar_vanaf = row['Openbaar vanaf']
sql=f"update ... set beginjaar='{beginjaar}', eindjaar='{eindjaar}', openbaar_vanaf='{openbaar_vanaf}' where id={id};"
print(sql,file=output_file)
update query with subselect from excel sheet
import pandas as pd
data = pd.read_excel('Regesten toegang 218-1.xlsx')
for index, row in data.iterrows():
    guid = row['guid']
    waarde = row['Datering (oorspr)'].replace("'", "''")
    sql = f"""
    UPDATE ... awe
    SET awe.WAARDE = '{waarde}'
    WHERE awe.fvd_id = 3859 AND awe.ahd_id IN (
        SELECT reg.id
        FROM ... reg
        WHERE reg.guid = '{guid}'
    );
    """
    print(sql)
Error: WARNING: pip is configured with locations that require TLS/SSL, however the ssl module in Python is not available.
brew --prefix openssl@1.1
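if the ssl module is missing, reinstalling Python after installing OpenSSL usually fixes it (an assumption; depends on how Python was installed):
brew reinstall python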
repeat items in list
colors = [akte_color]*3 + [overledene_color]*5
#result: ['#f8e5d0', '#f8e5d0', '#f8e5d0', '#d2e2f1', '#d2e2f1', '#d2e2f1', '#d2e2f1', '#d2e2f1']
write unique values from one column to a textfile
print("\n".join(sorted(set(item["NTNI"] for item in items))), file=open("data/ntni.txt","w"))
use Flask to render a CSV as a table using a template
import csv
from flask import Flask, render_template
app = Flask(__name__,template_folder="./")
def process_csv(csv_file):
    with open(csv_file, newline='', encoding='utf-8') as csvfile:
        return list(csv.DictReader(csvfile))
@app.route('/')
def index():
    csv_file = 'input.csv' # Replace with your CSV file path
    data = process_csv(csv_file)
    return render_template('template.html', data=data)
if __name__ == '__main__':
    app.run(debug=True)
csv to excel using pandas with filter and groupby
import pandas as pd
df = pd.read_csv('input.csv')
df = df[~df['flexveld'].isin(['Aangemaakt door', 'Aangemaakt op', 'Gewijzigd door', 'Gewijzigd op'])]
for name, group in df.groupby('aet_omschrijving'):
    writer = pd.ExcelWriter(f'output/{name}.xlsx', engine='xlsxwriter')
    group.to_excel(writer, index=False, sheet_name='Sheet1')
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    worksheet.set_column('A:B', 30)
    (max_row, max_col) = group.shape # use the group, not the full df
    column_settings = [{'header': column} for column in group.columns]
    worksheet.add_table(0, 0, max_row, max_col - 1, {'columns': column_settings})
    writer.close()
merge two CSVs
I have two CSVs with the following headers:
1) org_row,filename,straatnaam,afbeelding,oude straatnaam 2) "uuid","name","width","height","link"
I want a new CSV with the fields below, where the key columns 'filename' and 'name' can be used to match: 2) "uuid","name","width","height","link","straatnaam"
response from ChatGPT 3.5 — correct on the first try!
import csv
def merge_csv(csv1_path, csv2_path, output_path):
    data1 = {}
    with open(csv1_path, 'r') as csv1_file:
        reader1 = csv.DictReader(csv1_file)
        for row in reader1:
            data1[row['filename']] = row['straatnaam']
    with open(output_path, 'w', newline='') as output_file:
        fieldnames = ["uuid", "name", "width", "height", "link", "straatnaam"]
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        with open(csv2_path, 'r') as csv2_file:
            reader2 = csv.DictReader(csv2_file)
            for row in reader2:
                street_name = data1.get(row['name'], '')
                row['straatnaam'] = street_name
                writer.writerow(row)
if __name__ == "__main__":
    csv1_path = "bestand1.csv"
    csv2_path = "bestand2.csv"
    output_path = "samengevoegd.csv"
    merge_csv(csv1_path, csv2_path, output_path)
format turtle file (or convert any rdf to formatted turtle)
from rdflib import Graph
g = Graph()
g.parse("INPUT.ttl")
print(g.serialize(format='ttl',encoding="utf-8").decode(), file=open("OUTPUT.ttl","w"))
upload file via requests
import os, requests
def upload(url, token, filename):
    headers = { "Authorization": f"Bearer {token}" }
    data = { "mergeGraphs": "true" }
    files = { "file": (os.path.basename(filename), open(filename, 'rb'), 'application/zip') }
    response = requests.post(url, headers=headers, data=data, files=files)
    return response.json()
streaming subprocess data through flask
https://stackoverflow.com/a/15354403/6068235
import flask
import subprocess
app = flask.Flask(__name__)
@app.route('/yield')
def index():
    def inner():
        proc = subprocess.Popen(
            ['sh upload.sh'], # call something with a lot of output so we can see it
            shell=True,
            stdout=subprocess.PIPE
        )
        for line in iter(proc.stdout.readline, b''): # sentinel must be bytes: stdout is a byte stream
            yield f"{line.decode('utf8')}<br/>\n"
    return flask.Response(inner(), mimetype='text/html') # text/html is required for most browsers to show the streamed output
app.run(debug=True, port=8080, host='0.0.0.0')
spacy
import spacy
text = open("X.txt").read()
nlp = spacy.load("nl_core_news_sm")
doc = nlp(text)
# print(doc.text)
for token in doc:
    print(token.text, token.pos_, token.dep_)
IP address
import socket
hostname = socket.gethostname()
ip_adres = socket.gethostbyname(hostname)
filename without directory and without extension
os.path.splitext(os.path.basename(csv_path))[0]
flatten folder structure for jpg's and PageXML's:
#!/usr/bin/env python3
from pathlib import Path
import os,sys,re,shutil
input_base = "/Users/rick/Loghi (vervolg)/Resultaat fase 2/fase-2_output/"
output_base = "output/"
# toegang_inv_base = "702.2244" # extra submap gemaakt voor alle deelbeschrijvingen
toegang_inv_base = "702.121" # extra submap gemaakt voor alle deelbeschrijvingen
for img_path in Path(input_base + toegang_inv_base).rglob("*.jpg"):
img_folder = os.path.dirname(img_path)
img_filename = os.path.basename(img_path)
toegang_inv = os.path.basename(img_folder)
page_path = img_folder + "/page/" + img_filename.replace(".jpg",".xml")
result = re.split(r'\.', toegang_inv, maxsplit=1) # let op maxsplit
toegang, inv = result
#JPGs
Path(output_base + toegang_inv_base).mkdir(parents=True, exist_ok=True)
shutil.copy2(img_path, output_base + toegang_inv_base)
#pageXMLs
Path(output_base + toegang_inv_base + "/page/" ).mkdir(parents=True, exist_ok=True)
shutil.copy2(page_path, output_base + toegang_inv_base + "/page/")
read PageXML without metadata using regex
def get_pagexml_without_metadata(xml_file):
    return re.sub("<Metadata>.*?</Metadata>", "", open(xml_file).read(), flags=re.DOTALL)
tip from Stan
NLTK - Natural Language Toolkit
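a minimal sketch (not from the original tip; needs a one-time download of the punkt tokenizer data):
import nltk
nltk.download('punkt') # one-time
from nltk.tokenize import word_tokenize
print(word_tokenize("Dit is een zin.", language='dutch'))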
SMF SimpleMachines Forum Export / Backup to JSON
import mysql.connector, json
def get_column_names(db_name, table_name):
    query = f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}'"
    cursor.execute(query)
    return [row["COLUMN_NAME"] for row in cursor]
db_name = 'your_db'
db_user = 'root'
db_pass = ''
conn = mysql.connector.connect(user=db_user, password=db_pass, host='127.0.0.1', database=db_name)
cursor = conn.cursor(dictionary=True)
board_columns = get_column_names(db_name,'smf_boards')
topic_columns = get_column_names(db_name,'smf_topics')
message_columns = get_column_names(db_name,'smf_messages')
cursor.execute("""
SELECT * FROM smf_messages m
JOIN smf_topics t ON t.id_topic = m.id_topic
JOIN smf_boards b ON b.id_board = t.id_board
ORDER BY b.id_board, t.id_topic, m.id_msg
""")
structure = {}
for row in cursor:
    board_id = row['id_board']
    topic_id = row['id_topic']
    # build board data
    board_data = {col: row[col] for col in board_columns}
    # build topic data
    topic_data = {col: row[col] for col in topic_columns}
    # add the board if it does not exist yet
    if board_id not in structure:
        structure[board_id] = {'board_data': board_data, 'topics': {}}
    # add the topic if it does not exist yet
    if topic_id not in structure[board_id]['topics']:
        structure[board_id]['topics'][topic_id] = {'topic_data': topic_data, 'messages': []}
    # build message data and add it to the topic
    message_data = {col: row[col] for col in message_columns}
    structure[board_id]['topics'][topic_id]['messages'].append(message_data)
cursor.close()
conn.close()
json.dump(structure, open("result.json","w"), indent=2, ensure_ascii=False)
lookup hierarchy from id's in CSV file and output new CSV
#!/usr/bin/env python3
import csv
items = list(csv.DictReader(open("beschrijvingen-met-parent-guids.csv")))
items_by_guid = {}
# build lut
for row in items:
    row["parents"] = row["parents"].split(" ")
    items_by_guid[row["GUID"]] = row
# solve refs
for item in items_by_guid.values():
    item["parents"] = [items_by_guid[p_guid] for p_guid in item["parents"] if not item == items_by_guid[p_guid]]
# if an introduction (inl) is found directly under a section (rub),
# append that introduction to the section description
for item in items:
    if item["aet"] == "inl" and item["parents"][-1]["aet"] == "rub":
        item["parents"][-1]["omschr"] += f"\n{item['omschr']}"
# clean up
for item in items:
    item["omschr"] = item["omschr"].replace(">tr.", ">getrouwd met: ")
    item["omschr"] = item["omschr"].replace("<BR>", "\n")
header = ["guid","text"]
writer = csv.DictWriter(open("items-with-context.csv","w",encoding="utf-8"), fieldnames=header, delimiter=';', quoting=csv.QUOTE_ALL, dialect='excel')
writer.writeheader()
for item in items:
    writer.writerow({
        "guid": item["GUID"],
        "text": "\n".join([p["omschr"] for p in item["parents"]] + [item["omschr"]])
    })
Count entities (in many json-files) grouped by entity_type
#!/usr/bin/env python3
import json
from pathlib import Path
from collections import defaultdict
entities_by_type = {}
for filename in Path("data/").rglob("*.json"):
    try:
        data = json.load(open(filename))
        for item in data["gpt-result"]:
            for value in data["gpt-result"][item]:
                value = str(value).strip()
                if not item in entities_by_type:
                    entities_by_type[item] = defaultdict(int)
                entities_by_type[item][value] += 1
    except Exception as e:
        print(e, filename)
# sort by count within entity_type
for t in entities_by_type:
    entities_by_type[t] = dict(sorted(entities_by_type[t].items(), key=lambda x: x[1], reverse=True))
json.dump(entities_by_type, open("entities.json","w"),indent=2,ensure_ascii=False)
Call GPT-4 API for Named Entity Recognition (NER)
#!/usr/bin/env python3
import json,csv,os,openai,sys
openai.api_key = os.getenv("OPENAI_API_KEY")
for row in list(csv.DictReader(open("beschrijvingen-platte-lijst.csv")))[0:1000]:
    filename = "data/" + row["GUID"] + ".json"
    if os.path.exists(filename):
        continue
    response = openai.ChatCompletion.create(
        # model="gpt-3.5-turbo",
        model="gpt-4-1106-preview",
        response_format={ "type": "json_object" },
        messages=[ {"role": "system", "content": "You can recognize named-entities and return these in JSON as strings grouped by entity-type: { 'persons':[], 'organisations':[], 'locations':[], 'topics':[], 'events':[], 'dates':[] }"},
                   {"role": "user", "content": row["context"] } ]
    )
    try:
        if response:
            response["text"] = json.loads(response["choices"][0]["message"]["content"])
    except Exception as e:
        print(e, response)
    try:
        json.dump({"item": row, "gpt-result": response["text"]}, open(filename, "w"), indent=2, ensure_ascii=False)
        print(filename, response["text"])
    except Exception as e:
        print(e, row["GUID"], response)
        print(filename)
escape URI part
def escape_URI_part(s):
    # regexes ported from a spreadsheet's SUBSTITUTE formulas
    s = re.sub(r"[_–’+?&=|,\.() \"$/']", "-", s) # replace various characters with a dash
    s = re.sub(r"-+", "-", s) # collapse multiple dashes into one
    s = re.sub(r"[^a-zA-Z0-9\-]", "", s) # strip anything that is not alphanumeric or a dash
    s = re.sub(r"^-|-$", "", s) # strip leading or trailing dashes
    if len(s) == 0:
        # raise ValueError("makeSafeURIPart results in empty string")
        # log.warning("makeSafeURIPart results in empty string")
        # fall back to a generated name instead
        import random
        s = "unknown" + str(random.randint(10000, 99999))
    return s.lower()
pretty print
from pprint import pprint
pprint(data)
get all pages
from bs4 import BeautifulSoup
import requests
url = "https://www.website.org/topics/"
while url:
    print(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    url = soup.find("link", rel="next")
    if url:
        url = url.get("href")
get item by id+value in json array
find item with code=="FT"
item = next((item for item in data if item.get("code") == "FT"), None)
as function:
def get_item(items, key, value):
    return next((item for item in items if item.get(key) == value), None)
find multiple values:
def get_items(items, key, values):
    return [next((item for item in items if item.get(key) == value), None) for value in values]
merge list with z_values with 2D nparray
z_values = [img[int(y), int(x), 2] for x, y in points]
points = np.column_stack((points[:, 0], points[:, 1], z_values))
shorter:
points = np.column_stack((points, img[points[:, 1].astype(int), points[:, 0].astype(int), 2]))
merge files from 2 folders to an output folder
#!/usr/bin/env python3
from pathlib import Path
import os,shutil
alto_folder = "XX/input/"
alto_files_by_filename = {os.path.basename(path): path for path in Path(alto_folder).rglob("*.xml")}
image_filepaths = list(Path(f"1001/").rglob("*.jpg"))
for image_path in image_filepaths:
    # recreate folder structure from images
    image_folder = os.path.dirname(image_path)
    image_filename = os.path.basename(image_path)
    dst_folder = Path("output/" + image_folder)
    dst_folder.mkdir(parents=True, exist_ok=True)
    dst_image_path = os.path.join(dst_folder, image_filename)
    # copy images
    if not os.path.exists(dst_image_path):
        print("copy", image_path, "to", dst_image_path)
        shutil.copy2(image_path, dst_folder)
    # copy and rename alto files
    alto_filename = image_filename.replace(".jpg", ".xml")
    alto_filepath = alto_files_by_filename.get(alto_filename)
    if alto_filepath: # skip non-existing alto files
        new_alto_filename = alto_filename.replace(".xml", "_alto.xml")
        dst_alto_filepath = os.path.join(dst_folder, new_alto_filename)
        shutil.copy2(alto_filepath, dst_alto_filepath)
store filepaths (recursively) in a dict by filename
alto_files_by_filename = {os.path.basename(path): path for path in Path(alto_folder).rglob("*.xml")}
build a lookup table
lut_technieken = { item["INHOUD"]:item["ID"] for item in csv.DictReader(open("con_technieken.csv")) }
image areas in html to json
from bs4 import BeautifulSoup
import json
with open("index.html", "r", encoding="utf-8") as file:
html = file.read()
soup = BeautifulSoup(html, 'html.parser')
area_tags = soup.find_all('area')
json_output = []
for area in area_tags:
area_dict = {}
area_dict['TITLE'] = area.get('title')
area_dict['ID'] = area.get('id')
area_dict['REL'] = area.get('rel')
area_dict['HREF'] = area.get('href')
area_dict['DATA_X_ABS'] = area.get('data-x-abs')
area_dict['DATA_Y_ABS'] = area.get('data-y-abs')
area_dict['TARGET'] = area.get('target')
area_dict['CLASS'] = area.get('class')
area_dict['SHAPE'] = area.get('shape')
coords = area.get('coords')
if coords:
coords_list = list(map(int, coords.split(',')))
area_dict['COORDS'] = coords_list
json_output.append(area_dict)
json_str = json.dumps(json_output, indent=4)
print(json_str)
build exe file
pip install pyinstaller
pyinstaller --add-data 'data;data' .\YourScript.py # with data folder
random integer
random.randint(1, 10)
draw centered fullscreen chessboard
import cv2
import numpy as np
image_width = 3840
image_height = 2400
img = np.zeros((image_height, image_width, 3), dtype=np.uint8)
square_size = image_height // 8
for y in range(8):
    for x in range(8):
        if (x + y) % 2 == 0:
            x1 = (image_width//2 - image_height//2) + x*square_size
            y1 = y*square_size
            x2 = x1 + square_size
            y2 = y1 + square_size
            img[y1:y2, x1:x2] = (255,255,255)
cv2.namedWindow('img', cv2.WINDOW_NORMAL)
cv2.setWindowProperty('img', cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)
cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()
opencv - mask, crop, channels, stretch, colorize with lut
import cv2
import numpy as np
palette_image = cv2.imread('palette256.png')
palette_lut = palette_image[0,:,:]
frame = cv2.imread("frame.png")
height,width = frame.shape[:2]
(cx,cy),r = (321, 279),226 # for frame.png
names="red","green","blue",
g=[202,129,185] # range start per channel
a,b,c = [None]*3, [None]*3, [None]*3 # create 3 empty lists with room for 3 items
def stretch(img, minv, maxv):
    alpha = 255/(maxv-minv)
    return cv2.convertScaleAbs(img, alpha=alpha, beta=-minv*alpha)
while True:
    mask = np.zeros((height, width), dtype=np.uint8)
    cv2.circle(mask, (cx,cy), r, 255, -1)
    masked = cv2.bitwise_and(frame, frame, mask=mask)
    cropped = masked[cy-r:cy+r, cx-r:cx+r]
    h,w,_ = cropped.shape
    for i in range(3): # 3 channels
        a[i] = cropped[:,:,i]
        b[i] = stretch(a[i], g[i], 255)
        c[i] = palette_lut[b[i].astype(int)]
        cv2.imshow(f"{names[i]}", a[i]); cv2.moveWindow(f"{names[i]}", i*w, 0)
        cv2.imshow(f"{names[i]}_stretched", b[i]); cv2.moveWindow(f"{names[i]}_stretched", i*w, h)
        cv2.imshow(f"{names[i]}_colored", c[i]); cv2.moveWindow(f"{names[i]}_colored", i*w, 2*h)
    key = cv2.waitKey(1) & 0xFF
    if key == 27:
        break
    elif key == ord('-'):
        r -= 1
    elif key == ord('='):
        r += 1
    elif key == ord('x'):
        cx += 1
    elif key == ord('X'):
        cx -= 1
    elif key == ord('y'):
        cy += 1
    elif key == ord('Y'):
        cy -= 1
    elif key == ord('r'):
        g[0] += 1
    elif key == ord('R'):
        g[0] -= 1
    elif key == ord('g'):
        g[1] += 1
    elif key == ord('G'):
        g[1] -= 1
    elif key == ord('b'):
        g[2] += 1
    elif key == ord('B'):
        g[2] -= 1
    elif key == ord(' '):
        for i in range(3):
            print(f"{names[i]}={g[i]}")
        print("center", (cx,cy), "radius", r)
cv2.destroyAllWindows()
various pip install libraries
pip install opencv-python
jinja2
#!/usr/bin/env python3
import json,re,sys
from jinja2 import Environment, FileSystemLoader
env = Environment(loader=FileSystemLoader("templates"))
def ref(id):
    return items_by_id[id]
def guid_from_id(id):
    if not id:
        return ""
    return id[4:]
def dump(item):
    if not item:
        return ""
    return "#" + json.dumps(item, indent=2).replace("\n", "\n#")
def escape_key_name(s): # for jinja2/Liquid templates
    s = re.sub(r"@", "", s) # @id @type etc
    s = re.sub(r".*:", "", s) # remove prefix:
    return s
def escape_key_names(item):
    try:
        result = { escape_key_name(x): v for x, v in item.items() } # rename keys for use in the template
    except Exception as e:
        print(item)
        sys.exit(1)
    return result
env.filters['guid'] = guid_from_id
env.filters['ref'] = ref
env.filters['dump'] = dump
data = json.load(open("data.json"))
items = data["@graph"][:4]
items = [escape_key_names(item) for item in items]
items_by_id = { item["id"]:item for item in items }
for item in items:
    # print(item)
    tmpl = env.get_template(item["type"].replace("aet:","") + ".ttl")
    print(tmpl.render(**item))
copy 1000 random files from a csv
#!/usr/bin/env python3
import shutil,os
import pandas as pd
df = pd.read_csv("../test-omgekeerde-kaartjes/output.csv")
# filter out all cards we are not sure are correct
df = df[df['verkeerd_om'] == 'False']
# take a random sample of 1000 items
# always the same sample thanks to 'random_state=42':
# this way the output folder does not have to be emptied first
# if you do want that, use: shutil.rmtree("output/")
random_items = df.sample(n=1000, random_state=42)
# copy 1000 previously cropped files ('crops/boven') to the output/ folder
for index, item in random_items.iterrows():
    input_path = item["filename"].replace("../", "../crops/boven/")
    rel_path = item["filename"].replace("../", "")
    output_folder = "output/" + os.path.dirname(rel_path)
    basename = os.path.basename(rel_path)
    # create the folder if needed
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    # copy if needed
    output_file_path = os.path.join(output_folder, basename)
    print(input_path, " >>> ", output_folder)
    if not os.path.exists(output_file_path):
        shutil.copy2(input_path, output_file_path)
incremental update of sqlite database from CSV file
#!/usr/bin/env python3
import csv,sqlite3
from tqdm import tqdm
import argparse
parser=argparse.ArgumentParser()
parser.add_argument("--csv", help="input csv file with format: ID,GUID", required=True)
args=parser.parse_args()
csv_file = args.csv
db_file = 'database.db'
table_name = 'guids'
conn = sqlite3.connect(db_file)
cursor = conn.cursor()
cursor.execute(f"CREATE TABLE IF NOT EXISTS {table_name} (ID INTEGER, GUID TEXT)")
cursor.execute(f"CREATE INDEX IF NOT EXISTS idx_id ON {table_name} (ID)")
cursor.execute(f"CREATE INDEX IF NOT EXISTS idx_guid ON {table_name} (GUID)")
num_rows = sum(1 for _ in open(csv_file)) - 1 # minus the header row
with open(csv_file, 'r') as file:
    csv_data = csv.reader(file)
    next(csv_data) # skip header
    for i, row in tqdm(enumerate(csv_data, start=1), total=num_rows):
        cursor.execute(f"REPLACE INTO {table_name} (ID, GUID) VALUES (?, ?)", (int(row[0]), row[1]))
conn.commit()
conn.close()
read csv with DictReader
rows = list(csv.DictReader(open("urls-lijst-van-picturae-verrijkt-met-sql.csv"), delimiter=","))
write (filtered) rows to Excel with pandas
#write result as Excel
excel_writer = pd.ExcelWriter("probleem-bestandskoppelingen.xlsx", engine='xlsxwriter')
df = pd.DataFrame(results)
df = df[df['info'] != 'ok'] # filter out everything with 'ok' in the 'info' column
df.to_excel(excel_writer, sheet_name="Sheet1", startrow=1, header=False, index=False)
(max_row, max_col) = df.shape
sheet = excel_writer.sheets['Sheet1']
[sheet.set_column(i,i,w) for i,w in enumerate([20,10,50,10,12,50,10,10,50])] # set column widths
sheet.add_table(0, 0, max_row, max_col-1, {'columns': [{'header': col} for col in df.columns]})
excel_writer.close()
copy file and rebuild structure
def check(row): # check/process a single file
    rel_path = re.sub(r"X:[\\/]yyy[\\/]", "", row["path"]).replace("\\", "/")
    abs_path = "/Volumes/xxx$/yyy/" + rel_path
    file_exists = os.path.exists(abs_path) and os.path.isfile(abs_path)
    if file_exists:
        # copy the file locally and create directories as needed
        output_folder = "output/" + os.path.dirname(rel_path)
        basename = os.path.basename(rel_path)
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        output_file_path = os.path.join(output_folder, basename)
        if not os.path.exists(output_file_path):
            shutil.copy2(abs_path, output_file_path)
parallel
from joblib import Parallel, delayed
def check(row):
    return row
results = Parallel(n_jobs=-1)(delayed(check)(row) for row in tqdm(rows))
print(results)
image check
from PIL import Image
# do image check
w = h = ""
if not file_exists:
image_info = "bestand niet gevonden"
elif os.path.getsize(abs_path)==0:
image_info = "bestand is leeg"
else:
try:
im = Image.open(output_file_path)
im.verify() #I perform also verify, don't know if he sees other types o defects
im.close() #reload is necessary in my case
im = Image.open(output_file_path)
im.transpose(Image.FLIP_LEFT_RIGHT)
im.close()
image_info = "ok"
w, h = im.size
except Exception as e:
image_info = f"fout in afbeelding: {e}"
passass
crop images
#!/usr/bin/env python3
import sys,os
from tqdm import tqdm
from pathlib import Path
from PIL import Image
folder_names = tqdm(["a","b","c"])
for folder in folder_names:
    folder_names.set_description(f"{folder}")
    images = sorted(list(Path(f"../xx/{folder}/").rglob("*.jpg")))
    file_names = tqdm(images, leave=False)
    for image in file_names:
        filename = str(image)
        file_names.set_description(f"{filename}")
        img = Image.open(filename)
        w, h = img.size
        for naam, y1, y2 in [("boven", 0, 1500), ("onder", 1500, h)]:
            output_filename = filename.replace("..", naam)
            output_folder = os.path.dirname(output_filename)
            if not os.path.exists(output_filename):
                if y2 > y1:
                    crop = img.crop((0, y1, w, y2))
                else:
                    crop = img
                os.makedirs(output_folder, exist_ok=True)
                crop.save(output_filename)
load json
with open(input_file_path, "r") as file:
    data = json.load(file)
ocr on Mac
pip3 install pyobjc-framework-Vision
pip3 install pyobjc-framework-Quartz
pip3 install wurlitzer
no subprocess (not parallel):
import Quartz,Vision,json,csv
from Cocoa import NSURL
from Foundation import NSDictionary
from wurlitzer import pipes # needed to capture system-level stderr
def ocr(image_filename):
    input_url = NSURL.fileURLWithPath_(image_filename)
    with pipes() as (out, err): # capture system-level stderr noise
        input_image = Quartz.CIImage.imageWithContentsOfURL_(input_url)
    (width, height) = input_image.extent().size
    vision_options = NSDictionary.dictionaryWithDictionary_({})
    vision_handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_options_(
        input_image, vision_options
    )
    request = Vision.VNRecognizeTextRequest.alloc().init().autorelease()
    request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate) # VNRequestTextRecognitionLevelFast
    request.setRecognitionLanguages_(["nl-NL"])
    error = vision_handler.performRequests_error_([request], None)
    results = []
    for item in request.results():
        bbox = item.boundingBox()
        w, h = bbox.size.width, bbox.size.height
        x, y = bbox.origin.x, bbox.origin.y
        results.append({
            "x": int(x*width),
            "y": int(height - y*height - h*height),
            "w": int(w*width),
            "h": int(h*height),
            "conf": item.confidence(),
            "text": item.text()
        })
    return results
in parallel:
#!/Applications/Xcode.app/Contents/Developer/usr/bin/python3
import config,json,os,subprocess,ocr2json
from tqdm import tqdm
from joblib import Parallel, delayed
import Quartz,Vision
from Cocoa import NSURL
from Foundation import NSDictionary
from wurlitzer import pipes # needed to capture system-level stderr
def ocr(image_filename):
    input_url = NSURL.fileURLWithPath_(image_filename)
    with pipes() as (out, err): # capture system-level stderr noise
        input_image = Quartz.CIImage.imageWithContentsOfURL_(input_url)
    (width, height) = input_image.extent().size
    vision_options = NSDictionary.dictionaryWithDictionary_({})
    vision_handler = Vision.VNImageRequestHandler.alloc().initWithCIImage_options_(
        input_image, vision_options
    )
    request = Vision.VNRecognizeTextRequest.alloc().init().autorelease()
    request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate) # VNRequestTextRecognitionLevelFast
    request.setRecognitionLanguages_(["nl-NL"])
    error = vision_handler.performRequests_error_([request], None)
    results = []
    for item in request.results():
        bbox = item.boundingBox()
        w, h = bbox.size.width, bbox.size.height
        x, y = bbox.origin.x, bbox.origin.y
        results.append({
            "x": int(x*width),
            "y": int(height - y*height - h*height),
            "w": int(w*width),
            "h": int(h*height),
            "conf": item.confidence(),
            "text": item.text()
        })
    return results
def do_ocr(image_file_path, progress):
    file_id = config.get_id_from_path(image_file_path)
    json_file_path = config.make_path(base_folder=config.DEEDS_OCR_JSON_FOLDER, id=file_id, suffix=".json")
    if not os.path.exists(json_file_path):
        data = ocr(str(image_file_path))
        if data and len(data) > 0:
            json.dump(data, open(json_file_path, "w"), indent=2)
            print(json_file_path, str(int(progress*1000)/10.) + "%")
def run():
    print("run ocr_per_akte")
    if not os.path.exists(config.DEEDS_OCR_JSON_FOLDER):
        os.mkdir(config.DEEDS_OCR_JSON_FOLDER)
    file_paths = config.get_deeds_image_file_paths()
    results = Parallel(n_jobs=1, prefer="threads")(
        delayed(do_ocr)(image_file_path, i/len(file_paths))
        for i, image_file_path in enumerate(file_paths)
    )
if __name__ == '__main__':
    run()
show all sqlite tables
res = cursor.execute("SELECT name FROM sqlite_master")
result = res.fetchall() # fetchone() would return only the first table
print(result)
parse number sequence
#40630837,"1939","165-172 : ill."
#40630838,"1996","4-8 : ill., plgr., tek."
#40630839,"1996","10-13 : portr."
#40630843,"1996","18"
#40631828,"1969","41-49"
#40631829,"1969","51-60"
#40631836,"1969","99-119"
#40631846,"1970","68-70"
#40631847,"1970","93-102 : ill."
#40632861,"1931","38-40"
#40632864,"1931","42-44; 57-59"
#40633811,"1943","1-3"
#40633813,"1943","4-7"
#40633814,"1943","9-13; 18"
import csv,re,sys
def get_number_list(last_column):
    numbers = []
    last_column = last_column.replace(", ", "; ") # , -> ;
    last_column = last_column.replace(": ", "; ") # : -> ;
    for range_str in last_column.split(';'): # multiple numbers/sequences separated by ;
        range_str = range_str.strip() # trim
        range_str = range_str.replace("[", "") # [num] means a manually determined page number
        range_str = range_str.replace("]", "")
        if '-' in range_str: # sequence
            m = re.findall(r"(\d+)\s*-+\s*(\d+)", range_str)
            if m and len(m) == 1:
                m = m[0]
                if len(m) == 2:
                    start = int(m[0])
                    end = int(m[1])
                    numbers.extend(range(start, end + 1))
                else:
                    print("WARNING 1", last_column)
            else:
                print("WARNING 2", last_column)
        else:
            range_str = re.sub(r" .*", "", range_str) # remove everything after the first space (CHECKME!)
            if range_str.isdigit(): # single number
                numbers.append(int(range_str))
            else:
                print("WARNING 3", last_column, "CURRENT PART:", range_str)
    return numbers
with open('artikelen.csv', 'r') as file:
    reader = csv.reader(file)
    next(reader) # skip header
    for row in reader:
        last_column = row[-1]
        try:
            numbers = get_number_list(last_column)
        except Exception as e:
            print(e, row)
            continue
        print(row, numbers)
copy all files from textfile to folder without hierarchy / folder structure
import os
import shutil
from tqdm import tqdm
def copy_files_to_destination(file_path, destination):
    with open(file_path, 'r') as file:
        files = list(file.readlines())
    for line in tqdm(files):
        absolute_path = line.strip()
        if os.path.isfile(absolute_path):
            file_name = os.path.basename(absolute_path)
            dest_path = os.path.join(destination, file_name)
            shutil.copy(absolute_path, dest_path)
            # print(f"Bestand gekopieerd: {absolute_path} naar {dest_path}")
source_file = 'xmls.txt'
destination_folder = 'xml/'
copy_files_to_destination(source_file, destination_folder)
Render AltoXML data to JPG for quality check
from pathlib import Path
import os,cv2
from alto import parse_file, String
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from tqdm import tqdm
# BASE_FOLDER = "xxxx 2010-2022 50119/"
BASE_FOLDER = "xxxx 2010-2022 14215/"
IMAGE_FOLDER = BASE_FOLDER+"JPG"
ALTO_FOLDER = BASE_FOLDER+"Alto-XML"
OUTPUT_FOLDER = BASE_FOLDER+"output"
def list_files(folder, scheme):
    return list(Path(folder).rglob(scheme))
def list_image_files():
    return list_files(IMAGE_FOLDER, "*.jpg")
def list_alto_files():
    return list_files(ALTO_FOLDER, "*.xml")
def get_alto_file(file_id):
    return Path(str(make_path(ALTO_FOLDER, file_id, ".xml")).replace(".xml", "_alto.xml"))
def get_image_file(file_id):
    return make_path(ALTO_FOLDER, file_id, ".jpg")
def get_output_file(file_id):
    return make_path(OUTPUT_FOLDER, file_id, ".jpg")
def get_id_from_path(path):
    return Path(os.path.basename(os.path.dirname(path))).joinpath(path.stem)
def make_path(base_folder, id, suffix):
    file = Path(base_folder).joinpath(id).with_suffix(suffix)
    file_folder = file.parent
    file_folder.mkdir(parents=True, exist_ok=True)
    return file
#######################
font = ImageFont.truetype("arial.ttf", 24)
for image_file in tqdm(list_image_files()[:5]):
    img = Image.open(str(image_file))
    draw = ImageDraw.Draw(img)
    file_id = get_id_from_path(image_file)
    alto_file = get_alto_file(file_id)
    output_file = get_output_file(file_id)
    alto = parse_file(str(alto_file))
    for line in alto.extract_text_lines():
        for word in line.strings:
            if isinstance(word, String):
                (x, y) = p1 = tuple(int(num) for num in (word.hpos, word.vpos))
                (w, h) = tuple(int(num) for num in (word.width, word.height))
                p2 = (x+w, y+h)
                draw.rectangle((x, y, x+w, y+h), outline="blue")
                draw.text((x, y+h-10), word.content, font=font, fill="red")
    nieuwe_breedte = int(img.width * 0.5)
    nieuwe_hoogte = int(img.height * 0.5)
    img = img.resize((nieuwe_breedte, nieuwe_hoogte))
    img.save(str(output_file))
Grouping with pandas
import pandas as pd
df = pd.read_excel('data/all-rows.xlsx')
df = df[df['CODE'].isin(["1202.xx","1202.xx",........])]
groepen = df.groupby(['Straatnaam', 'Huisnummer'])
groepen = groepen.filter(lambda x: len(x) > 1)
groepen = groepen.groupby(['Straatnaam', 'Huisnummer'])
with open("tmp.tsv","w") as out:
for naam, groep in groepen:
print(" ".join(naam),file=out)
for index, rij in groep.iterrows():
rij = rij.fillna('')
print(rij["ID"], rij["CODE"], " ", rij['Achternaam'], rij['Voorna(a)m(en)'], rij['Geboortedatum'], rij["xxxx"], rij["Overslaan in uitvoer"], rij["Externe Identifier"], rij["Bron overlijden"], rij["GUID"], sep='\t',file=out )
print("\n",file=out)
Download all prismic documents and images
#!/usr/bin/env python3
import os,re,requests,json,urllib,hashlib
from urllib.parse import urlparse
from collections import defaultdict
url = "https://XXXXXX.cdn.prismic.io/api/v2/documents/search?ref=XXXXXXXX"
while url:
    response = requests.get(url)
    data = response.json()
    page = data.get("page")
    results = data.get("results")
    json.dump(data, open(f"page{page}.json","w"), indent=4)
    for result in results:
        for field in result["data"]:
            if result["data"][field] and type(result["data"][field]) == dict and "url" in result["data"][field]:
                url = result["data"][field]["url"]
                parsed_url = urlparse(url)
                path = parsed_url.path
                ext = os.path.splitext(path)[1]
                filename = "images/" + hashlib.md5(url.encode()).hexdigest() + ext
                print(field, filename)
                if not os.path.exists(filename):
                    urllib.request.urlretrieve(url, filename) # save
    url = data.get("next_page")
    # break
file exists
if os.path.exists(filename):
add dashes to a GUID
def guid_dashed(s):
    return "{" + (f"{s[:8]}-{s[8:12]}-{s[12:16]}-{s[16:20]}-{s[20:]}") + "}"
split a surname into tussenvoegsel (prefix) and surname
# split achternaam / tussenvoegsel
for rol in ["overledene","vader","moeder","partner"]:
    achternaam = item.get(f"achternaam {rol}")
    patroon = re.compile(r"^(van der|van den|van de|van het|van 't|van|der|de)", re.IGNORECASE)
    tussenvoegsel = patroon.findall(achternaam)
    achternaam = patroon.sub("", achternaam)
    if tussenvoegsel:
        item[f"tussenvoegsel {rol}"] = (" ".join(tussenvoegsel)).lower()
        item[f"achternaam {rol}"] = achternaam.strip()
re-create items based on expected keys with specific field order using list comprehension
expected_keys = ["filename", "x", "y"] # etc
items = [{key: item[key] for key in expected_keys if key in item} for item in items]
# without list comprehension:
# new_items = []
# for item in items:
#     new_item = {}
#     for key in expected_keys:
#         if key in item:
#             new_item[key] = item[key]
#     new_items.append(new_item)
ensure that the letter A or B is always followed by a space when it is immediately followed by a digit
re.sub(r"(A|B)(\d)", r"\1 \2", input_string)
fuzzy lookup for dates in different Dutch formats
dates = get_all_dates(datetime(1800,1,1))
lut = { get_dutch_date_with_dashes(date):date for date in dates }
lut |= { get_dutch_date_with_written_month(date):date for date in dates }
lut |= { get_dutch_date_fully_written(date):date for date in dates }
a = fuzzy_extract("5-10_1950t", lut.keys())
b = fuzzy_extract("5 oktber 1950", lut.keys())
c = fuzzy_extract("vijftn tober negentsnhonerd viftg", lut.keys())
print(lut[a[0]], a)
print(lut[b[0]], b)
print(lut[c[0]], c)
get all dates in a range as a list
with list comprehension and typed
from typing import List # needed for the annotated version
def get_all_dates(start_date: datetime = datetime(1900, 1, 1), end_date: datetime = datetime.now()) -> List[datetime]:
    return [start_date + timedelta(days=d) for d in range((end_date - start_date).days + 1)]
without:
def get_all_dates(start_date=datetime(1900, 1, 1), end_date=datetime.now()):
    dates = []
    date = start_date
    while date <= end_date: # <= so the end date is included, like the version above
        dates.append(date)
        date += timedelta(days=1)
    return dates
format Dutch dates
def get_dutch_date_with_dashes(date): # 05-12-1979, with leading zero
    return date.strftime("%d-%m-%Y")
def get_dutch_date_with_written_month(date): # 5 december 1979, no leading zero for the day
    locale.setlocale(locale.LC_TIME, "nl_NL")
    return date.strftime("%-d %B %Y")
def get_dutch_date_fully_written(date): # vijftien december negentienhonderd negenzeventig
    locale.setlocale(locale.LC_TIME, "nl_NL")
    numbers_str = ["één","twee","drie","vier","vijf","zes","zeven","acht","negen","tien","elf","twaalf","dertien","veertien","vijftien","zestien","zeventien","achttien","negentien","twintig","eenentwintig","tweeëntwintig","drieëntwintig","vierentwintig","vijfentwintig","zesentwintig","zevenentwintig","achtentwintig","negenentwintig","dertig","eenendertig","tweeëndertig","drieëndertig","vierendertig","vijfendertig","zesendertig","zevenendertig","achtendertig","negenendertig","veertig","eenenveertig","tweeënveertig","drieënveertig","vierenveertig","vijfenveertig","zesenveertig","zevenenveertig","achtenveertig","negenenveertig","vijftig","eenenvijftig","tweeënvijftig","drieënvijftig","vierenvijftig","vijfenvijftig","zesenvijftig","zevenenvijftig","achtenvijftig","negenenvijftig","zestig","eenenzestig","tweeënzestig","drieënzestig","vierenzestig","vijfenzestig","zesenzestig","zevenenzestig","achtenzestig","negenenzestig","zeventig","eenenzeventig","tweeënzeventig","drieënzeventig","vierenzeventig","vijfenzeventig","zesenzeventig","zevenenzeventig","achtenzeventig","negenenzeventig","tachtig","eenentachtig","tweeëntachtig","drieëntachtig","vierentachtig","vijfentachtig","zesentachtig","zevenentachtig","achtentachtig","negenentachtig","negentig","eenennegentig","tweeënnegentig","drieënnegentig","vierennegentig","vijfennegentig","zesennegentig","zevenennegentig","achtennegentig","negenennegentig"]
    day = numbers_str[date.day-1]
    century = int(date.year/100)
    year_within_century = int(date.year%100)
    century_str = numbers_str[century-1]
    year_within_century_str = numbers_str[year_within_century-1] if year_within_century else ""
    return (f"{day} {date.strftime('%B')} {century_str}honderd {year_within_century_str}").strip()
merge two dicts with '|' operator
datums = get_alle_datums_als_tekst(datetime(1800,1,1))
datums |= get_alle_datums(datetime(1800,1,1))
filter out keys that are not in expected_keys
data = {"filename": "x", "aktenummer": "y", "overlijdensdatum": "z", "achternaam overledene": "FILTERED OUT"}
expected_keys = ["filename", "aktenummer", "overlijdensdatum"]
data = dict(filter(lambda x: x[0] in expected_keys, data.items()))
or for multiple items:
items = [ dict(filter(lambda x: x[0] in expected_keys, item.items())) for item in items ]
call a remote function running with Flask
URL=http://URL
INPUT_FILE=INPUT.txt
OUTPUT_FILE=OUTPUT.json
curl -J -X POST -F "file=@$INPUT_FILE" $URL > $OUTPUT_FILE
remote function implementation on server with Flask
#!/usr/bin/env python3
from flask import Flask,request,send_file
import json
app = Flask(__name__)
@app.route("/")
def index():
return "ok"
@app.route('/run', methods=['POST'])
def run():
input_file_path = '/tmp/INPUT.txt'
output_file_path = '/tmp/OUTPUT.json'
f = request.files['file']
f.save(input_file_path)
with \
open(input_file_path, 'r', encoding="utf-8") as input_file, \
open(output_file_path, 'w') as output_file:
output = CONVERT_FILE_FUNCTION(input_file.read())
json.dump(output, output_file, indent=2, ensure_ascii=False)
result_file = open(output_file_path,'rb')
return send_file(result_file, as_attachment=True, download_name='result.json')
if __name__ == "__main__":
from waitress import serve
print("URL:PORT")
serve(app, host=URL, port=PORT)
harvest OAI-PMH and save as xml and json (using resumptionToken)
#!/usr/bin/env python3
import requests,json,xmltodict
base_url = "...........?verb=ListRecords"
url = base_url + "&metadataPrefix=oai_a2a&set=............"
page = 0
while url:
    print(url)
    response = requests.get(url)
    with open(f"data/xml/page-{page}.xml","w") as f:
        f.write(response.text)
    d = xmltodict.parse(response.text)
    json.dump(d, open(f"data/json/page-{page}.json","w"), indent=2)
    resumption_token = d["OAI-PMH"]["ListRecords"]["resumptionToken"]["#text"]
    if resumption_token:
        url = base_url + "&resumptionToken=" + resumption_token
        page += 1
    else:
        print("end?")
        break
selected python version...
/Applications/Xcode.app/Contents/Developer/usr/bin/python3.9
print SSL version
import ssl; print(ssl.OPENSSL_VERSION)
get version of installed modules with pip
pip list
update module with pip
pip install --upgrade openai
list amount of each 'Soort' across different json files
from collections import defaultdict
from pathlib import Path
import json
soort = defaultdict(int)
input_files = list(Path("data/json").rglob("*.json"))
for input_file_path in input_files:
    with open(input_file_path, "r") as file:
        for item in json.load(file):
            soort[item.get("Soort")] += 1
soort = dict(sorted(soort.items(), key=lambda x:x[1], reverse=True))
print(json.dumps(soort,indent=2,ensure_ascii=False))
according to ChatGPT this code can be written shorter as follows, thanks:
import json
import pandas as pd
from pathlib import Path
input_files = list(Path("data/json").rglob("*.json"))
df = pd.concat([pd.read_json(file) for file in input_files])
soort = df["Soort"].value_counts().to_dict()
print(json.dumps(soort,indent=2,ensure_ascii=False))
sort dict by value descending
soort = dict(sorted(soort.items(), key=lambda x:x[1], reverse=True))
default dict with int instead of lambda:0
soort = defaultdict(int)
RDF / turtle / jsonLD
see RDF
Download multiple pages of JSON from (omeka-s) API
page = 1
url = f"{API_URL}&page={page}"
while url:
    print(url)
    response = requests.get(url)
    json.dump(response.json(), open(f"data/page{page}.json","w"), indent=2)
    next = response.links.get("next")
    url = next["url"] if next else ""
    page += 1
get next link url from http response header
response = requests.get(url)
next = response.links.get("next")
if next:
    print(next["url"])
json dumps http request response header
import requests, json
response = requests.get(url)
print(json.dumps(dict(response.headers),indent=2))
item get
if "Achternaam" in item and item["Achternaam"]=="Bicker":
vs.
if item.get("Achternaam")=="Bicker":
remove double line breaks with optional whitespace in between
s = re.sub(r"\n\s*\n", "\n", s)
replace spaces by '_' from all keys in a dict
item = { x.replace(' ', '_'): v for x, v in item.items() }
pipx
pipx is a tool to help you install and run end-user applications written in Python. It's roughly similar to macOS's brew, JavaScript's npx, and Linux's apt.
It's closely related to pip. In fact, it uses pip, but is focused on installing and managing Python packages that can be run from the command line directly as applications. https://pypa.github.io/pipx/
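for example (pycowsay is the demo package from the pipx docs):
pipx install pycowsay
pycowsay moo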
tweepy
import tweepy,json
client = tweepy.Client("Bearer Token")
user_id=123123123
tweet_fields = ["attachments","author_id","context_annotations","conversation_id","created_at","entities","geo","id","in_reply_to_user_id","lang","referenced_tweets","reply_settings","source","text","withheld"]
tweets = []
# user tweets
# similar for client.get_liked_tweets
for tweet in tweepy.Paginator(client.get_users_tweets, id=user_id, tweet_fields=tweet_fields, max_results=100).flatten(limit=1500):
    tweets.append(tweet.data)
with open('tweets.json', 'w') as f:
    json.dump(tweets, f, indent=2)
parse and format date
# from: 24/Jan/2023:10:32:01 +0100
# to: 2023-01-24 10:32
datetime.datetime.strptime(date_str, "%d/%b/%Y:%H:%M:%S %z").strftime("%Y-%m-%d %H:%M")
parse access.log files to csv
see: https://coderwall.com/p/snn1ag/regex-to-parse-your-default-nginx-access-logs
#!/usr/bin/env python3
import json,re,csv,datetime
result = []
for line in open('all.log').readlines():
    # if not re.findall(r'GET /\d{1,2} ', line): # filter on GET /1, GET /2 etc.
    #     continue
    r = re.match(r'(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<dateandtime>.*)\] \"(?P<httpstatus>(GET|POST) .+ HTTP\/1\.1)\" (?P<returnstatus>\d{3} \d+) (\".*\")(?P<browserinfo>.*)\"', line)
    if r != None:
        result.append({'IP address': r.group('ipaddress'),
                       'Time Stamp': datetime.datetime.strptime(r.group('dateandtime'), "%d/%b/%Y:%H:%M:%S %z").strftime("%Y-%m-%d %H:%M"),
                       'HTTP status': r.group('httpstatus'),
                       'Return status': r.group('returnstatus'),
                       'Browser Info': r.group('browserinfo')})
# print(result)
with open('output.csv', 'w', encoding='utf8') as file:
    writer = csv.DictWriter(file, result[0].keys())
    writer.writeheader()
    writer.writerows(result)
#with open('data.json', 'w') as fp:
#    json.dump(result, fp, indent=2)
concat two lists
result = list1 + list2
json2csv (where json is an array of 'flat' objects)
#!/usr/bin/env python3
import sys,csv,json
if len(sys.argv) < 3:
    sys.exit(f"Usage: {sys.argv[0]} input.json output.csv")
with open(sys.argv[1]) as json_file, open(sys.argv[2], 'w') as csv_file:
    data = json.load(json_file)
    all_keys = set(key for row in data for key in row.keys())
    writer = csv.DictWriter(csv_file, all_keys)
    writer.writeheader()
    writer.writerows(data)
get all keys used in a list of dicts
all_keys = set()
for row in data:
    for key in row.keys():
        all_keys.add(key)
print(all_keys)
how to retrieve in Python multiple json files from a REST server when the next_page attribute is supplied by the server?
import requests, json
url = "YOUR_API_URL"
while url:
    response = requests.get(url)
    data = response.json()
    page = data.get("page")
    json.dump(data, open(f"page{page}.json","w"), indent=4)
    url = data.get("next_page")
read formula from cells in Excel
from openpyxl import load_workbook
wb = load_workbook(filename = 'HUA-Sabine-Fuzzy-Matches-score-vanaf-50pct-19-april-2022.xlsx')
sheet = wb['Sheet1']
for row in range(1,10):
    status = sheet.cell(row=row, column=1).value
install package with sudo as root
this way, for example, the www-data user can also use the package; not sure whether it's safe.
sudo su
sudo pip3 install mypackage
fuzzy regex
import regex # the third-party 'regex' module supports fuzzy matching
match = regex.search("(negentienhonderd){e<=3}", item["text"], regex.BESTMATCH)
point finding functions
from scipy.spatial import distance # for cdist in closest_point
def get_points_left(point, points):
    if points:
        return list(filter(lambda p: p[0] < point[0], points))
def get_points_below(point, points):
    if points:
        return list(filter(lambda p: p[1] > point[1], points))
def get_points_left_below(point, points):
    if points:
        return get_points_left(point, get_points_below(point, points))
def closest_point(point, points):
    if points:
        return points[distance.cdist([point], points).argmin()]
# example:
p2 = closest_point(p1, get_points_left_below(p, points))
template matching with opencv
def match(img, template):
    th, tw = template.shape[:2] # template height and width
    result = cv2.matchTemplate(img, template, cv2.TM_CCORR_NORMED)
    min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)
    lt = (x1, y1) = max_loc
    rb = (x2, y2) = (x1+tw, y1+th)
    return (lt, rb)
lt, rb = match(img, template)
cv2.rectangle(img, lt, rb, color=0, thickness=-1)
cv2.rectangle
cv2.rectangle(img, lt, rb, color=(b,g,r), thickness=10)
make a list unique
result = list(set(['a','b','a']))
# result: ['a','b'] (set() does not preserve order)
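# if order matters, dict.fromkeys keeps first-seen order:
result = list(dict.fromkeys(['a','b','a'])) # result: ['a','b']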
create csv from a list of tuples
plaatsnamen = [("Utrecht",5), ("Amersfoort",2)]
writer = csv.writer(open("output.csv","w"))
writer.writerow(['plaatsnaam','aantal'])
writer.writerows(plaatsnamen)
sort dict
plaatsnamen = dict(sorted(plaatsnamen.items(), key=lambda item: item[1], reverse=True))
fuzzy match and convert a Dutch written/spelled date from the 20th century:
from datetime import datetime, timedelta
import sys
from rapidfuzz import fuzz
import rapidfuzz.process as fuzzy
def alle_datums_20e_eeuw_als_tekst(): # returns a dict {string: datetime}
    maanden = ["januari","februari","maart","april","mei","juni","juli","augustus","september","oktober","november","december"]
    getallen_1tm99 = ["één","twee","drie","vier","vijf","zes","zeven","acht","negen","tien","elf","twaalf","dertien","veertien","vijftien","zestien","zeventien","achttien","negentien","twintig","eenentwintig","tweeëntwintig","drieëntwintig","vierentwintig","vijfentwintig","zesentwintig","zevenentwintig","achtentwintig","negenentwintig","dertig","eenendertig","tweeëndertig","drieëndertig","vierendertig","vijfendertig","zesendertig","zevenendertig","achtendertig","negenendertig","veertig","eenenveertig","tweeënveertig","drieënveertig","vierenveertig","vijfenveertig","zesenveertig","zevenenveertig","achtenveertig","negenenveertig","vijftig","eenenvijftig","tweeënvijftig","drieënvijftig","vierenvijftig","vijfenvijftig","zesenvijftig","zevenenvijftig","achtenvijftig","negenenvijftig","zestig","eenenzestig","tweeënzestig","drieënzestig","vierenzestig","vijfenzestig","zesenzestig","zevenenzestig","achtenzestig","negenenzestig","zeventig","eenenzeventig","tweeënzeventig","drieënzeventig","vierenzeventig","vijfenzeventig","zesenzeventig","zevenenzeventig","achtenzeventig","negenenzeventig","tachtig","eenentachtig","tweeëntachtig","drieëntachtig","vierentachtig","vijfentachtig","zesentachtig","zevenentachtig","achtentachtig","negenentachtig","negentig","eenennegentig","tweeënnegentig","drieënnegentig","vierennegentig","vijfennegentig","zesennegentig","zevenennegentig","achtennegentig","negenennegentig"]
    current_date = datetime(1900, 1, 1)
    dagen = {}
    while current_date.year < 2000:
        dag = getallen_1tm99[current_date.day-1]
        maand = maanden[current_date.month-1]
        jaar = ("negentienhonderd " + (getallen_1tm99[current_date.year-1900-1] if current_date.year > 1900 else "")).strip() # guard against index -1 for the year 1900 itself
        dagen[f"{dag} {maand} {jaar}"] = current_date
        current_date += timedelta(days=1)
    return dagen
def fuzzy_extract(input_str, compare_strs): # (result, match_pct, idx)
    return fuzzy.extractOne(input_str, compare_strs, scorer=fuzz.ratio)
datums = alle_datums_20e_eeuw_als_tekst()
#result,_,_ = fuzzy_extract("dertien januari negentienhonderd negenennegentig", datums.keys())
result,_,_ = fuzzy_extract("dertasdfien januaasdfri negeasdfntienahonderd negenenneasdfgentig", datums.keys())
print(result)
print(datums[result])
defaultdict in defaultdict (test)
all_first_names_and_occurences_dict = defaultdict(lambda: defaultdict(lambda: 0))
output_rows to excel using pandas
print("writing Excel file")
excel_writer = pd.ExcelWriter(config.EXCEL_OUTPUT_FILE, engine='xlsxwriter')
df = pd.DataFrame(output_rows)
df.to_excel(excel_writer, sheet_name="Sheet1", startrow=1, header=False, index=False)
excel_writer.sheets['Sheet1'].set_column(0,20, 25) # for cols 0 to 20 set width=25
(max_row, max_col) = df.shape
column_settings = [{'header': column} for column in df.columns]
excel_writer.sheets['Sheet1'].add_table(0, 0, max_row, max_col - 1, {'columns': column_settings})
excel_writer.close()
pandas groupby to tabs/sheets in Excel with table
df = pd.read_csv("overzicht.csv")
writer = pd.ExcelWriter('test.xlsx', engine='xlsxwriter')
for aet, frame in df.groupby("AET"):
    frame.to_excel(writer, sheet_name=aet, startrow=1, header=False, index=False)
    column_settings = [{'header': column} for column in frame.columns]
    (max_row, max_col) = frame.shape
    writer.sheets[aet].add_table(0, 0, max_row, max_col - 1, {'columns': column_settings})
writer.close()
xlsxwriter with pandas
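a minimal sketch with made-up data (see the fuller examples above):
import pandas as pd
df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
writer = pd.ExcelWriter("out.xlsx", engine="xlsxwriter")
df.to_excel(writer, sheet_name="Sheet1", index=False)
writer.close()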
format with 'repr', 'str' or 'ascii'
print(f"{state!r}")
print(f"{state!s}")
print(f"{state!a}")
pandas groupby (count)
df = pd.read_csv("overzicht.csv") # sep=',', engine='python', header=0, usecols=["tip", "sex", "time"]), index_col=["sex", "tip"], nrows=5, skiprows = [1,12],
group_by_aet = df.groupby("AET")["CODE"].count()
print(group_by_aet)
trim items when splitting
input_string='ARP; CHU; KVP'
items = [x.strip() for x in input_string.split(';')]
# ['ARP', 'CHU', 'KVP'] instead of ['ARP', ' CHU', ' KVP']
oracle sql through jdbc
pip install cx_oracle
- and download the instant client
- open the dylibs with Terminal.app to Allow execution
import os, cx_Oracle
lib_dir = os.path.join(os.environ.get("HOME"), "Downloads", "instantclient_19_8")
cx_Oracle.init_oracle_client(lib_dir=lib_dir)
dsn = cx_Oracle.makedsn(IP_OR_HOSTNAME, '1521', service_name=SERVICE_NAME)
connection = cx_Oracle.connect(os.environ["USER"], os.environ["PASS"], dsn)
cur = connection.cursor()
for row in cur.execute("select * from TABLE;"):
    print(row)
via dotenv:
import os, cx_Oracle
from dotenv import load_dotenv
load_dotenv()
def oracle_get_cursor():
    lib_dir = os.path.join(os.environ["ORACLE_LIB_DIR"])
    cx_Oracle.init_oracle_client(lib_dir=lib_dir)
    dsn = cx_Oracle.makedsn(os.environ["ORACLE_IP"], '1521', service_name=os.environ["ORACLE_SERVICE_NAME"])
    connection = cx_Oracle.connect(os.environ["ORACLE_USER"], os.environ["ORACLE_PASS"], dsn)
    cur = connection.cursor()
    return cur
query result as a dictionary (single result):
def get_bestandsnaam(cur, id):
    # cur.execute(f"SELECT * FROM bestanden WHERE id={id}") # NOT SAFE
    cur.execute("""select * from bestanden where id = :id""", id=id) # safer: bind variable
    cur.rowfactory = lambda *args: dict(zip([d[0].lower() for d in cur.description], args))
    row = cur.fetchone()
    return row["bestandsnaam"]
list all valid xnummers recursively
def listdir_valid_xnummers_to_csv():
    folderroot = "/Volumes/SHARE$/FOLDER"
    xnummers = []
    for path in Path(folderroot).rglob('*.jpg'):
        r = re.findall(r"(?:X)(\d+)", path.name)
        if r:
            path_without_root = str(path).replace(folderroot, '') # note: str(path)!
            xnummers.append({"nummer": int(r[0]), "path": path_without_root})
    xnummers = sorted(xnummers, key=lambda item: item['nummer'])
    writer = csv.DictWriter(open("all-valid-xnummers.csv","w"), fieldnames=["nummer","path"])
    writer.writeheader()
    writer.writerows(xnummers)
pathlib / os.path
import os
from pathlib import Path
for path in Path(folderroot).rglob('*.jpg'):
    folder = os.path.basename(os.path.dirname(path))
    print(folder, path.name)
try except
try:
    1 / 0
except Exception as e:
    print(e)
with traceback:
import traceback
traceback.print_exc()
named tuples
from collections import namedtuple
Point = namedtuple('Point', ['x', 'y'])
#Rect = namedtuple('Rect', ['x', 'y', 'w', 'h'])
a = Point(1,2)
print(a.x)
list to dict
dict(zip(['x','y','w','h'], get_bounds(box_points)))
cast array to namedTuple
"You can do Row(*A) which using argument unpacking." source
from collections import namedtuple
input_array = [1705, 155, 106, 38]
Rect = namedtuple('Rect', ['x', 'y', 'w', 'h'])
bounds = Rect(*input_array)
print(bounds)
# output: Rect(x=1705, y=155, w=106, h=38)
ensure_ascii=False
print(json.dumps(data,indent=2,ensure_ascii=False))
liquid templates
pip install liquidpy
from liquid import Liquid
tpl = Liquid("templates/template.xml", liquid_from_file=True)
data = {
"ID": item["index"],
"ParentID": item["parentIndex"],
"Text": item["text"]
}
result = tpl.render(**data)
print(result)
custom filters in liquidpy 'wild' mode
{% addfilter md5 %}
import hashlib
def md5(s):
    return hashlib.md5(s.encode()).hexdigest()
{% endaddfilter %}
parse macOCR result
from ast import literal_eval as make_tuple # assumption: make_tuple was not defined in the original; ast.literal_eval is the usual choice
def read_ocr(filename): # txt macOCR format
    for raw_line in open(filename).readlines():
        line_tuple = make_tuple(raw_line)
        yield({
            "box": [tuple([int(coordinate) for coordinate in coordinates]) for coordinates in line_tuple[0]],
            "text": line_tuple[1],
            "confidence": line_tuple[2]
        })
ocr_items = list(read_ocr(INPUT_MACOCR))
ignore 'illegal' unicode chars in a string
data["Text"].encode("ascii", "ignore").decode()  # str() on the bytes object would give "b'...'"
sort by integer without itemgetter but with a lambda function
relaties = sorted(relaties, key=lambda item:int(item['aantal']), reverse=True)
strip trailing numbers comma and whitespace
s = s.strip().lower().rstrip(" 1234567890,")
download and save image
urllib.request.urlretrieve(URL, local_filename)
markup html with BeautifulSoup
https://beautiful-soup-4.readthedocs.io/en/latest/
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
print(soup.prettify())
get all links:
for link in soup.find_all('a'):
print(link.get('href'))
get text of div with class:
copyright = soup.find("div", class_="copyright-authoryear").get_text().strip()
which python does Sublime use?
#!/usr/bin/env python3
import sys
print(sys.executable)
sys.exit()
in my case:
/usr/local/opt/python@3.10/bin/python3.10
use pip for specific python version
/usr/local/opt/python@3.10/bin/python3.10 -m pip install bs4
requests
import requests, json
response = requests.get(URL)
# response = requests.get(URL, headers={"Accept":"application/ld+json"})
data = response.json()
print(json.dumps(data,indent=2))
on Ubuntu, when pip installs packages into ~/.local/lib/python3.6/site-packages/
switch to root before running pip install:
sudo su
list used packages in Python script with versions
import pkg_resources
for i in pkg_resources.working_set:
print(i.key + "==" + i.version)
list all files recursive
#!/usr/bin/env python3
from pathlib import Path
folder = "/Volumes/archiefbestanden$/Kadaster/DVD's/"
for path in Path(folder).rglob('*'):
print(path)
join list to string ignore blanks
' '.join(filter(None, strings))
set label for tqdm progress bar
pbar = tqdm(straten) # list
for i,straat in enumerate(pbar):
straatnaam = straat["straatnaam"]
pbar.set_description(f"{i} {straatnaam}")
look up the geo coordinate of (the midpoint of) a street on Wikidata via the BAG openbare_ruimte_ID.
query="""SELECT ?straat ?straatLabel ?punt ?woonplaats ?woonplaatsLabel WHERE {
?straat wdt:P625 ?punt .
?straat wdt:P131 ?woonplaats .
?straat wdtn:P5207 <http://bag.basisregistraties.overheid.nl/bag/id/openbare-ruimte/"""+straat_id+"""> .
SERVICE wikibase:label { bd:serviceParam wikibase:language "nl". } }"""
url = "https://query.wikidata.org/sparql"
response = requests.get(url, params={'query' : query}, headers={'Accept' : 'application/sparql-results+json'})
data = response.json()
read csv as dict etc
def csv2dict(filename):
return csv.DictReader(open(csv_folder+filename, encoding="utf-8-sig"), delimiter=";")
def csv2lut(filename):
return { row["Unieke waarde (niet aanpassen)"]:row["Correctie (hier corrigeren)"]
for row in csv2dict(filename) }
split geo point
def split_geo_point(s):
parts = s.lower().split(" ")
lat = parts[1].replace(")","")
lon = parts[0].replace("point(","")
return (lat,lon)
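for example, with a WKT-style point string (made-up coordinates):
print(split_geo_point("Point(5.1214 52.0907)"))  # -> ('52.0907', '5.1214'), i.e. (lat, lon)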
urlencode
print(urllib.parse.quote_plus(query))
write stderr to file
sys.stderr = open("stderr.log","w")
total seconds from time
# must be a better way...
dt = datetime.strptime(film["STARTTIJD"][:8],"%H:%M:%S").time()
startOffset = dt.hour * 3600 + dt.minute * 60 + dt.second
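one alternative (a sketch): strptime without a date part defaults to 1900-01-01, so subtracting that epoch yields a timedelta:
from datetime import datetime
dt = datetime.strptime(film["STARTTIJD"][:8], "%H:%M:%S")  # keep the datetime, don't call .time()
startOffset = int((dt - datetime(1900, 1, 1)).total_seconds())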
namedtuple
Point = namedtuple('Point', 'x y')
center = Point(10,10) # center is a namedtuple, that can be accessed either using x and y or an index (0,1)
print(center.x)
parse points and get bounds
def get_points(points_string):
    return [tuple(map(int, coords.split(','))) for coords in points_string.split(" ")]
def get_bounds(points):
    xs = [p[0] for p in points]
    ys = [p[1] for p in points]
    return (min(xs), min(ys), max(xs), max(ys))  # note: min(points)[1] would give the y of the leftmost point, not the minimum y
def is_overlapping_1D(line1, line2):  # line: (min, max)
    return line1[0] <= line2[1] and line2[0] <= line1[1]
def is_overlapping_2d(box1, box2):  # box: (xmin, ymin, xmax, ymax)
    return is_overlapping_1D([box1[0], box1[2]], [box2[0], box2[2]]) and is_overlapping_1D([box1[1], box1[3]], [box2[1], box2[3]])
points_string = "93,1349 93,1502 162,1502 162,1349"
print(get_points(points_string))
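quick check of the overlap test with two hypothetical boxes:
print(is_overlapping_2d((0, 0, 10, 10), (5, 5, 20, 20)))   # True: they share the region (5,5)-(10,10)
print(is_overlapping_2d((0, 0, 10, 10), (11, 0, 20, 10)))  # False: no overlap on the x axis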
get duration of films in folder
#!/usr/bin/env python3
import csv, glob, os
import subprocess, json
from time import strftime, gmtime
from tqdm import tqdm
def get_duration_ffprobe(filename):
    result = subprocess.check_output(
        f'ffprobe -v quiet -show_streams -select_streams v:0 -of json "{filename}"',
        shell=True).decode()
    fields = json.loads(result)['streams'][0]
    return fields['duration']
# GET DURATION OF ALL FILMS
movie_folder = "MOVIE_FOLDER/"
files = glob.glob(movie_folder+"*.mp4", recursive=False)
writer = csv.DictWriter(open("film-duration.csv","w"), fieldnames=["name","duration"]) #, delimiter=',', quoting=csv.QUOTE_ALL, dialect='excel')
writer.writeheader()
for filename in tqdm(files):
    row = {}
    row["name"] = os.path.basename(filename)
    row["duration"] = strftime("%H:%M:%S:00", gmtime(float(get_duration_ffprobe(filename))))
    writer.writerow(row)
difflib
https://docs.python.org/3/library/difflib.html (tip from Lars)
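a minimal sketch of the two difflib calls I reach for (the strings are made up):
import difflib
print(difflib.SequenceMatcher(None, "Aaldering", "Aalderink").ratio())  # similarity score in [0, 1]
print(difflib.get_close_matches("Aalderingh", ["Aaldering", "Aalderink", "Jansen"]))  # best fuzzy matches first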
fuzzy matching in strings
https://gist.github.com/companje/93f6061629ac27a2027a77888effd6ad
yield
The yield statement suspends the function's execution and sends a value back to the caller, but retains enough state for the function to resume where it left off. When resumed, the function continues immediately after the last yield. This lets it produce a series of values over time, rather than computing them all at once and returning them as a list. https://www.geeksforgeeks.org/use-yield-keyword-instead-return-keyword-python/
def x():
    for i in [5, 4, 2]:
        yield i
for i in x():
    print(i)
uuid / guid
import uuid
print(uuid.uuid4().hex)
string padding
mystring.rjust(10, '0')  # or mystring.zfill(10) for zero-padding
any
contains_thans = any(thans_spelling in raw_country_str for thans_spelling in ["thans", "th.", "th "])
usage
import sys,os
from sys import argv
if len(argv)!=3:
sys.exit("Usage: "+os.path.basename(argv[0])+" {INPUT_CSV} {OUTPUT_FILE}")
input_filename = argv[1]
output_filename = argv[2]
unique values per column in a csv
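there was no snippet here; a minimal sketch, assuming a comma-delimited data/file.csv (placeholder filename):
import csv
from collections import defaultdict
unique_values = defaultdict(set)
for row in csv.DictReader(open("data/file.csv")):
    for column, value in row.items():
        unique_values[column].add(value)
for column, values in unique_values.items():
    print(column, len(values), sorted(values)[:10])  # column name, count, first 10 unique values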
parallel processing
from joblib import Parallel, delayed
from tqdm import tqdm
def DoSomething(filename, param2, param3):
    # ...
    return result
results = Parallel(n_jobs=8)(
    delayed(DoSomething)(filename, param2, param3)
    for filename in tqdm(filenames)
)
get number of processor cores
import psutil
print(psutil.cpu_count(logical=True))  # stdlib alternative: os.cpu_count()
progress bar
from tqdm import tqdm
# ...
for filename in tqdm(filenames):
    # ...
request
from urllib.request import urlopen
import urllib.parse
#...
request = urlopen(url)
data = json.load(request)
json.dump(data, open(cache_filename,"w"), indent=4)
defaultdict with numbers
from collections import defaultdict
aets = defaultdict(lambda: 0)  # or: defaultdict(int)
for ....
    aets[row["CODE"]] += 1
serial
#!/usr/bin/env python3
# on Sanyo: type file.asm > aux
import serial
ser = serial.Serial('/dev/tty.usbmodem1301',1200)
while True:
    x = ser.read()
    print(x.decode('ascii'), end="")
ser.close()  # unreachable while the loop runs; in real use, close on exit/Ctrl-C
parse and format date
import datetime
try:
    isodate = datetime.datetime.strptime(datum, '%d-%m-%Y').strftime('%Y-%m-%d')
except ValueError:
    pass  # just skip invalid/incomplete dates
create a spreadsheet with fields as columns, per unit
#!/usr/bin/env python3
import csv,re
from collections import defaultdict
filename = "alle-personen.csv"
output_filename = "output.csv"
fixed = ["ID","GUID","CODE","BESTANDSNAAM"]
flex_key = "PROMPT"
flex_value = "WAARDE"
header = fixed.copy()
items = defaultdict(dict)
for row in csv.DictReader(open(filename)):
    row["WAARDE"] = row["WAARDE"].replace("\n", " ")  # replace line breaks by spaces
    # create or get item
    item = items[row["ID"]]
    # add fixed fields
    for k, v in row.items():
        if k in fixed:
            item[k] = v
    # add flex fields
    item[row[flex_key]] = row[flex_value]
    # update header
    if row[flex_key] not in header:
        header.append(row[flex_key])
# output to csv
writer = csv.DictWriter(open(output_filename, "w"), fieldnames=header)  # optional: delimiter=',', quoting=csv.QUOTE_ALL, dialect='excel'
writer.writeheader()
writer.writerows(items.values())
replace broken words based on lookup table
#!/usr/bin/env python3
import csv
import os.path
reader = csv.DictReader(open("Gekke tekens.csv"), delimiter=',')
lijst = [dict(d) for d in reader]
for line in open("vreemde_tekens2-gemaakt-via-grep-commando.txt").readlines():
    filepath = "../test.documentatie.org-met-lfs/test.documentatie.org/data/wp/" + line.split(':')[0].strip()
    basename = os.path.basename(filepath)
    if not os.path.isfile(filepath):
        print("NOT FOUND", filepath)
        continue
    with open(filepath) as infile:
        data = infile.read()
    for li in lijst:
        data = data.replace(li["fout"], li["gecorrigeerd"])
    with open("tmp/" + basename, 'w', encoding="utf-8") as outfile:
        outfile.write(data)
find broken characters with context
#!/usr/bin/env python3
import re, csv
from collections import defaultdict
results = defaultdict(list)
for line in open("vreemde_tekens2-gemaakt-via-grep-commando.txt").readlines():
    filepath = line.split(':')[0].strip()
    data = "".join(line.split(':')[1:])
    matches = re.findall(r'([a-zA-Z�]+)(�)([a-zA-Z�]+)', data)
    for m in matches:
        s = "".join(m)
        results[s].append("http://test.documentatie.org/data/wp/" + filepath)
writer = csv.writer(open("result.csv", "w"))
for k, v in results.items():
    k2 = re.sub(r"�", "", k)
    writer.writerow([k, k2, v[0], len(v)])
find hexstring in files recursively
#!/usr/bin/env python3
import glob, re
files = glob.glob('../test.documentatie.org-met-lfs/test.documentatie.org/data/wp/**/*.htm*', recursive=True)
for filename in files:
    with open(filename, "rb") as f:
        f1 = re.search(b'\xEF\xBF\xBD', f.read())  # UTF-8 bytes of U+FFFD (replacement character)
        if f1:
            print(filename, "\t", f1)
combine 2 csv files (database tables) into 1 json file with hierarchy
#!/usr/bin/env python3
import csv, json
uitvoergegevens = { row["ID"]:row for row in csv.DictReader(open("uitvoergegevens.csv", encoding="cp1252")) }
uitvoervelden = { row["ID"]:row for row in csv.DictReader(open("uitvoervelden.csv", encoding="cp1252")) }
for row in uitvoervelden.values():
    ugn = uitvoergegevens[row["UGN_ID"]]
    if "uitvoervelden" not in ugn:
        ugn["uitvoervelden"] = []
    ugn["uitvoervelden"].append(row)  # always append; the original if/else dropped the first row of each group
json.dump(uitvoergegevens, open("result.json", "w"), indent=2)
read CSV file as dictionary with primary key
uitvoergegevens = { row["ID"]:row for row in csv.DictReader(open("uitvoergegevens.csv", encoding="cp1252")) }
uitvoervelden = { row["ID"]:row for row in csv.DictReader(open("uitvoervelden.csv", encoding="cp1252")) }
csv2xlsx - csv to excel
#!/usr/bin/env python3
# source: https://stackoverflow.com/questions/17684610/python-convert-csv-to-xlsx
import os
import glob
import csv
from xlsxwriter.workbook import Workbook
for csvfile in glob.glob(os.path.join('.', '*.csv')):
    workbook = Workbook(csvfile[:-4] + '.xlsx')
    worksheet = workbook.add_worksheet()
    with open(csvfile, 'rt', encoding='utf8') as f:
        reader = csv.reader(f)
        for r, row in enumerate(reader):
            for c, col in enumerate(row):
                worksheet.write(r, c, col)
    workbook.close()
python csv 'list' reader
file = open("file.csv")
reader = csv.reader(file)
inline print for each result
[print(result) for result in results]  # works, but a plain for-loop is more idiomatic for side effects
get key and value for the items in a dict
for k,v in scores.items():
print(k,v)
get the object with the max value in a dict
model = max(scores, key = scores.get)
# equivalent to:
model = max(scores, key = lambda k : scores.get(k))
get filename from path
file_name = os.path.basename(file_path)
recursive filelist by type
file_paths = glob.glob(folder + '/**/*.jpg', recursive=True)
semi transparent rectangle opencv python
import cv2
import numpy as np
def rechthoek(img, leftTop, rightBottom, color=(255, 255, 0), opacity=.2):
    (x1, y1) = leftTop
    (x2, y2) = rightBottom
    sub_img = img[y1:y2, x1:x2]
    white_rect = np.ones(sub_img.shape, dtype=np.uint8)
    white_rect[:, :] = color
    res = cv2.addWeighted(sub_img, 1 - opacity, white_rect, opacity, 1.0)
    img[y1:y2, x1:x2] = res
problem with scipy dependency when installing easyocr through pip
it helped to install scipy using brew instead of pip
brew install scipy
pip install easyocr
which python?
ls -l /usr/local/bin/python3
ls -l /usr/local/bin/python
create image tiles / sprite sheet with python, opencv and numpy
import cv2
import numpy as np
margin = 5
cols = 5
rows = 10
stamp = cv2.imread("50p.jpeg")
stamp_h, stamp_w = stamp.shape[:2]
height = stamp_h*rows + margin*(rows+1)  # was (margin*rows+1): a misplaced parenthesis dropped the outer margin
width = stamp_w*cols + margin*(cols+1)
img = np.ones((height, width, 3), np.uint8) * 255  # white
for row in range(0, rows):
    for col in range(0, cols):
        print(row, col)
        x = col*stamp_w + (col+1)*margin
        y = row*stamp_h + (row+1)*margin
        img[y:y+stamp_h, x:x+stamp_w] = stamp
cv2.imwrite("sheet.jpg", img)
cv2.namedWindow("test", cv2.WINDOW_NORMAL)
cv2.setWindowProperty("test", cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)
cv2.setWindowProperty("test", cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_NORMAL)
cv2.imshow("test", img)
cv2.waitKey()
create an empty image
import cv2
import numpy as np
height=1000
width=1000
img = np.zeros((height,width,3), np.uint8)
img[:,0:width//2] = (255,0,0) # (B, G, R)
img[:,width//2:width] = (0,255,0)
cv2.imshow("test", img)
cv2.waitKey()
info about current python and system settings
import sys
print(sys.version)
print(sys.executable)
print(sys.path)
enumerate
for img_index, img_file_name in enumerate(img_file_names):
dictreader
f = open(f"data/file.csv", "r", encoding="utf-8-sig")
reader = csv.DictReader(f, delimiter=',')
data = [dict(d) for d in reader]
f.close()
defaultdict
from collections import defaultdict
my_dict = defaultdict(lambda: 0)  # creates a dict that defaults missing items to 0
other_dict = defaultdict(list)  # creates a dict that defaults missing items to an empty list []
tuple
from ast import literal_eval as make_tuple
(coords, text, conf) = make_tuple("[([[22, 15], [373, 15], [373, 89], [22, 89]], 'Aaldering =', 0.46745234890467213)]")
tuples!
https://www.studytonight.com/python/tuples-in-python
unpack values from csv row into variables
#!/usr/bin/env python3
import csv
with open("TESTSERVER-aktenummers-bestandsnamen.csv") as f:
    reader = csv.DictReader(f)  # "CODE","ID","WAARDE","BESTANDSNAAM"
    for row in reader:
        (code, id, aktenummer, bestandsnaam) = row.values()
        print(code, id, aktenummer, bestandsnaam)
join
' '.join(filter(None, [item["fname"], item["prefix"], item["sname"]]))  # filter(None, ..) drops empty values, so no double spaces
dictreader with custom delimiter and fieldnames
reader = csv.DictReader(f, delimiter=";", fieldnames=["fname","prefix","sname","bdate","publish","file_id","remark"])
filter
filteredList = filter(lambda item: item["publish"] == "ja", list(reader))
sort with 'itemgetter'
from operator import itemgetter
#...
sortedList = sorted(filteredList, key=itemgetter('sname'))
map
lines = map(lambda item: item["name"], sortedList)
utf-8 with BOM
print(u'\ufeff',end='') # write UTF8 BOM signature without linebreak
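alternatively, open the output file with encoding="utf-8-sig" and Python writes the BOM for you (a sketch):
with open("output.csv", "w", encoding="utf-8-sig") as f:
    f.write("kolom1;kolom2\n")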
recursive findall using xPath .//
for textline in xml.findall('.//TextLine'):
#...
BoundingBox
found here: https://techoverflow.net/2017/02/23/computing-bounding-box-for-a-list-of-coordinates-in-python/ and fixed a bug.
class BoundingBox(object):
    def __init__(self, points):
        self.minx, self.miny = float("inf"), float("inf")
        self.maxx, self.maxy = float("-inf"), float("-inf")
        for x, y in points:
            self.minx = min(x, self.minx)
            self.maxx = max(x, self.maxx)
            self.miny = min(y, self.miny)
            self.maxy = max(y, self.maxy)
    @property
    def width(self):
        return self.maxx - self.minx
    @property
    def height(self):
        return self.maxy - self.miny
    def __repr__(self):
        return "BoundingBox(minX={}, minY={}, maxX={}, maxY={})".format(
            self.minx, self.miny, self.maxx, self.maxy)
DictWriter
writer = csv.DictWriter(sys.stdout, fieldnames=["id","image", "etc..."], delimiter=',', quoting=csv.QUOTE_ALL, dialect='excel')
writer.writeheader()
#...
writer.writerow(item)
split / map / list comprehension
# input: 2737,1248 2787,1256 ...
# split by space and comma
coords = [coord.split(",") for coord in coords.split(" ")]
# output: [['2737', '1248'], ['2787', '1256'],...]
# cast to int
coords = [(int(float(a)), int(float(b))) for a,b in coords]
# output: [(2737, 1248), (2787, 1256), ...]
PageXML to CSV
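there was no snippet here; a minimal sketch assuming PAGE XML (TextLine/Coords/TextEquiv elements), reusing get_points and get_bounds from the 'parse points and get bounds' snippet above; page.xml is a placeholder filename:
import csv, sys
import xml.etree.ElementTree as ET
writer = csv.DictWriter(sys.stdout, fieldnames=["x1", "y1", "x2", "y2", "text"])
writer.writeheader()
xml = ET.parse("page.xml")
for textline in xml.findall('.//{*}TextLine'):  # {*} matches any namespace (Python 3.8+)
    points = get_points(textline.find('.//{*}Coords').get('points'))
    (x1, y1, x2, y2) = get_bounds(points)
    text = textline.find('.//{*}TextEquiv/{*}Unicode').text or ""
    writer.writerow({"x1": x1, "y1": y1, "x2": x2, "y2": y2, "text": text})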
read from multiple files supplied as arguments
for example ./script.py *.JSON
#!/usr/bin/env python3
import json,csv,sys,glob
from sys import argv
for filename in argv[1:]:
    with open(filename) as json_file:
        data = json.load(json_file)
        # ...
get text after last /
s.rpartition("/")[-1]  # e.g. "a/b/c".rpartition("/")[-1] -> 'c'
from CSV to JSON without pandas
#!/usr/bin/env python
import csv
import json
data = []
with open("INPUT.csv") as f:
    reader = csv.DictReader(f, delimiter=";")
    for row in reader:
        data.append(row)
with open("OUTPUT.json", "w") as f:
    json.dump(data, f, indent=4)
enumerate for loop
for i, col in enumerate(header):
print(i,col)
regex
import re
s = re.sub(r"[\"\n\r\\]", "", s) # strip double quotes "
s = re.sub(r"<[^>]*>", "", s); # strip pseudo html tags
join
print("; ".join(item) + " .")
Python 3 as default on Mac
See: https://opensource.com/article/19/5/python-3-default-mac
alias python=/usr/local/bin/python3
alias pip=/usr/local/bin/pip3
Guide to Python function decorators
http://thecodeship.com/patterns/guide-to-python-function-decorators/
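a minimal sketch of the pattern the guide describes (the log_call name is made up):
from functools import wraps
def log_call(func):
    @wraps(func)  # preserve the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        print(f"calling {func.__name__} with {args} {kwargs}")
        return func(*args, **kwargs)
    return wrapper
@log_call
def add(a, b):
    return a + b
print(add(2, 3))  # logs the call, then prints 5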
4 interesting bits of Python
def logUser(name, age, length):
    print('name: ', name)
    print('age: ', age)
    print('length: ', length)
# logUser('alex', 29, 1.75)
user = ('alex', 29, 1.75)
logUser(*user)  # 1: unpack a tuple into positional arguments
def logUser(name, age, length):
    print('name: ', name)
    print('age: ', age)
    print('length: ', length)
# logUser(age=28, length=1.75, name='alex')
user = {'age': 28, 'length': 1.75, 'name': 'alex'}
logUser(**user)  # 2: unpack a dict into keyword arguments
def log(*args):  # 3: accept any number of positional arguments
    print(args)
    message = ""
    for item in args:
        message += item + ' '
    print(message)
log('hallo', '2134', 'fsfdsf')
def logUser(**kwargs):  # 4: accept any number of keyword arguments
    print('name: ', kwargs['name'])
    print('age: ', kwargs['age'])
    print('length: ', kwargs['length'])
logUser(age=28, length=1.75, name='alex')
SimpleHTTPServer
python -m SimpleHTTPServer 8000  # Python 2
python3 -m http.server 8000  # Python 3 equivalent
kivy
ImportError: No module named NK.gui.app
In NinjaKittens folder:
export PYTHONPATH=.
gui
libxml / libxml2 / lxml
untested:
brew install libxml2
or
sudo port install py25-lxml
install easy_install
http://pypi.python.org/pypi/setuptools#downloads
sh setuptools-0.6c9-py2.4.egg --prefix=~
Possible solution for problems with Python on OS X Lion
- http://www.niconomicon.net/blog/2011/10/09/2120.wrestling.python.snow.leopard
- http://stackoverflow.com/questions/6886578/how-to-install-pycairo-1-10-on-mac-osx-with-default-python
version
python --version
location of python
type python
or
which python
info about executable
file /usr/local/bin/python
set python path (for macports?)
see also: [[macports]]
export PYTHONPATH=/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages