KingHacker9000 committed
Commit 5e3465d · 1 Parent(s): 186b47c

Re-commit with fonts & image under Git LFS

Files changed (44)
  1. .gitattributes +2 -0
  2. .gitignore +5 -0
  3. =0.29 +168 -0
  4. Dockerfile +38 -0
  5. LICENSE +437 -0
  6. README.md +160 -10
  7. code/__init__.py +0 -0
  8. code/bezier.py +122 -0
  9. code/config.py +104 -0
  10. code/config/base.yaml +46 -0
  11. code/data/fonts/Bell MT.ttf +3 -0
  12. code/data/fonts/DeliusUnicase-Regular.ttf +3 -0
  13. code/data/fonts/HobeauxRococeaux-Sherman.ttf +3 -0
  14. code/data/fonts/IndieFlower-Regular.ttf +3 -0
  15. code/data/fonts/JosefinSans-Light.ttf +3 -0
  16. code/data/fonts/KaushanScript-Regular.ttf +3 -0
  17. code/data/fonts/LuckiestGuy-Regular.ttf +3 -0
  18. code/data/fonts/Noteworthy-Bold.ttf +3 -0
  19. code/data/fonts/Quicksand.ttf +3 -0
  20. code/data/fonts/Saira-Regular.ttf +3 -0
  21. code/data/init/KaushanScript-Regular_B.svg +5 -0
  22. code/data/init/KaushanScript-Regular_BUNNY.svg +14 -0
  23. code/data/init/KaushanScript-Regular_BUNNY_scaled.svg +11 -0
  24. code/data/init/KaushanScript-Regular_B_scaled.svg +7 -0
  25. code/data/init/KaushanScript-Regular_N.svg +5 -0
  26. code/data/init/KaushanScript-Regular_N_scaled.svg +7 -0
  27. code/data/init/KaushanScript-Regular_U.svg +5 -0
  28. code/data/init/KaushanScript-Regular_U_scaled.svg +7 -0
  29. code/data/init/KaushanScript-Regular_Y.svg +5 -0
  30. code/data/init/KaushanScript-Regular_Y_scaled.svg +7 -0
  31. code/generate.py +26 -0
  32. code/losses.py +193 -0
  33. code/main.py +188 -0
  34. code/save_svg.py +155 -0
  35. code/ttf.py +265 -0
  36. code/utils.py +221 -0
  37. coming_soon.png +3 -0
  38. images/HobeauxRococeaux-Sherman_NATURE_T.svg +51 -0
  39. images/KaushanScript-Regular_BUNNY_Y.svg +45 -0
  40. images/teaser.png +3 -0
  41. requirements.txt +148 -0
  42. rest_api.py +22 -0
  43. run_word_as_image.sh +25 -0
  44. wai_service.py +68 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.ttf filter=lfs diff=lfs merge=lfs -text
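The two added rules are what `git lfs track` typically writes; a sketch of reproducing this part of the commit (assuming the `git-lfs` extension is installed):

```bash
git lfs install
git lfs track "*.png"   # appends the *.png filter rule above to .gitattributes
git lfs track "*.ttf"   # appends the *.ttf filter rule
git add .gitattributes
```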
.gitignore ADDED
@@ -0,0 +1,5 @@
+ TOKEN
+ WriteToken
+ diffvg/
+ output/
+ __pycache__/
=0.29 ADDED
@@ -0,0 +1,168 @@
+ Channels:
+  - conda-forge
+  - defaults
+ Platform: linux-64
+ Collecting package metadata (repodata.json): done
+ Solving environment: done
+
+ ## Package Plan ##
+
+   environment location: /home/ashish/miniconda3/envs/word
+
+   added / updated specs:
+     - diffusers
+
+
+ The following packages will be downloaded:
+
+     package                    |            build
+     ---------------------------|-----------------
+     diffusers-0.30.3           |   pyhd8ed1ab_0         711 KB  conda-forge
+     huggingface_hub-0.26.5     |   pyhd8ed1ab_0         268 KB  conda-forge
+     libabseil-20240116.2       | cxx17_he02047a_1       1.2 MB  conda-forge
+     libprotobuf-4.25.3         |     he621ea3_0         2.8 MB
+     pytorch-2.3.0              | cpu_py38h08bb5f6_1    73.3 MB
+     typing_extensions-4.12.2   |   pyha770c72_0          39 KB  conda-forge
+     ------------------------------------------------------------
+                                            Total:      78.4 MB
+
+ The following NEW packages will be INSTALLED:
+
+   huggingface_hub    conda-forge/noarch::huggingface_hub-0.26.5-pyhd8ed1ab_0
+   libabseil          conda-forge/linux-64::libabseil-20240116.2-cxx17_he02047a_1
+   libprotobuf        pkgs/main/linux-64::libprotobuf-4.25.3-he621ea3_0
+   pytorch            pkgs/main/linux-64::pytorch-2.3.0-cpu_py38h08bb5f6_1
+   typing_extensions  conda-forge/noarch::typing_extensions-4.12.2-pyha770c72_0
+
+ The following packages will be UPDATED:
+
+   diffusers          pypi/pypi::diffusers-0.8.0-pypi_0 --> conda-forge/noarch::diffusers-0.30.3-pyhd8ed1ab_0
+
+
+ Proceed ([y]/n)?
+
+ Downloading and Extracting Packages: ...working... done
+ Preparing transaction: done
+ Verifying transaction: done
+ Executing transaction: done
Dockerfile ADDED
@@ -0,0 +1,38 @@
+ # ───────────────────────────────────────────────────────────────
+ #  Word-As-Image - FastAPI GPU Space
+ # ───────────────────────────────────────────────────────────────
+ FROM python:3.10-slim
+
+ # —— OS packages needed for compiling diffvg & running FastAPI ——
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+         build-essential git cmake ffmpeg \
+         libgl1-mesa-glx libglib2.0-0 libpng-dev libgomp1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # —— Non-root user (matches HF best practice) ——
+ RUN useradd -m -u 1000 appuser
+ USER appuser
+ WORKDIR /app
+
+ # —— Keep Hugging Face & Torch caches between container restarts ——
+ # (HF_HOME gets its own ENV instruction: assignments within a single ENV
+ #  cannot reference each other, so $HF_HOME below would otherwise be empty)
+ ENV HF_HOME=/home/appuser/.cache/huggingface
+ ENV TRANSFORMERS_CACHE=$HF_HOME \
+     TORCH_HOME=$HF_HOME/torch \
+     PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
+
+ # —— Python deps ——
+ COPY --chown=appuser requirements.txt .
+ RUN pip install --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt && \
+     # diffvg must be built from source
+     pip install --no-cache-dir git+https://github.com/BachiLi/diffvg.git
+
+ # —— Project code ——
+ COPY --chown=appuser . /app
+
+ # —— Expose the FastAPI port ——
+ EXPOSE 7860
+
+ # —— Start the API (Swagger UI auto-appears at /docs) ——
+ CMD ["uvicorn", "rest_api:app", "--host", "0.0.0.0", "--port", "7860"]
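A quick usage sketch for the image above (the tag `word-as-image-api` is an arbitrary choice, not part of the commit; `--gpus all` assumes the NVIDIA container toolkit is installed):

```bash
docker build -t word-as-image-api .
docker run --gpus all -p 7860:7860 word-as-image-api
# Swagger UI is then served at http://localhost:7860/docs
```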
LICENSE ADDED
@@ -0,0 +1,437 @@
+ Attribution-NonCommercial-ShareAlike 4.0 International
+
+ =======================================================================
+
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
+ does not provide legal services or legal advice. Distribution of
+ Creative Commons public licenses does not create a lawyer-client or
+ other relationship. Creative Commons makes its licenses and related
+ information available on an "as-is" basis. Creative Commons gives no
+ warranties regarding its licenses, any material licensed under their
+ terms and conditions, or any related information. Creative Commons
+ disclaims all liability for damages resulting from their use to the
+ fullest extent possible.
+
+ Using Creative Commons Public Licenses
+
+ Creative Commons public licenses provide a standard set of terms and
+ conditions that creators and other rights holders may use to share
+ original works of authorship and other material subject to copyright
+ and certain other rights specified in the public license below. The
+ following considerations are for informational purposes only, are not
+ exhaustive, and do not form part of our licenses.
+
+      Considerations for licensors: Our public licenses are
+      intended for use by those authorized to give the public
+      permission to use material in ways otherwise restricted by
+      copyright and certain other rights. Our licenses are
+      irrevocable. Licensors should read and understand the terms
+      and conditions of the license they choose before applying it.
+      Licensors should also secure all rights necessary before
+      applying our licenses so that the public can reuse the
+      material as expected. Licensors should clearly mark any
+      material not subject to the license. This includes other CC-
+      licensed material, or material used under an exception or
+      limitation to copyright. More considerations for licensors:
+      wiki.creativecommons.org/Considerations_for_licensors
+
+      Considerations for the public: By using one of our public
+      licenses, a licensor grants the public permission to use the
+      licensed material under specified terms and conditions. If
+      the licensor's permission is not necessary for any reason--for
+      example, because of any applicable exception or limitation to
+      copyright--then that use is not regulated by the license. Our
+      licenses grant only permissions under copyright and certain
+      other rights that a licensor has authority to grant. Use of
+      the licensed material may still be restricted for other
+      reasons, including because others have copyright or other
+      rights in the material. A licensor may make special requests,
+      such as asking that all changes be marked or described.
+      Although not required by our licenses, you are encouraged to
+      respect those requests where reasonable. More considerations
+      for the public:
+      wiki.creativecommons.org/Considerations_for_licensees
+
+ =======================================================================
+
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
+ Public License
+
+ By exercising the Licensed Rights (defined below), You accept and agree
+ to be bound by the terms and conditions of this Creative Commons
+ Attribution-NonCommercial-ShareAlike 4.0 International Public License
+ ("Public License"). To the extent this Public License may be
+ interpreted as a contract, You are granted the Licensed Rights in
+ consideration of Your acceptance of these terms and conditions, and the
+ Licensor grants You such rights in consideration of benefits the
+ Licensor receives from making the Licensed Material available under
+ these terms and conditions.
+
+
+ Section 1 -- Definitions.
+
+   a. Adapted Material means material subject to Copyright and Similar
+      Rights that is derived from or based upon the Licensed Material
+      and in which the Licensed Material is translated, altered,
+      arranged, transformed, or otherwise modified in a manner requiring
+      permission under the Copyright and Similar Rights held by the
+      Licensor. For purposes of this Public License, where the Licensed
+      Material is a musical work, performance, or sound recording,
+      Adapted Material is always produced where the Licensed Material is
+      synched in timed relation with a moving image.
+
+   b. Adapter's License means the license You apply to Your Copyright
+      and Similar Rights in Your contributions to Adapted Material in
+      accordance with the terms and conditions of this Public License.
+
+   c. BY-NC-SA Compatible License means a license listed at
+      creativecommons.org/compatiblelicenses, approved by Creative
+      Commons as essentially the equivalent of this Public License.
+
+   d. Copyright and Similar Rights means copyright and/or similar rights
+      closely related to copyright including, without limitation,
+      performance, broadcast, sound recording, and Sui Generis Database
+      Rights, without regard to how the rights are labeled or
+      categorized. For purposes of this Public License, the rights
+      specified in Section 2(b)(1)-(2) are not Copyright and Similar
+      Rights.
+
+   e. Effective Technological Measures means those measures that, in the
+      absence of proper authority, may not be circumvented under laws
+      fulfilling obligations under Article 11 of the WIPO Copyright
+      Treaty adopted on December 20, 1996, and/or similar international
+      agreements.
+
+   f. Exceptions and Limitations means fair use, fair dealing, and/or
+      any other exception or limitation to Copyright and Similar Rights
+      that applies to Your use of the Licensed Material.
+
+   g. License Elements means the license attributes listed in the name
+      of a Creative Commons Public License. The License Elements of this
+      Public License are Attribution, NonCommercial, and ShareAlike.
+
+   h. Licensed Material means the artistic or literary work, database,
+      or other material to which the Licensor applied this Public
+      License.
+
+   i. Licensed Rights means the rights granted to You subject to the
+      terms and conditions of this Public License, which are limited to
+      all Copyright and Similar Rights that apply to Your use of the
+      Licensed Material and that the Licensor has authority to license.
+
+   j. Licensor means the individual(s) or entity(ies) granting rights
+      under this Public License.
+
+   k. NonCommercial means not primarily intended for or directed towards
+      commercial advantage or monetary compensation. For purposes of
+      this Public License, the exchange of the Licensed Material for
+      other material subject to Copyright and Similar Rights by digital
+      file-sharing or similar means is NonCommercial provided there is
+      no payment of monetary compensation in connection with the
+      exchange.
+
+   l. Share means to provide material to the public by any means or
+      process that requires permission under the Licensed Rights, such
+      as reproduction, public display, public performance, distribution,
+      dissemination, communication, or importation, and to make material
+      available to the public including in ways that members of the
+      public may access the material from a place and at a time
+      individually chosen by them.
+
+   m. Sui Generis Database Rights means rights other than copyright
+      resulting from Directive 96/9/EC of the European Parliament and of
+      the Council of 11 March 1996 on the legal protection of databases,
+      as amended and/or succeeded, as well as other essentially
+      equivalent rights anywhere in the world.
+
+   n. You means the individual or entity exercising the Licensed Rights
+      under this Public License. Your has a corresponding meaning.
+
+
+ Section 2 -- Scope.
+
+   a. License grant.
+
+        1. Subject to the terms and conditions of this Public License,
+           the Licensor hereby grants You a worldwide, royalty-free,
+           non-sublicensable, non-exclusive, irrevocable license to
+           exercise the Licensed Rights in the Licensed Material to:
+
+             a. reproduce and Share the Licensed Material, in whole or
+                in part, for NonCommercial purposes only; and
+
+             b. produce, reproduce, and Share Adapted Material for
+                NonCommercial purposes only.
+
+        2. Exceptions and Limitations. For the avoidance of doubt, where
+           Exceptions and Limitations apply to Your use, this Public
+           License does not apply, and You do not need to comply with
+           its terms and conditions.
+
+        3. Term. The term of this Public License is specified in Section
+           6(a).
+
+        4. Media and formats; technical modifications allowed. The
+           Licensor authorizes You to exercise the Licensed Rights in
+           all media and formats whether now known or hereafter created,
+           and to make technical modifications necessary to do so. The
+           Licensor waives and/or agrees not to assert any right or
+           authority to forbid You from making technical modifications
+           necessary to exercise the Licensed Rights, including
+           technical modifications necessary to circumvent Effective
+           Technological Measures. For purposes of this Public License,
+           simply making modifications authorized by this Section 2(a)
+           (4) never produces Adapted Material.
+
+        5. Downstream recipients.
+
+             a. Offer from the Licensor -- Licensed Material. Every
+                recipient of the Licensed Material automatically
+                receives an offer from the Licensor to exercise the
+                Licensed Rights under the terms and conditions of this
+                Public License.
+
+             b. Additional offer from the Licensor -- Adapted Material.
+                Every recipient of Adapted Material from You
+                automatically receives an offer from the Licensor to
+                exercise the Licensed Rights in the Adapted Material
+                under the conditions of the Adapter's License You apply.
+
+             c. No downstream restrictions. You may not offer or impose
+                any additional or different terms or conditions on, or
+                apply any Effective Technological Measures to, the
+                Licensed Material if doing so restricts exercise of the
+                Licensed Rights by any recipient of the Licensed
+                Material.
+
+        6. No endorsement. Nothing in this Public License constitutes or
+           may be construed as permission to assert or imply that You
+           are, or that Your use of the Licensed Material is, connected
+           with, or sponsored, endorsed, or granted official status by,
+           the Licensor or others designated to receive attribution as
+           provided in Section 3(a)(1)(A)(i).
+
+   b. Other rights.
+
+        1. Moral rights, such as the right of integrity, are not
+           licensed under this Public License, nor are publicity,
+           privacy, and/or other similar personality rights; however, to
+           the extent possible, the Licensor waives and/or agrees not to
+           assert any such rights held by the Licensor to the limited
+           extent necessary to allow You to exercise the Licensed
+           Rights, but not otherwise.
+
+        2. Patent and trademark rights are not licensed under this
+           Public License.
+
+        3. To the extent possible, the Licensor waives any right to
+           collect royalties from You for the exercise of the Licensed
+           Rights, whether directly or through a collecting society
+           under any voluntary or waivable statutory or compulsory
+           licensing scheme. In all other cases the Licensor expressly
+           reserves any right to collect such royalties, including when
+           the Licensed Material is used other than for NonCommercial
+           purposes.
+
+
+ Section 3 -- License Conditions.
+
+ Your exercise of the Licensed Rights is expressly made subject to the
+ following conditions.
+
+   a. Attribution.
+
+        1. If You Share the Licensed Material (including in modified
+           form), You must:
+
+             a. retain the following if it is supplied by the Licensor
+                with the Licensed Material:
+
+                  i. identification of the creator(s) of the Licensed
+                     Material and any others designated to receive
+                     attribution, in any reasonable manner requested by
+                     the Licensor (including by pseudonym if
+                     designated);
+
+                  ii. a copyright notice;
+
+                  iii. a notice that refers to this Public License;
+
+                  iv. a notice that refers to the disclaimer of
+                      warranties;
+
+                  v. a URI or hyperlink to the Licensed Material to the
+                     extent reasonably practicable;
+
+             b. indicate if You modified the Licensed Material and
+                retain an indication of any previous modifications; and
+
+             c. indicate the Licensed Material is licensed under this
+                Public License, and include the text of, or the URI or
+                hyperlink to, this Public License.
+
+        2. You may satisfy the conditions in Section 3(a)(1) in any
+           reasonable manner based on the medium, means, and context in
+           which You Share the Licensed Material. For example, it may be
+           reasonable to satisfy the conditions by providing a URI or
+           hyperlink to a resource that includes the required
+           information.
+        3. If requested by the Licensor, You must remove any of the
+           information required by Section 3(a)(1)(A) to the extent
+           reasonably practicable.
+
+   b. ShareAlike.
+
+      In addition to the conditions in Section 3(a), if You Share
+      Adapted Material You produce, the following conditions also apply.
+
+        1. The Adapter's License You apply must be a Creative Commons
+           license with the same License Elements, this version or
+           later, or a BY-NC-SA Compatible License.
+
+        2. You must include the text of, or the URI or hyperlink to, the
+           Adapter's License You apply. You may satisfy this condition
+           in any reasonable manner based on the medium, means, and
+           context in which You Share Adapted Material.
+
+        3. You may not offer or impose any additional or different terms
+           or conditions on, or apply any Effective Technological
+           Measures to, Adapted Material that restrict exercise of the
+           rights granted under the Adapter's License You apply.
+
+
+ Section 4 -- Sui Generis Database Rights.
+
+ Where the Licensed Rights include Sui Generis Database Rights that
+ apply to Your use of the Licensed Material:
+
+   a. for the avoidance of doubt, Section 2(a)(1) grants You the right
+      to extract, reuse, reproduce, and Share all or a substantial
+      portion of the contents of the database for NonCommercial purposes
+      only;
+
+   b. if You include all or a substantial portion of the database
+      contents in a database in which You have Sui Generis Database
+      Rights, then the database in which You have Sui Generis Database
+      Rights (but not its individual contents) is Adapted Material,
+      including for purposes of Section 3(b); and
+
+   c. You must comply with the conditions in Section 3(a) if You Share
+      all or a substantial portion of the contents of the database.
+
+ For the avoidance of doubt, this Section 4 supplements and does not
+ replace Your obligations under this Public License where the Licensed
+ Rights include other Copyright and Similar Rights.
+
+
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
+
+   a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
+      EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
+      AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
+      ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
+      IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
+      WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
+      PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
+      ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
+      KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
+      ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
+
+   b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
+      TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
+      NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
+      INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
+      COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
+      USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
+      ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
+      DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
+      IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
+
+   c. The disclaimer of warranties and limitation of liability provided
+      above shall be interpreted in a manner that, to the extent
+      possible, most closely approximates an absolute disclaimer and
+      waiver of all liability.
+
+
+ Section 6 -- Term and Termination.
+
+   a. This Public License applies for the term of the Copyright and
+      Similar Rights licensed here. However, if You fail to comply with
+      this Public License, then Your rights under this Public License
+      terminate automatically.
+
+   b. Where Your right to use the Licensed Material has terminated under
+      Section 6(a), it reinstates:
+
+        1. automatically as of the date the violation is cured, provided
+           it is cured within 30 days of Your discovery of the
+           violation; or
+
+        2. upon express reinstatement by the Licensor.
+
+      For the avoidance of doubt, this Section 6(b) does not affect any
+      right the Licensor may have to seek remedies for Your violations
+      of this Public License.
+
+   c. For the avoidance of doubt, the Licensor may also offer the
+      Licensed Material under separate terms or conditions or stop
+      distributing the Licensed Material at any time; however, doing so
+      will not terminate this Public License.
+
+   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
+      License.
+
+
+ Section 7 -- Other Terms and Conditions.
+
+   a. The Licensor shall not be bound by any additional or different
+      terms or conditions communicated by You unless expressly agreed.
+
+   b. Any arrangements, understandings, or agreements regarding the
+      Licensed Material not stated herein are separate from and
+      independent of the terms and conditions of this Public License.
+
+
+ Section 8 -- Interpretation.
+
+   a. For the avoidance of doubt, this Public License does not, and
+      shall not be interpreted to, reduce, limit, restrict, or impose
+      conditions on any use of the Licensed Material that could lawfully
+      be made without permission under this Public License.
+
+   b. To the extent possible, if any provision of this Public License is
+      deemed unenforceable, it shall be automatically reformed to the
+      minimum extent necessary to make it enforceable. If the provision
+      cannot be reformed, it shall be severed from this Public License
+      without affecting the enforceability of the remaining terms and
+      conditions.
+
+   c. No term or condition of this Public License will be waived and no
+      failure to comply consented to unless expressly agreed to by the
+      Licensor.
+
+   d. Nothing in this Public License constitutes or may be interpreted
+      as a limitation upon, or waiver of, any privileges and immunities
+      that apply to the Licensor or You, including from the legal
+      processes of any jurisdiction or authority.
+
+ =======================================================================
+
+ Creative Commons is not a party to its public
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
+ its public licenses to material it publishes and in those instances
+ will be considered the “Licensor.” The text of the Creative Commons
+ public licenses is dedicated to the public domain under the CC0 Public
+ Domain Dedication. Except for the limited purpose of indicating that
+ material is shared under a Creative Commons public license or as
+ otherwise permitted by the Creative Commons policies published at
+ creativecommons.org/policies, Creative Commons does not authorize the
+ use of the trademark "Creative Commons" or any other trademark or logo
+ of Creative Commons without its prior written consent including,
+ without limitation, in connection with any unauthorized modifications
+ to any of its public licenses or any other arrangements,
+ understandings, or agreements concerning use of licensed material. For
+ the avoidance of doubt, this paragraph does not form part of the
+ public licenses.
+
+ Creative Commons may be contacted at creativecommons.org.
README.md CHANGED
@@ -1,10 +1,160 @@
- ---
- title: Word As Image Api
- emoji: 🏃
- colorFrom: blue
- colorTo: yellow
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Word-As-Image for Semantic Typography (SIGGRAPH 2023 - Honorable Mention Award)
+
+
+ [![arXiv](https://img.shields.io/badge/📃-arXiv%20-red.svg)](https://arxiv.org/abs/2303.01818)
+ [![webpage](https://img.shields.io/badge/🌐-Website%20-blue.svg)](https://wordasimage.github.io/Word-As-Image-Page/)
+ [![Huggingface space](https://img.shields.io/badge/🤗-Demo%20-yellow.svg)](https://huggingface.co/spaces/SemanticTypography/Word-As-Image)
+ [![Youtube](https://img.shields.io/badge/📽️-Video%20-orchid.svg)](https://www.youtube.com/watch?v=9D12a6RCQaw)
+
+ <br>
+ <div align="center">
+ <img src="images/teaser.png" width="100%">
+ </div>
+ <br><br>
+ A few examples of our <b>W</b>ord-<b>A</b>s-<b>I</b>mage illustrations in various fonts and for different textual concepts. The semantically adjusted letters are created
+ completely automatically using our method, and can then be used for further creative design, as we illustrate here.<br><br>
+
+ > Shir Iluz*, Yael Vinker*, Amir Hertz, Daniel Berio, Daniel Cohen-Or, Ariel Shamir
+ > \* Denotes equal contribution
+ >
+ > A word-as-image is a semantic typography technique in which a word illustration
+ presents a visualization of the meaning of the word, while also
+ preserving its readability. We present a method to create word-as-image
+ illustrations automatically. This task is highly challenging, as it requires
+ semantic understanding of the word and a creative idea of where and how to
+ depict these semantics in a visually pleasing and legible manner. We rely on
+ the remarkable ability of recent large pretrained language-vision models to
+ distill textual concepts visually. We target simple, concise, black-and-white
+ designs that convey the semantics clearly. We deliberately do not change the
+ color or texture of the letters and do not use embellishments. Our method
+ optimizes the outline of each letter to convey the desired concept, guided by
+ a pretrained Stable Diffusion model. We incorporate additional loss terms
+ to ensure the legibility of the text and the preservation of the style of the
+ font. We show high-quality and engaging results on numerous examples
+ and compare to alternative techniques.
+
+
+ ## Description
+ Official implementation of the Word-As-Image for Semantic Typography paper.
+ <br>
+
+ ## Setup
+
+ 1. Clone the repo:
+ ```bash
+ git clone https://github.com/WordAsImage/Word-As-Image.git
+ cd Word-As-Image
+ ```
+ 2. Create a new conda environment and install the libraries:
+ ```bash
+ conda create --name word python=3.8.15
+ conda activate word
+ pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
+ conda install -y numpy scikit-image
+ conda install -y -c anaconda cmake
+ conda install -y -c conda-forge ffmpeg
+ pip install svgwrite svgpathtools cssutils numba torch-tools scikit-fmm easydict visdom freetype-py shapely
+ pip install opencv-python==4.5.4.60
+ pip install kornia==0.6.8
+ pip install wandb
+ pip install shapely
+ ```
+
+ 3. Install diffusers:
+ ```bash
+ pip install diffusers==0.8
+ pip install transformers scipy ftfy accelerate
+ ```
+ 4. Install diffvg:
+ ```bash
+ git clone https://github.com/BachiLi/diffvg.git
+ cd diffvg
+ git submodule update --init --recursive
+ python setup.py install
+ ```
+
+ 5. Paste your HuggingFace [access token](https://huggingface.co/settings/tokens) for StableDiffusion in the TOKEN file, for example:
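A one-line sketch of step 5 (the token value is a placeholder; `code/config.py` reads this plain-text `TOKEN` file from the repo root):

```bash
echo "hf_XXXXXXXXXXXXXXXX" > TOKEN   # replace with your own Hugging Face access token
```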
+ ## Run Experiments
+ ```bash
+ conda activate word
+ cd Word-As-Image
+
+ # Please modify the parameters accordingly in the file and run:
+ bash run_word_as_image.sh
+
+ # Or run:
+ python code/main.py --experiment <experiment> --semantic_concept <concept> --optimized_letter <letter> --seed <seed> --font <font_name> --use_wandb <0/1> --wandb_user <user name>
+ ```
+ * ```--semantic_concept``` : the semantic concept to insert
+ * ```--optimized_letter``` : one letter in the word to optimize
+ * ```--font``` : font name; the <font name>.ttf file should be located in code/data/fonts/
+
+ Optional arguments:
+ * ```--word``` : the text to work on, default: the semantic concept
+ * ```--config``` : path to the config file, default: code/config/base.yaml
+ * ```--experiment``` : any experiment defined in the config file, default: conformal_0.5_dist_pixel_100_kernel201
+ * ```--log_dir``` : default: output folder
+ * ```--prompt_suffix``` : default: "minimal flat 2d vector. lineal color. trending on artstation"
+
+ ### Examples
+ ```bash
+ python code/main.py --semantic_concept "BUNNY" --optimized_letter "Y" --font "KaushanScript-Regular" --seed 0
+ ```
+ <br>
+ <div align="center">
+ <img src="images/KaushanScript-Regular_BUNNY_Y.svg" width="22%">
+ </div>
+
+
+ ```bash
+ python code/main.py --semantic_concept "LEAVES" --word "NATURE" --optimized_letter "T" --font "HobeauxRococeaux-Sherman" --seed 0
+ ```
+ <br>
+ <div align="center">
+ <img src="images/HobeauxRococeaux-Sherman_NATURE_T.svg" width="25%">
+ </div>
+
+ * Note that the arguments are case-sensitive; both upper- and lowercase letters are handled, matching the case of the input word.
+
+
+ ## Tips
+ If the outcome does not meet your quality expectations, you could try the following options (a config sketch follows the list):
+
+ 1. Adjusting the weight α of the L_acap loss, which preserves the letter's structure after deformation.
+ 2. Modifying the σ parameter of the low-pass filter used in the L_tone loss, which limits the degree of deviation from the original letter.
+ 3. Changing the number of control points, as this can influence the outputs.
+ 4. Experimenting with different seeds, as each may produce slightly different results.
+ 5. Changing the font type, as this can also result in various outputs.
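As a sketch of tips 1 and 2 under the config scheme in code/config/base.yaml (shown later in this commit): the experiment name and values below are illustrative only; the keys `angeles_w` (conformal/L_acap weight) and `pixel_dist_sigma` (tone-loss σ) come from that file:

```bash
# append a hypothetical derived experiment, then run it
cat >> code/config/base.yaml <<'EOF'

my_tuned_experiment:
  parent_config: conformal_0.5_dist_pixel_100_kernel201
  loss:
    tone:
      pixel_dist_sigma: 40   # illustrative: looser tone constraint than the default 30
    conformal:
      angeles_w: 1.0         # illustrative: stronger structure preservation than 0.5
EOF
python code/main.py --experiment my_tuned_experiment --semantic_concept "BUNNY" --optimized_letter "Y" --font "KaushanScript-Regular" --seed 0
```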
+
+
+
+ ## Acknowledgement
+ Our implementation is based on the Stable Diffusion text-to-image model from Hugging Face's [Diffusers](https://github.com/huggingface/diffusers) library, combined with [Diffvg](https://github.com/BachiLi/diffvg). The framework is built on [Live](https://github.com/Picsart-AI-Research/LIVE-Layerwise-Image-Vectorization).
+
+ ## Citation
+ If you use this code for your research, please cite the following work:
+ ```
+ @article{IluzVinker2023,
+     author = {Iluz, Shir and Vinker, Yael and Hertz, Amir and Berio, Daniel and Cohen-Or, Daniel and Shamir, Ariel},
+     title = {Word-As-Image for Semantic Typography},
+     year = {2023},
+     issue_date = {August 2023},
+     publisher = {Association for Computing Machinery},
+     address = {New York, NY, USA},
+     volume = {42},
+     number = {4},
+     issn = {0730-0301},
+     url = {https://doi.org/10.1145/3592123},
+     doi = {10.1145/3592123},
+     journal = {ACM Trans. Graph.},
+     month = {jul},
+     articleno = {151},
+     numpages = {11},
+     keywords = {semantic typography, SVG, stable diffusion, fonts}
+ }
+ ```
+
+ ## Licence
+ This work is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](http://creativecommons.org/licenses/by-nc-sa/4.0/).
code/__init__.py ADDED
File without changes
code/bezier.py ADDED
@@ -0,0 +1,122 @@
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from scipy.special import binom
+ from numpy.linalg import norm
+
+ def num_bezier(n_ctrl, degree=3):
+     if type(n_ctrl) == np.ndarray:
+         n_ctrl = len(n_ctrl)
+     return int((n_ctrl - 1) / degree)
+
+ def bernstein(n, i):
+     bi = binom(n, i)
+     return lambda t, bi=bi, n=n, i=i: bi * t**i * (1 - t)**(n - i)
+
+ def bezier(P, t, d=0):
+     '''Bezier curve of degree len(P)-1. d is the derivative order (0 gives positions)'''
+     n = P.shape[0] - 1
+     if d > 0:
+         Q = np.diff(P, axis=0)*n
+         return bezier(Q, t, d-1)
+     B = np.vstack([bernstein(n, i)(t) for i, p in enumerate(P)])
+     return (P.T @ B).T
+
+ def cubic_bezier(P, t):
+     return (1.0-t)**3*P[0] + 3*(1.0-t)**2*t*P[1] + 3*(1.0-t)*t**2*P[2] + t**3*P[3]
+
+ def bezier_piecewise(Cp, subd=100, degree=3, d=0):
+     ''' sample a piecewise Bezier curve given a sequence of control points'''
+     num = num_bezier(Cp.shape[0], degree)
+     X = []
+     for i in range(num):
+         P = Cp[i*degree:i*degree+degree+1, :]
+         t = np.linspace(0, 1., subd)[:-1]
+         Y = bezier(P, t, d)
+         X += [Y]
+     X.append(Cp[-1])
+     X = np.vstack(X)
+     return X
+
+ def compute_beziers(beziers, subd=100, degree=3):
+     chain = beziers_to_chain(beziers)
+     return bezier_piecewise(chain, subd, degree)
+
+ def plot_control_polygon(Cp, degree=3, lw=0.5, linecolor=np.ones(3)*0.1):
+     n_bezier = num_bezier(len(Cp), degree)
+     for i in range(n_bezier):
+         cp = Cp[i*degree:i*degree+degree+1, :]
+         if degree == 3:
+             plt.plot(cp[0:2, 0], cp[0:2, 1], ':', color=linecolor, linewidth=lw)
+             plt.plot(cp[2:, 0], cp[2:, 1], ':', color=linecolor, linewidth=lw)
+             plt.plot(cp[:, 0], cp[:, 1], 'o', color=[0, 0.5, 1.], markersize=4)
+         else:
+             plt.plot(cp[:, 0], cp[:, 1], ':', color=linecolor, linewidth=lw)
+             plt.plot(cp[:, 0], cp[:, 1], 'o', color=[0, 0.5, 1.])
+
+
+ def chain_to_beziers(chain, degree=3):
+     ''' Convert Bezier chain to list of curve segments (4 control points each)'''
+     num = num_bezier(chain.shape[0], degree)
+     beziers = []
+     for i in range(num):
+         beziers.append(chain[i*degree:i*degree+degree+1, :])
+     return beziers
+
+
+ def beziers_to_chain(beziers):
+     ''' Convert list of Bezier curve segments to a piecewise bezier chain (shares vertices)'''
+     n = len(beziers)
+     chain = []
+     for i in range(n):
+         chain.append(list(beziers[i][:-1]))
+     chain.append([beziers[-1][-1]])
+     return np.array(sum(chain, []))
+
+
+ def split_cubic(bez, t):
+     p1, p2, p3, p4 = bez
+
+     p12 = (p2 - p1) * t + p1
+     p23 = (p3 - p2) * t + p2
+     p34 = (p4 - p3) * t + p3
+
+     p123 = (p23 - p12) * t + p12
+     p234 = (p34 - p23) * t + p23
+
+     p1234 = (p234 - p123) * t + p123
+
+     return np.array([p1, p12, p123, p1234]), np.array([p1234, p234, p34, p4])
+
+
+ def approx_arc_length(bez):
+     c0, c1, c2, c3 = bez
+     v0 = norm(c1-c0)*0.15
+     v1 = norm(-0.558983582205757*c0 + 0.325650248872424*c1 + 0.208983582205757*c2 + 0.024349751127576*c3)
+     v2 = norm(c3-c0+c2-c1)*0.26666666666666666
+     v3 = norm(-0.024349751127576*c0 - 0.208983582205757*c1 - 0.325650248872424*c2 + 0.558983582205757*c3)
+     v4 = norm(c3-c2)*.15
+     return v0 + v1 + v2 + v3 + v4
+
+
+ def subdivide_bezier(bez, thresh):
+     stack = [bez]
+     res = []
+     while stack:
+         bez = stack.pop()
+         l = approx_arc_length(bez)
+         if l < thresh:
+             res.append(bez)
+         else:
+             b1, b2 = split_cubic(bez, 0.5)
+             stack += [b2, b1]
+     return res
+
+ def subdivide_bezier_chain(C, thresh):
+     beziers = chain_to_beziers(C)
+     res = []
+     for bez in beziers:
+         res += subdivide_bezier(bez, thresh)
+     return beziers_to_chain(res)
code/config.py ADDED
@@ -0,0 +1,104 @@
+ import argparse
+ import os.path as osp
+ import yaml
+ import random
+ from easydict import EasyDict as edict
+ import numpy.random as npr
+ import torch
+ from utils import (
+     edict_2_dict,
+     check_and_create_dir,
+     update)
+ import wandb
+ import warnings
+ warnings.filterwarnings("ignore")
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--config", type=str, default="code/config/base.yaml")
+     parser.add_argument("--experiment", type=str, default="conformal_0.5_dist_pixel_100_kernel201")
+     parser.add_argument("--seed", type=int, default=0)
+     parser.add_argument('--log_dir', metavar='DIR', default="output")
+     parser.add_argument('--font', type=str, default="none", help="font name")
+     parser.add_argument('--semantic_concept', type=str, help="the semantic concept to insert")
+     parser.add_argument('--word', type=str, default="none", help="the text to work on")
+     parser.add_argument('--prompt_suffix', type=str, default="minimal flat 2d vector. lineal color."
+                                                              " trending on artstation")
+     parser.add_argument('--optimized_letter', type=str, default="none", help="the letter in the word to optimize")
+     parser.add_argument('--batch_size', type=int, default=1)
+     parser.add_argument('--use_wandb', type=int, default=0)
+     parser.add_argument('--wandb_user', type=str, default="none")
+
+     cfg = edict()
+     args = parser.parse_args()
+     with open('TOKEN', 'r') as f:
+         setattr(args, 'token', f.read().replace('\n', ''))
+     cfg.config = args.config
+     cfg.experiment = args.experiment
+     cfg.seed = args.seed
+     cfg.font = args.font
+     cfg.semantic_concept = args.semantic_concept
+     cfg.word = cfg.semantic_concept if args.word == "none" else args.word
+     if " " in cfg.word:
+         raise ValueError('no spaces are allowed')
+     cfg.caption = f"a {args.semantic_concept}. {args.prompt_suffix}"
+     cfg.log_dir = f"{args.log_dir}/{args.experiment}_{cfg.word}"
+     if args.optimized_letter in cfg.word:
+         cfg.optimized_letter = args.optimized_letter
+     else:
+         raise ValueError('letter should be in word')
+     cfg.batch_size = args.batch_size
+     cfg.token = args.token
+     cfg.use_wandb = args.use_wandb
+     cfg.wandb_user = args.wandb_user
+     cfg.letter = f"{args.font}_{args.optimized_letter}_scaled"
+     cfg.target = f"code/data/init/{cfg.letter}"
+
+     return cfg
+
+
+ def set_config():
+
+     cfg_arg = parse_args()
+     with open(cfg_arg.config, 'r') as f:
+         cfg_full = yaml.load(f, Loader=yaml.FullLoader)
+
+     # recursively traverse parent_config pointers in the config dicts
+     cfg_key = cfg_arg.experiment
+     cfgs = [cfg_arg]
+     while cfg_key:
+         cfgs.append(cfg_full[cfg_key])
+         cfg_key = cfgs[-1].get('parent_config', 'baseline')
+
+     # allowing children configs to override their parents
+     cfg = edict()
+     for options in reversed(cfgs):
+         update(cfg, options)
+     del cfgs
+
+     # set experiment dir
+     signature = f"{cfg.letter}_concept_{cfg.semantic_concept}_seed_{cfg.seed}"
+     cfg.experiment_dir = \
+         osp.join(cfg.log_dir, cfg.font, signature)
+     configfile = osp.join(cfg.experiment_dir, 'config.yaml')
+     print('Config:', cfg)
+
+     # create experiment dir and save config
+     check_and_create_dir(configfile)
+     with open(osp.join(configfile), 'w') as f:
+         yaml.dump(edict_2_dict(cfg), f)
+
+     if cfg.use_wandb:
+         wandb.init(project="Word-As-Image", entity=cfg.wandb_user,
+                    config=cfg, name=f"{signature}", id=wandb.util.generate_id())
+
+     if cfg.seed is not None:
+         random.seed(cfg.seed)
+         npr.seed(cfg.seed)
+         torch.manual_seed(cfg.seed)
+         torch.backends.cudnn.benchmark = False
+     else:
+         assert False
+
+     return cfg
code/config/base.yaml ADDED
@@ -0,0 +1,46 @@
+ baseline:
+   parent_config: ''
+   save:
+     init: true
+     image: true
+     video: true
+     video_frame_freq: 1
+   trainable:
+     point: true
+   lr_base:
+     point: 1
+   lr:
+     lr_init: 0.002
+     lr_final: 0.0008
+     lr_delay_mult: 0.1
+     lr_delay_steps: 100
+   num_iter: 500
+   render_size: 256
+   cut_size: 350
+   level_of_cc: 0  # 0 - original number of cc / 1 - recommended / 2 - more control points
+   seed: 0
+   diffusion:
+     model: "runwayml/stable-diffusion-v1-5"
+     timesteps: 1000
+     guidance_scale: 10
+   loss:
+     use_sds_loss: true
+     tone:
+       use_tone_loss: false
+     conformal:
+       use_conformal_loss: false
+
+ conformal_0.5_dist_pixel_100_kernel201:
+   parent_config: baseline
+   level_of_cc: 1
+   loss:
+     tone:
+       use_tone_loss: true
+       dist_loss_weight: 100
+       pixel_dist_kernel_blur: 201
+       pixel_dist_sigma: 30
+     conformal:
+       use_conformal_loss: true
+       angeles_w: 0.5
code/data/fonts/Bell MT.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:986a5b8bb70238e3c896e3113ef581df26204131f72d59fc12d2deef7ef89e4c
+ size 84840
code/data/fonts/DeliusUnicase-Regular.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:05e56564fea31b721f49d24e6c4d7787f89ccad060a554d97ba132bd1f0e0f58
+ size 31504
code/data/fonts/HobeauxRococeaux-Sherman.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6ce6fa2f04d1009b45231d774ca53a2d2927b0cf60520845591214023e5dc7a0
+ size 117452
code/data/fonts/IndieFlower-Regular.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e6a139213baa54b2ff51d300f98cc1c16d690b5046a4b0e42435cbf791767853
+ size 55416
code/data/fonts/JosefinSans-Light.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90e56e9991974e7aa9efc7bec0e8916df5711c02370fab5a8560d481a4ed86c9
+ size 59308
code/data/fonts/KaushanScript-Regular.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4e5d5ae35aeef6d2a1f8ba99bad6d716cab67eb9a4cbf349b670008e2c086f5
+ size 183972
code/data/fonts/LuckiestGuy-Regular.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fbe683091b0db0faa8f38e5227ad3bc17dc67f119f27ff10a63d72f5f9bb9da3
+ size 58324
code/data/fonts/Noteworthy-Bold.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f3a6aac67ae5f5e0b98cd49db21fe675f60ad8cd7771a86500d000da515d980d
+ size 248052
code/data/fonts/Quicksand.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:06927fae113c34dca0a33a2bc522da1f4cccc6dda735858090dcf48b0f280535
+ size 124196
code/data/fonts/Saira-Regular.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1216b735c5fcfeffb511750b8c3cb78ee447bda24c51d58ef50055f2f7d0dd4d
+ size 82764
code/data/init/KaushanScript-Regular_B.svg ADDED
code/data/init/KaushanScript-Regular_BUNNY.svg ADDED
code/data/init/KaushanScript-Regular_BUNNY_scaled.svg ADDED
code/data/init/KaushanScript-Regular_B_scaled.svg ADDED
code/data/init/KaushanScript-Regular_N.svg ADDED
code/data/init/KaushanScript-Regular_N_scaled.svg ADDED
code/data/init/KaushanScript-Regular_U.svg ADDED
code/data/init/KaushanScript-Regular_U_scaled.svg ADDED
code/data/init/KaushanScript-Regular_Y.svg ADDED
code/data/init/KaushanScript-Regular_Y_scaled.svg ADDED
code/generate.py ADDED
@@ -0,0 +1,26 @@
+ # code/generate.py
+ import subprocess, json, tempfile, os, pathlib, torch
+
+ REPO = pathlib.Path(__file__).resolve().parents[1]
+
+ def generate_word_image(cfg, device):
+     """
+     cfg   : plain dict or EasyDict
+     device: torch.device (ignored here; CLI handles CUDA)
+     Returns absolute path of the rendered PNG.
+     """
+     with tempfile.TemporaryDirectory() as tmp:
+         cfg_path = pathlib.Path(tmp) / "cfg.json"
+         with open(cfg_path, "w") as f:
+             json.dump(cfg, f)
+
+         # Call the original CLI exactly like your bash script
+         cmd = [
+             "python", os.fspath(REPO / "code" / "main.py"),
+             "--config", os.fspath(cfg_path),
+         ]
+         subprocess.check_call(cmd)
+
+     # main.py saves into cfg['log_dir']/…/final.png – read it back
+     out_png = next((REPO / cfg["log_dir"]).rglob("*.png"))
+     return os.fspath(out_png)
code/losses.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torchvision
3
+ from scipy.spatial import Delaunay
4
+ import torch
5
+ import numpy as np
6
+ from torch.nn import functional as nnf
7
+ from easydict import EasyDict
8
+ from shapely.geometry import Point
9
+ from shapely.geometry.polygon import Polygon
10
+
11
+ from diffusers import StableDiffusionPipeline
12
+
13
+ class SDSLoss(nn.Module):
14
+ def __init__(self, cfg, device):
15
+ super(SDSLoss, self).__init__()
16
+ self.cfg = cfg
17
+ self.device = device
18
+ self.pipe = StableDiffusionPipeline.from_pretrained(
19
+ cfg.diffusion.model,
20
+ torch_dtype=torch.float16,
21
+ token=cfg.token,
22
+ )
23
+
24
+ self.pipe.enable_xformers_memory_efficient_attention()
25
+ self.pipe.enable_attention_slicing(slice_size=1)
26
+ self.pipe.enable_vae_slicing()
27
+ self.pipe.enable_vae_tiling()
28
+ self.pipe.unet.enable_gradient_checkpointing()
29
+
30
+ alphas_cumprod = torch.tensor(self.pipe.scheduler.alphas_cumprod)
31
+ self.alphas = alphas_cumprod.to(device)
32
+ self.sigmas = torch.sqrt(1 - self.alphas)
33
+
34
+ # 1️⃣ embed text while all weights are still real tensors
35
+ self.embed_text()
36
+
37
+ # 2️⃣ NOW turn on off-loading (only UNet & VAE get meta tensors)
38
+ self.pipe.enable_model_cpu_offload()
39
+
40
+ # text-encoder is no longer needed
41
+ del self.pipe.text_encoder, self.pipe.tokenizer
42
+
43
+ def embed_text(self):
44
+ tok = self.pipe.tokenizer
45
+ txt = tok(self.cfg.caption, padding="max_length",
46
+ max_length=tok.model_max_length,
47
+ truncation=True, return_tensors="pt")
48
+ un = tok([""], padding="max_length",
49
+ max_length=tok.model_max_length,
50
+ return_tensors="pt")
51
+
52
+ with torch.no_grad():
53
+ te = self.pipe.text_encoder.eval() # still real tensors
54
+ em_txt = te(txt.input_ids ).last_hidden_state.to(torch.float16)
55
+ em_un = te(un .input_ids ).last_hidden_state.to(torch.float16)
56
+
57
+ self.text_embeddings = (
58
+ torch.cat([em_un, em_txt])
59
+ .repeat_interleave(self.cfg.batch_size, 0)
60
+ .to(self.device)
61
+ )
62
+
63
+
64
+
65
+ def forward(self, x_aug: torch.Tensor) -> torch.Tensor:
66
+ # ---------------------------------------------------- encode
67
+ x = (x_aug * 2.0 - 1.0).to(self.device, dtype=torch.float16)
68
+ with torch.cuda.amp.autocast():
69
+ latents = self.pipe.vae.encode(x).latent_dist.sample()
70
+         latents = 0.18215 * latents.to(self.device, dtype=torch.float16)
+         torch.cuda.empty_cache()
+
+         # ---------------------------------------------------- add noise
+         t = torch.randint(
+             50,
+             min(950, self.cfg.diffusion.timesteps) - 1,
+             (latents.size(0),),
+             device=self.device,
+         )
+         eps = torch.randn_like(latents)
+         z_t = self.pipe.scheduler.add_noise(latents, eps, t)
+
+         # ---------------------------------------------------- sequential CFG
+         emb_u, emb_c = self.text_embeddings.chunk(2)
+         with torch.cuda.amp.autocast():
+             eps_u = self.pipe.unet(z_t, t, encoder_hidden_states=emb_u).sample
+         torch.cuda.empty_cache()  # release ~500 MB
+
+         with torch.cuda.amp.autocast():
+             eps_c = self.pipe.unet(z_t, t, encoder_hidden_states=emb_c).sample
+
+         # UNet already ran in fp16 under autocast – avoid duplicating tensors
+         eps_t = eps_u + self.cfg.diffusion.guidance_scale * (eps_c - eps_u)
+
+         # ---------------------------------------------------- SDS grad & loss
+         alpha_t = self.alphas[t].to(self.device)
+         sigma_t = self.sigmas[t].to(self.device)
+         # detach: SDS treats the weighted noise residual as a constant, so the
+         # surrogate loss below back-propagates exactly `grad` into the latents
+         grad = (alpha_t**0.5 * sigma_t * (eps_t - eps)).nan_to_num_().detach()
+         return (grad * latents).sum(1).mean()
+
+
+ class ToneLoss(nn.Module):
+     def __init__(self, cfg):
+         super(ToneLoss, self).__init__()
+         self.dist_loss_weight = cfg.loss.tone.dist_loss_weight
+         self.im_init = None
+         self.cfg = cfg
+         self.mse_loss = nn.MSELoss()
+         self.blurrer = torchvision.transforms.GaussianBlur(
+             kernel_size=(cfg.loss.tone.pixel_dist_kernel_blur,
+                          cfg.loss.tone.pixel_dist_kernel_blur),
+             sigma=(cfg.loss.tone.pixel_dist_sigma))
+
+     def set_image_init(self, im_init):
+         self.im_init = im_init.permute(2, 0, 1).unsqueeze(0)
+         self.init_blurred = self.blurrer(self.im_init)
+
+     def get_scheduler(self, step=None):
+         if step is not None:
+             return self.dist_loss_weight * np.exp(-(1 / 5) * ((step - 300) / 20) ** 2)
+         else:
+             return self.dist_loss_weight
+
+     def forward(self, cur_raster, step=None):
+         blurred_cur = self.blurrer(cur_raster)
+         return self.mse_loss(self.init_blurred.detach(), blurred_cur) * self.get_scheduler(step)
+
+
+ class ConformalLoss:
+     def __init__(self, parameters: EasyDict, device: torch.device, target_letter: str, shape_groups):
+         self.parameters = parameters
+         self.target_letter = target_letter
+         self.shape_groups = shape_groups
+         self.faces = self.init_faces(device)
+         self.faces_roll_a = [torch.roll(self.faces[i], 1, 1) for i in range(len(self.faces))]
+
+         with torch.no_grad():
+             self.angles = []
+             self.reset()
+
+     def get_angles(self, points: torch.Tensor) -> list:
+         angles_ = []
+         for i in range(len(self.faces)):
+             triangles = points[self.faces[i]]
+             triangles_roll_a = points[self.faces_roll_a[i]]
+             edges = triangles_roll_a - triangles
+             length = edges.norm(dim=-1)
+             edges = edges / (length + 1e-1)[:, :, None]
+             edges_roll = torch.roll(edges, 1, 1)
+             cosine = torch.einsum('ned,ned->ne', edges, edges_roll)
+             angles = torch.arccos(cosine)
+             angles_.append(angles)
+         return angles_
+
+     def get_letter_inds(self, letter_to_insert):
+         for group, l in zip(self.shape_groups, self.target_letter):
+             if l == letter_to_insert:
+                 letter_inds = group.shape_ids
+                 return letter_inds[0], letter_inds[-1], len(letter_inds)
+
+     def reset(self):
+         points = torch.cat([point.clone().detach() for point in self.parameters.point]).to(self.faces[0].device)
+         self.angles = self.get_angles(points)
+
+     def init_faces(self, device: torch.device) -> list:
+         faces_ = []
+         for j, c in enumerate(self.target_letter):
+             points_np = [self.parameters.point[i].clone().detach().cpu().numpy() for i in range(len(self.parameters.point))]
+             start_ind, end_ind, shapes_per_letter = self.get_letter_inds(c)
+             print(c, start_ind, end_ind)
+             holes = []
+             if shapes_per_letter > 1:
+                 holes = points_np[start_ind + 1:end_ind]
+             poly = Polygon(points_np[start_ind], holes=holes)
+             poly = poly.buffer(0)
+             points_np = np.concatenate(points_np)
+             faces = Delaunay(points_np).simplices
+             is_intersect = np.array([poly.contains(Point(points_np[face].mean(0))) for face in faces], dtype=bool)
+             faces_.append(torch.from_numpy(faces[is_intersect]).to(device, dtype=torch.int64))
+         return faces_
+
+     def __call__(self) -> torch.Tensor:
+         loss_angles = 0
+         points = torch.cat(self.parameters.point).to(self.faces[0].device)
+         angles = self.get_angles(points)
+         for i in range(len(self.faces)):
+             loss_angles += nnf.mse_loss(angles[i], self.angles[i])
+         return loss_angles
+
+
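For reference, the `grad.detach() * latents` pattern above is the standard score-distillation surrogate: with the weighted residual treated as a constant, the surrogate loss back-propagates exactly that residual into the latents. A minimal autograd check of the trick (my own sketch, not part of this repo; the repo version adds a `sum(1).mean()` scaling on top):

```python
# Verify: d/dz [ (g.detach() * z).sum() ] == g
import torch

latents = torch.randn(2, 4, 64, 64, requires_grad=True)
grad = torch.randn_like(latents)  # stands in for alpha_t**0.5 * sigma_t * (eps_hat - eps)

loss = (grad.detach() * latents).sum()
loss.backward()

assert torch.allclose(latents.grad, grad)  # the "gradient" passes through unchanged
```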
code/main.py ADDED
@@ -0,0 +1,188 @@
+ from typing import Mapping
+ import os
+ import base64  # still here if you need it later
+ from tqdm import tqdm
+ from easydict import EasyDict as edict
+ import matplotlib.pyplot as plt
+ import torch
+ from torch.optim.lr_scheduler import LambdaLR
+ import pydiffvg
+ import save_svg
+ from losses import SDSLoss, ToneLoss, ConformalLoss
+ from config import set_config
+ from utils import (
+     check_and_create_dir,
+     get_data_augs,
+     save_image,
+     preprocess,
+     learning_rate_decay,
+     combine_word,
+     create_video,
+ )
+ import wandb
+ import warnings
+
+ warnings.filterwarnings("ignore")
+
+ pydiffvg.set_print_timing(False)
+ gamma = 1.0
+
+
+ def init_shapes(svg_path: str, trainable: Mapping[str, bool]):
+     """Load the initial SVG, mark trainable points, return shapes & params."""
+
+     svg = f"{svg_path}.svg"
+     _, _, shapes_init, shape_groups_init = pydiffvg.svg_to_scene(svg)
+
+     parameters = edict()
+
+     if trainable.point:
+         parameters.point = []
+         for path in shapes_init:
+             path.points.requires_grad = True
+             parameters.point.append(path.points)
+
+     return shapes_init, shape_groups_init, parameters
+
+
+ # -----------------------------------------------------------------------------
+ # Public entry-point that the CLI *and* FastAPI reuse
+ # -----------------------------------------------------------------------------
+
+ def generate_word_image(cfg, device: torch.device):
+     """Optimise a single word and return the path to the resulting PNG."""
+
+     # make sure we can access attributes whether `cfg` is a dict or an EasyDict
+     if isinstance(cfg, dict):
+         cfg = edict(cfg)
+
+     pydiffvg.set_use_gpu(device.type == "cuda")
+
+     print("preprocessing")
+     preprocess(cfg.font, cfg.word, cfg.optimized_letter, cfg.level_of_cc)
+
+     if cfg.loss.use_sds_loss:
+         sds_loss = SDSLoss(cfg, device)
+
+     h = w = cfg.render_size
+     data_augs = get_data_augs(cfg.cut_size)
+     render = pydiffvg.RenderFunction.apply
+
+     print("initializing shape")
+     shapes, shape_groups, parameters = init_shapes(svg_path=cfg.target, trainable=cfg.trainable)
+
+     scene_args = pydiffvg.RenderFunction.serialize_scene(w, h, shapes, shape_groups)
+     img_init = render(w, h, 2, 2, 0, None, *scene_args)
+     img_init = img_init[:, :, 3:4] * img_init[:, :, :3] + torch.ones_like(img_init[:, :, :3]) * (1 - img_init[:, :, 3:4])
+     img_init = img_init[:, :, :3]
+
+     if cfg.use_wandb:
+         plt.imshow(img_init.detach().cpu())
+         wandb.log({"init": wandb.Image(plt)}, step=0)
+         plt.close()
+
+     if cfg.loss.tone.use_tone_loss:
+         tone_loss = ToneLoss(cfg)
+         tone_loss.set_image_init(img_init)
+
+     if cfg.save.init:
+         print("saving init")
+         filename = os.path.join(cfg.experiment_dir, "svg-init", "init.svg")
+         check_and_create_dir(filename)
+         save_svg.save_svg(filename, w, h, shapes, shape_groups)
+
+     num_iter = cfg.num_iter
+     optim = torch.optim.Adam([
+         {"params": parameters["point"], "lr": cfg.lr_base["point"]}
+     ], betas=(0.9, 0.9), eps=1e-6)
+
+     if cfg.loss.conformal.use_conformal_loss:
+         conformal_loss = ConformalLoss(parameters, device, cfg.optimized_letter, shape_groups)
+
+     # LambdaLR multiplies the base lr, so normalise the schedule by lr_init
+     lr_lambda = lambda step: learning_rate_decay(
+         step,
+         cfg.lr.lr_init,
+         cfg.lr.lr_final,
+         num_iter,
+         lr_delay_steps=cfg.lr.lr_delay_steps,
+         lr_delay_mult=cfg.lr.lr_delay_mult,
+     ) / cfg.lr.lr_init
+
+     scheduler = LambdaLR(optim, lr_lambda=lr_lambda, last_epoch=-1)
+
+     print("start training")
+     for step in tqdm(range(num_iter)):
+         if cfg.use_wandb:
+             wandb.log({"learning_rate": optim.param_groups[0]["lr"]}, step=step)
+         optim.zero_grad()
+
+         scene_args = pydiffvg.RenderFunction.serialize_scene(w, h, shapes, shape_groups)
+         img = render(w, h, 2, 2, step, None, *scene_args)
+         img = img[:, :, 3:4] * img[:, :, :3] + torch.ones_like(img[:, :, :3]) * (1 - img[:, :, 3:4])
+         img = img[:, :, :3]
+
+         if cfg.save.video and (step % cfg.save.video_frame_freq == 0 or step == num_iter - 1):
+             save_image(img, os.path.join(cfg.experiment_dir, "video-png", f"iter{step:04d}.png"), gamma)
+             svg_frame = os.path.join(cfg.experiment_dir, "video-svg", f"iter{step:04d}.svg")
+             check_and_create_dir(svg_frame)
+             save_svg.save_svg(svg_frame, w, h, shapes, shape_groups)
+             if cfg.use_wandb:
+                 plt.imshow(img.detach().cpu())
+                 wandb.log({"img": wandb.Image(plt)}, step=step)
+                 plt.close()
+
+         x = img.unsqueeze(0).permute(0, 3, 1, 2).repeat(cfg.batch_size, 1, 1, 1)
+         x_aug = data_augs(x)
+
+         # `sds_loss` only exists when cfg.loss.use_sds_loss is set
+         loss = sds_loss(x_aug) if cfg.loss.use_sds_loss else torch.tensor(0.0, device=device)
+         if cfg.loss.tone.use_tone_loss:
+             loss = loss + tone_loss(x, step)
+         if cfg.loss.conformal.use_conformal_loss:
+             loss = loss + cfg.loss.conformal.angeles_w * conformal_loss()
+
+         if cfg.use_wandb:
+             wandb.log({"total_loss": loss.item()}, step=step)
+
+         loss.backward()
+         optim.step()
+         scheduler.step()
+
+     svg_out = os.path.join(cfg.experiment_dir, "output-svg", "output.svg")
+     check_and_create_dir(svg_out)
+     save_svg.save_svg(svg_out, w, h, shapes, shape_groups)
+
+     combine_word(cfg.word, cfg.optimized_letter, cfg.font, cfg.experiment_dir)
+
+     if cfg.save.image:
+         png_out = os.path.join(cfg.experiment_dir, "output-png", "output.png")
+         check_and_create_dir(png_out)
+         pydiffvg.imwrite(img.detach().cpu(), png_out, gamma=gamma)
+         if cfg.use_wandb:
+             plt.imshow(img.detach().cpu())
+             wandb.log({"img": wandb.Image(plt)}, step=num_iter)
+             plt.close()
+     else:
+         png_out = ""
+
+     if cfg.save.video:
+         print("saving video")
+         create_video(cfg.num_iter, cfg.experiment_dir, cfg.save.video_frame_freq)
+
+     if cfg.use_wandb:
+         wandb.finish()
+
+     return os.path.abspath(png_out)
+
+
+ # -----------------------------------------------------------------------------
+ # CLI entry-point – original behaviour when run directly
+ # -----------------------------------------------------------------------------
+
+ def cli_entry():
+     cfg = set_config()
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     generate_word_image(cfg, device)
+
+
+ if __name__ == "__main__":
+     cli_entry()
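The division by `cfg.lr.lr_init` in the schedule above exists because `LambdaLR` multiplies the optimizer's *base* learning rate by `lr_lambda(step)`, while `learning_rate_decay` returns an absolute rate. A small self-contained check of that contract (my own sketch, not from this repo):

```python
# LambdaLR semantics: lr(step) = base_lr * lr_lambda(step)
import torch

p = torch.nn.Parameter(torch.zeros(1))
opt = torch.optim.Adam([{"params": [p], "lr": 1.0}])
sched = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda=lambda s: 0.5 ** s)

for expected in (1.0, 0.5, 0.25):
    assert abs(opt.param_groups[0]["lr"] - expected) < 1e-12
    opt.step()     # optimizer step first, ...
    sched.step()   # ... then scheduler step, as in the training loop above
```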
code/save_svg.py ADDED
@@ -0,0 +1,155 @@
+ import torch
+ import pydiffvg
+ import xml.etree.ElementTree as etree
+ from xml.dom import minidom
+
+
+ def prettify(elem):
+     """Return a pretty-printed XML string for the Element."""
+     rough_string = etree.tostring(elem, 'utf-8')
+     reparsed = minidom.parseString(rough_string)
+     return reparsed.toprettyxml(indent="  ")
+
+
+ def save_svg(filename, width, height, shapes, shape_groups, use_gamma=False, background=None):
+     root = etree.Element('svg')
+     root.set('version', '1.1')
+     root.set('xmlns', 'http://www.w3.org/2000/svg')
+     root.set('width', str(width))
+     root.set('height', str(height))
+     if background is not None:
+         print(f"setting background to {background}")
+         root.set('style', str(background))
+     defs = etree.SubElement(root, 'defs')
+     g = etree.SubElement(root, 'g')
+     if use_gamma:
+         f = etree.SubElement(defs, 'filter')
+         f.set('id', 'gamma')
+         f.set('x', '0')
+         f.set('y', '0')
+         f.set('width', '100%')
+         f.set('height', '100%')
+         gamma = etree.SubElement(f, 'feComponentTransfer')
+         gamma.set('color-interpolation-filters', 'sRGB')
+         # identical gamma transfer function on all four channels
+         for channel in ('feFuncR', 'feFuncG', 'feFuncB', 'feFuncA'):
+             func = etree.SubElement(gamma, channel)
+             func.set('type', 'gamma')
+             func.set('amplitude', str(1))
+             func.set('exponent', str(1 / 2.2))
+         g.set('style', 'filter:url(#gamma)')
+     # Store color
+     for i, shape_group in enumerate(shape_groups):
+         def add_color(shape_color, name):
+             if isinstance(shape_color, pydiffvg.LinearGradient):
+                 lg = shape_color
+                 color = etree.SubElement(defs, 'linearGradient')
+                 color.set('id', name)
+                 color.set('x1', str(lg.begin[0].item() / width))
+                 color.set('y1', str(lg.begin[1].item() / height))
+                 color.set('x2', str(lg.end[0].item() / width))
+                 color.set('y2', str(lg.end[1].item() / height))
+                 offsets = lg.offsets.data.cpu().numpy()
+                 for j in range(offsets.shape[0]):
+                     stop = etree.SubElement(color, 'stop')
+                     stop.set('offset', str(offsets[j]))
+                     c = lg.stop_colors[j, :]
+                     stop.set('stop-color', 'rgb({}, {}, {})'.format(
+                         int(255 * c[0]), int(255 * c[1]), int(255 * c[2])))
+                     stop.set('stop-opacity', '{}'.format(c[3]))
+             if isinstance(shape_color, pydiffvg.RadialGradient):
+                 lg = shape_color
+                 color = etree.SubElement(defs, 'radialGradient')
+                 color.set('id', name)
+                 color.set('cx', str(lg.center[0].item() / width))
+                 color.set('cy', str(lg.center[1].item() / height))
+                 # this only supports width == height
+                 color.set('r', str(lg.radius[0].item() / width))
+                 offsets = lg.offsets.data.cpu().numpy()
+                 for j in range(offsets.shape[0]):
+                     stop = etree.SubElement(color, 'stop')
+                     stop.set('offset', str(offsets[j]))
+                     c = lg.stop_colors[j, :]
+                     stop.set('stop-color', 'rgb({}, {}, {})'.format(
+                         int(255 * c[0]), int(255 * c[1]), int(255 * c[2])))
+                     stop.set('stop-opacity', '{}'.format(c[3]))
+         if shape_group.fill_color is not None:
+             add_color(shape_group.fill_color, 'shape_{}_fill'.format(i))
+         if shape_group.stroke_color is not None:
+             add_color(shape_group.stroke_color, 'shape_{}_stroke'.format(i))
+
+     for i, shape_group in enumerate(shape_groups):
+         for j, id in enumerate(shape_group.shape_ids):
+             shape = shapes[id]
+             if isinstance(shape, pydiffvg.Path):
+                 if j == 0:
+                     shape_node = etree.SubElement(g, 'path')
+                     path_str = ''
+                 num_segments = shape.num_control_points.shape[0]
+                 num_control_points = shape.num_control_points.data.cpu().numpy()
+                 points = shape.points.data.cpu().numpy()
+                 num_points = shape.points.shape[0]
+                 path_str += 'M {} {}'.format(points[0, 0], points[0, 1])
+                 point_id = 1
+                 # 0, 1, 2 control points per segment map to L, Q, C commands
+                 for seg in range(num_segments):
+                     if num_control_points[seg] == 0:
+                         p = point_id % num_points
+                         path_str += ' L {} {}'.format(points[p, 0], points[p, 1])
+                         point_id += 1
+                     elif num_control_points[seg] == 1:
+                         p1 = (point_id + 1) % num_points
+                         path_str += ' Q {} {} {} {}'.format(
+                             points[point_id, 0], points[point_id, 1],
+                             points[p1, 0], points[p1, 1])
+                         point_id += 2
+                     elif num_control_points[seg] == 2:
+                         p2 = (point_id + 2) % num_points
+                         path_str += ' C {} {} {} {} {} {}'.format(
+                             points[point_id, 0], points[point_id, 1],
+                             points[point_id + 1, 0], points[point_id + 1, 1],
+                             points[p2, 0], points[p2, 1])
+                         point_id += 3
+                     else:
+                         assert False, 'unsupported number of control points'
+                 shape_node.set('stroke-width', str(0))  # no strokes
+                 if shape_group.fill_color is not None:
+                     if isinstance(shape_group.fill_color, (pydiffvg.LinearGradient, pydiffvg.RadialGradient)):
+                         shape_node.set('fill', 'url(#shape_{}_fill)'.format(i))
+                     else:
+                         c = shape_group.fill_color.data.cpu().numpy()
+                         shape_node.set('fill', 'rgb({}, {}, {})'.format(
+                             int(255 * c[0]), int(255 * c[1]), int(255 * c[2])))
+                         shape_node.set('opacity', str(c[3]))
+                 else:
+                     shape_node.set('fill', 'none')
+                 if shape_group.stroke_color is not None:
+                     if isinstance(shape_group.stroke_color, (pydiffvg.LinearGradient, pydiffvg.RadialGradient)):
+                         shape_node.set('stroke', 'url(#shape_{}_stroke)'.format(i))
+                     else:
+                         c = shape_group.stroke_color.data.cpu().numpy()
+                         shape_node.set('stroke', 'rgb({}, {}, {})'.format(
+                             int(255 * c[0]), int(255 * c[1]), int(255 * c[2])))
+                         shape_node.set('stroke-opacity', str(c[3]))
+                 shape_node.set('stroke-linecap', 'round')
+                 shape_node.set('stroke-linejoin', 'round')
+
+                 shape_node.set('d', path_str)
+
+     with open(filename, "w") as f:
+         f.write(prettify(root))
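A hedged usage sketch (my own, assuming the usual `pydiffvg.Path` / `pydiffvg.ShapeGroup` constructors rather than anything this repo pins down): serialising one closed triangle built from straight segments, which exercises the `L`-command branch above.

```python
import torch
import pydiffvg
import save_svg

tri = pydiffvg.Path(
    num_control_points=torch.zeros(3, dtype=torch.int32),  # 0 per segment => 'L' commands
    points=torch.tensor([[10.0, 10.0], [90.0, 10.0], [50.0, 90.0]]),
    is_closed=True,
)
group = pydiffvg.ShapeGroup(
    shape_ids=torch.tensor([0]),
    fill_color=torch.tensor([0.0, 0.0, 0.0, 1.0]),  # opaque black
)
save_svg.save_svg("triangle.svg", 100, 100, [tri], [group])
```

The closing segment comes from the `point_id % num_points` wrap-around, so no explicit `z` command is needed for closed paths.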
code/ttf.py ADDED
@@ -0,0 +1,265 @@
+ from importlib import reload
+ import os
+ import numpy as np
+ import bezier
+ import freetype as ft
+ import pydiffvg
+ import torch
+ import save_svg
+
+ device = torch.device("cuda" if (
+     torch.cuda.is_available() and torch.cuda.device_count() > 0) else "cpu")
+
+ reload(bezier)
+
+
+ def fix_single_svg(svg_path, all_word=False):
+     target_h_letter = 360
+     target_canvas_width, target_canvas_height = 600, 600
+
+     canvas_width, canvas_height, shapes, shape_groups = pydiffvg.svg_to_scene(svg_path)
+
+     letter_h = canvas_height
+     letter_w = canvas_width
+
+     if all_word:
+         if letter_w > letter_h:
+             scale_canvas_w = target_h_letter / letter_w
+             hsize = int(letter_h * scale_canvas_w)
+             scale_canvas_h = hsize / letter_h
+         else:
+             scale_canvas_h = target_h_letter / letter_h
+             wsize = int(letter_w * scale_canvas_h)
+             scale_canvas_w = wsize / letter_w
+     else:
+         scale_canvas_h = target_h_letter / letter_h
+         wsize = int(letter_w * scale_canvas_h)
+         scale_canvas_w = wsize / letter_w
+
+     for num, p in enumerate(shapes):
+         p.points[:, 0] = p.points[:, 0] * scale_canvas_w
+         p.points[:, 1] = p.points[:, 1] * scale_canvas_h + target_h_letter
+
+     w_min, w_max = min([torch.min(p.points[:, 0]) for p in shapes]), max([torch.max(p.points[:, 0]) for p in shapes])
+     h_min, h_max = min([torch.min(p.points[:, 1]) for p in shapes]), max([torch.max(p.points[:, 1]) for p in shapes])
+
+     for num, p in enumerate(shapes):
+         p.points[:, 0] = p.points[:, 0] + target_canvas_width / 2 - int(w_min + (w_max - w_min) / 2)
+         p.points[:, 1] = p.points[:, 1] + target_canvas_height / 2 - int(h_min + (h_max - h_min) / 2)
+
+     output_path = f"{svg_path[:-4]}_scaled.svg"
+     save_svg.save_svg(output_path, target_canvas_width, target_canvas_height, shapes, shape_groups)
+
+
+ def normalize_letter_size(dest_path, font, txt):
+     fontname = os.path.splitext(os.path.basename(font))[0]
+     for i, c in enumerate(txt):
+         fname = f"{dest_path}/{fontname}_{c}.svg"
+         fname = fname.replace(" ", "_")
+         fix_single_svg(fname)
+
+     fname = f"{dest_path}/{fontname}_{txt}.svg"
+     fname = fname.replace(" ", "_")
+     fix_single_svg(fname, all_word=True)
+
+
+ def glyph_to_cubics(face, x=0):
+     '''Convert the current font face glyph to cubic beziers.'''
+
+     def linear_to_cubic(Q):
+         a, b = Q
+         return [a + (b - a) * t for t in np.linspace(0, 1, 4)]
+
+     def quadratic_to_cubic(Q):
+         return [Q[0],
+                 Q[0] + (2 / 3) * (Q[1] - Q[0]),
+                 Q[2] + (2 / 3) * (Q[1] - Q[2]),
+                 Q[2]]
+
+     beziers = []
+     pt = lambda p: np.array([p.x + x, -p.y])  # flip here since freetype is y-up
+     last = lambda: beziers[-1][-1]
+
+     def move_to(a, beziers):
+         beziers.append([pt(a)])
+
+     def line_to(a, beziers):
+         Q = linear_to_cubic([last(), pt(a)])
+         beziers[-1] += Q[1:]
+
+     def conic_to(a, b, beziers):
+         Q = quadratic_to_cubic([last(), pt(a), pt(b)])
+         beziers[-1] += Q[1:]
+
+     def cubic_to(a, b, c, beziers):
+         beziers[-1] += [pt(a), pt(b), pt(c)]
+
+     face.glyph.outline.decompose(beziers, move_to=move_to, line_to=line_to, conic_to=conic_to, cubic_to=cubic_to)
+     beziers = [np.array(C).astype(float) for C in beziers]
+     return beziers
+
+
+ def font_string_to_beziers(font, txt, size=30, spacing=1.0, merge=True, target_control=None):
+     '''Load a font and convert the outlines for a given string to cubic bezier curves.
+     If merge is True, simply return a list of all bezier curves;
+     otherwise return a list of lists with the bezier curves for each glyph.'''
+
+     face = ft.Face(font)
+     face.set_char_size(64 * size)
+     slot = face.glyph
+
+     x = 0
+     beziers = []
+     previous = 0
+     for c in txt:
+         face.load_char(c, ft.FT_LOAD_DEFAULT | ft.FT_LOAD_NO_BITMAP)
+         bez = glyph_to_cubics(face, x)
+
+         # Subdivide the longest chains until the desired control-point count is reached
+         if target_control is not None:
+             if c in target_control.keys():
+                 nctrl = np.sum([len(C) for C in bez])
+                 while nctrl < target_control[c]:
+                     longest = np.max(
+                         sum([[bezier.approx_arc_length(b) for b in bezier.chain_to_beziers(C)] for C in bez], []))
+                     thresh = longest * 0.5
+                     bez = [bezier.subdivide_bezier_chain(C, thresh) for C in bez]
+                     nctrl = np.sum([len(C) for C in bez])
+                     print(nctrl)
+
+         if merge:
+             beziers += bez
+         else:
+             beziers.append(bez)
+
+         kerning = face.get_kerning(previous, c)
+         x += (slot.advance.x + kerning.x) * spacing
+         previous = c
+
+     return beziers
+
+
+ def bezier_chain_to_commands(C, closed=True):
+     curves = bezier.chain_to_beziers(C)
+     cmds = 'M %f %f ' % (C[0][0], C[0][1])
+     n = len(curves)
+     for i, bez in enumerate(curves):
+         if i == n - 1 and closed:
+             cmds += 'C %f %f %f %f %f %fz ' % (*bez[1], *bez[2], *bez[3])
+         else:
+             cmds += 'C %f %f %f %f %f %f ' % (*bez[1], *bez[2], *bez[3])
+     return cmds
+
+
+ def count_cp(file_name, font_name):
+     canvas_width, canvas_height, shapes, shape_groups = pydiffvg.svg_to_scene(file_name)
+     p_counter = 0
+     for path in shapes:
+         p_counter += path.points.shape[0]
+     print(f"TOTAL CP: [{p_counter}]")
+     return p_counter
+
+
+ def write_letter_svg(c, header, fontname, beziers, subdivision_thresh, dest_path):
+     cmds = ''
+     svg = header
+
+     path = '<g><path d="'
+     for C in beziers:
+         if subdivision_thresh is not None:
+             print('subd')
+             C = bezier.subdivide_bezier_chain(C, subdivision_thresh)
+         cmds += bezier_chain_to_commands(C, True)
+     path += cmds + '"/>\n'
+     svg += path + '</g></svg>\n'
+
+     fname = f"{dest_path}/{fontname}_{c}.svg"
+     fname = fname.replace(" ", "_")
+     with open(fname, 'w') as f:
+         f.write(svg)
+     return fname, path
+
+
+ def font_string_to_svgs(dest_path, font, txt, size=30, spacing=1.0, target_control=None, subdivision_thresh=None):
+     fontname = os.path.splitext(os.path.basename(font))[0]
+     glyph_beziers = font_string_to_beziers(font, txt, size, spacing, merge=False, target_control=target_control)
+     if not os.path.isdir(dest_path):
+         os.mkdir(dest_path)
+
+     # Compute bounding box
+     points = np.vstack(sum(glyph_beziers, []))
+     lt = np.min(points, axis=0)
+     rb = np.max(points, axis=0)
+     size = rb - lt
+
+     sizestr = 'width="%.1f" height="%.1f"' % (size[0], size[1])
+     boxstr = ' viewBox="%.1f %.1f %.1f %.1f"' % (lt[0], lt[1], size[0], size[1])
+     header = '''<?xml version="1.0" encoding="utf-8"?>
+ <svg xmlns="http://www.w3.org/2000/svg" xmlns:ev="http://www.w3.org/2001/xml-events" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" baseProfile="full" '''
+     header += sizestr
+     header += boxstr
+     header += '>\n<defs/>\n'
+
+     svg_all = header
+
+     for i, (c, beziers) in enumerate(zip(txt, glyph_beziers)):
+         print(f"==== {c} ====")
+         fname, path = write_letter_svg(c, header, fontname, beziers, subdivision_thresh, dest_path)
+
+         num_cp = count_cp(fname, fontname)
+         print(num_cp)
+         print(font, c)
+         # Add to global svg
+         svg_all += path + '</g>\n'
+
+     # Save global svg
+     svg_all += '</svg>\n'
+     fname = f"{dest_path}/{fontname}_{txt}.svg"
+     fname = fname.replace(" ", "_")
+     with open(fname, 'w') as f:
+         f.write(svg_all)
+
+
+ if __name__ == '__main__':
+     fonts = ["KaushanScript-Regular"]
+     level_of_cc = 1
+
+     if level_of_cc == 0:
+         target_cp = None
+     else:
+         target_cp = {"A": 120, "B": 120, "C": 100, "D": 100,
+                      "E": 120, "F": 120, "G": 120, "H": 120,
+                      "I": 35, "J": 80, "K": 100, "L": 80,
+                      "M": 100, "N": 100, "O": 100, "P": 120,
+                      "Q": 120, "R": 130, "S": 110, "T": 90,
+                      "U": 100, "V": 100, "W": 100, "X": 130,
+                      "Y": 120, "Z": 120,
+                      "a": 120, "b": 120, "c": 100, "d": 100,
+                      "e": 120, "f": 120, "g": 120, "h": 120,
+                      "i": 35, "j": 80, "k": 100, "l": 80,
+                      "m": 100, "n": 100, "o": 100, "p": 120,
+                      "q": 120, "r": 130, "s": 110, "t": 90,
+                      "u": 100, "v": 100, "w": 100, "x": 130,
+                      "y": 120, "z": 120}
+         target_cp = {k: v * level_of_cc for k, v in target_cp.items()}
+
+     for f in fonts:
+         print(f"======= {f} =======")
+         font_path = f"data/fonts/{f}.ttf"
+         output_path = "data/init"
+         txt = "BUNNY"
+         subdivision_thresh = None
+         font_string_to_svgs(output_path, font_path, txt, target_control=target_cp,
+                             subdivision_thresh=subdivision_thresh)
+         normalize_letter_size(output_path, font_path, txt)
+
+     print("DONE")
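The `quadratic_to_cubic` helper above is standard Bézier degree elevation: a quadratic with control points Q0, Q1, Q2 equals the cubic with C0 = Q0, C1 = Q0 + (2/3)(Q1 − Q0), C2 = Q2 + (2/3)(Q1 − Q2), C3 = Q2. A quick numeric check of that identity (my own sketch, not part of the repo):

```python
import numpy as np

Q = np.array([[0.0, 0.0], [1.0, 2.0], [2.0, 0.0]])          # quadratic control points
C = np.array([Q[0],
              Q[0] + (2 / 3) * (Q[1] - Q[0]),
              Q[2] + (2 / 3) * (Q[1] - Q[2]),
              Q[2]])                                          # elevated cubic

for t in np.linspace(0.0, 1.0, 11):
    quad = (1 - t) ** 2 * Q[0] + 2 * (1 - t) * t * Q[1] + t ** 2 * Q[2]
    cub = ((1 - t) ** 3 * C[0] + 3 * (1 - t) ** 2 * t * C[1]
           + 3 * (1 - t) * t ** 2 * C[2] + t ** 3 * C[3])
    assert np.allclose(quad, cub)  # both parameterisations trace the same curve
```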
code/utils.py ADDED
@@ -0,0 +1,221 @@
+ import collections.abc
+ import os
+ import os.path as osp
+ from torch import nn
+ import kornia.augmentation as K
+ import pydiffvg
+ import save_svg
+ import cv2
+ from ttf import font_string_to_svgs, normalize_letter_size
+ import torch
+ import numpy as np
+
+
+ def edict_2_dict(x):
+     if isinstance(x, dict):
+         xnew = {}
+         for k in x:
+             xnew[k] = edict_2_dict(x[k])
+         return xnew
+     elif isinstance(x, list):
+         xnew = []
+         for i in range(len(x)):
+             xnew.append(edict_2_dict(x[i]))
+         return xnew
+     else:
+         return x
+
+
+ def check_and_create_dir(path):
+     pathdir = osp.split(path)[0]
+     if not osp.isdir(pathdir):
+         os.makedirs(pathdir)
+
+
+ def update(d, u):
+     """https://stackoverflow.com/questions/3232943/update-value-of-a-nested-dictionary-of-varying-depth"""
+     for k, v in u.items():
+         if isinstance(v, collections.abc.Mapping):
+             d[k] = update(d.get(k, {}), v)
+         else:
+             d[k] = v
+     return d
+
+
+ def preprocess(font, word, letter, level_of_cc=1):
+     if level_of_cc == 0:
+         target_cp = None
+     else:
+         target_cp = {"A": 120, "B": 120, "C": 100, "D": 100,
+                      "E": 120, "F": 120, "G": 120, "H": 120,
+                      "I": 35, "J": 80, "K": 100, "L": 80,
+                      "M": 100, "N": 100, "O": 100, "P": 120,
+                      "Q": 120, "R": 130, "S": 110, "T": 90,
+                      "U": 100, "V": 100, "W": 100, "X": 130,
+                      "Y": 120, "Z": 120,
+                      "a": 120, "b": 120, "c": 100, "d": 100,
+                      "e": 120, "f": 120, "g": 120, "h": 120,
+                      "i": 35, "j": 80, "k": 100, "l": 80,
+                      "m": 100, "n": 100, "o": 100, "p": 120,
+                      "q": 120, "r": 130, "s": 110, "t": 90,
+                      "u": 100, "v": 100, "w": 100, "x": 130,
+                      "y": 120, "z": 120}
+         target_cp = {k: v * level_of_cc for k, v in target_cp.items()}
+
+     print(f"======= {font} =======")
+     font_path = f"code/data/fonts/{font}.ttf"
+     init_path = "code/data/init"
+     subdivision_thresh = None
+     font_string_to_svgs(init_path, font_path, word, target_control=target_cp,
+                         subdivision_thresh=subdivision_thresh)
+     normalize_letter_size(init_path, font_path, word)
+
+     # optimize two adjacent letters
+     if len(letter) > 1:
+         subdivision_thresh = None
+         font_string_to_svgs(init_path, font_path, letter, target_control=target_cp,
+                             subdivision_thresh=subdivision_thresh)
+         normalize_letter_size(init_path, font_path, letter)
+
+     print("Done preprocess")
+
+
+ def get_data_augs(cut_size):
+     augmentations = []
+     augmentations.append(K.RandomPerspective(distortion_scale=0.5, p=0.7))
+     augmentations.append(K.RandomCrop(size=(cut_size, cut_size), pad_if_needed=True, padding_mode='reflect', p=1.0))
+     return nn.Sequential(*augmentations)
+
+
+ '''pytorch adaptation of https://github.com/google/mipnerf'''
+ def learning_rate_decay(step,
+                         lr_init,
+                         lr_final,
+                         max_steps,
+                         lr_delay_steps=0,
+                         lr_delay_mult=1):
+     """Continuous learning rate decay function.
+     The returned rate is lr_init when step=0 and lr_final when step=max_steps, and
+     is log-linearly interpolated elsewhere (equivalent to exponential decay).
+     If lr_delay_steps > 0, the learning rate is scaled by a smooth function of
+     lr_delay_mult, such that the initial learning rate is lr_init * lr_delay_mult
+     at the beginning of optimization and eases back to the normal learning rate
+     once step > lr_delay_steps.
+     Args:
+         step: int, the current optimization step.
+         lr_init: float, the initial learning rate.
+         lr_final: float, the final learning rate.
+         max_steps: int, the number of steps during optimization.
+         lr_delay_steps: int, the number of steps to delay the full learning rate.
+         lr_delay_mult: float, the multiplier on the rate when delaying it.
+     Returns:
+         lr: the learning rate for the current step.
+     """
+     if lr_delay_steps > 0:
+         # A kind of reverse cosine decay.
+         delay_rate = lr_delay_mult + (1 - lr_delay_mult) * np.sin(
+             0.5 * np.pi * np.clip(step / lr_delay_steps, 0, 1))
+     else:
+         delay_rate = 1.
+     t = np.clip(step / max_steps, 0, 1)
+     log_lerp = np.exp(np.log(lr_init) * (1 - t) + np.log(lr_final) * t)
+     return delay_rate * log_lerp
+
+
+ def save_image(img, filename, gamma=1):
+     check_and_create_dir(filename)
+     imshow = img.detach().cpu()
+     pydiffvg.imwrite(imshow, filename, gamma=gamma)
+
+
+ def get_letter_ids(letter, word, shape_groups):
+     for group, l in zip(shape_groups, word):
+         if l == letter:
+             return group.shape_ids
+
+
+ def combine_word(word, letter, font, experiment_dir):
+     word_svg_scaled = f"./code/data/init/{font}_{word}_scaled.svg"
+     canvas_width_word, canvas_height_word, shapes_word, shape_groups_word = pydiffvg.svg_to_scene(word_svg_scaled)
+     letter_ids = []
+     for l in letter:
+         letter_ids += get_letter_ids(l, word, shape_groups_word)
+
+     w_min, w_max = min([torch.min(shapes_word[ids].points[:, 0]) for ids in letter_ids]), max(
+         [torch.max(shapes_word[ids].points[:, 0]) for ids in letter_ids])
+     h_min, h_max = min([torch.min(shapes_word[ids].points[:, 1]) for ids in letter_ids]), max(
+         [torch.max(shapes_word[ids].points[:, 1]) for ids in letter_ids])
+
+     c_w = (-w_min + w_max) / 2
+     c_h = (-h_min + h_max) / 2
+
+     svg_result = os.path.join(experiment_dir, "output-svg", "output.svg")
+     canvas_width, canvas_height, shapes, shape_groups = pydiffvg.svg_to_scene(svg_result)
+
+     out_w_min, out_w_max = min([torch.min(p.points[:, 0]) for p in shapes]), max(
+         [torch.max(p.points[:, 0]) for p in shapes])
+     out_h_min, out_h_max = min([torch.min(p.points[:, 1]) for p in shapes]), max(
+         [torch.max(p.points[:, 1]) for p in shapes])
+
+     out_c_w = (-out_w_min + out_w_max) / 2
+     out_c_h = (-out_h_min + out_h_max) / 2
+
+     scale_canvas_w = (w_max - w_min) / (out_w_max - out_w_min)
+     scale_canvas_h = (h_max - h_min) / (out_h_max - out_h_min)
+
+     if scale_canvas_h > scale_canvas_w:
+         wsize = int((out_w_max - out_w_min) * scale_canvas_h)
+         scale_canvas_w = wsize / (out_w_max - out_w_min)
+         shift_w = -out_c_w * scale_canvas_w + c_w
+     else:
+         hsize = int((out_h_max - out_h_min) * scale_canvas_w)
+         scale_canvas_h = hsize / (out_h_max - out_h_min)
+         shift_h = -out_c_h * scale_canvas_h + c_h
+
+     for num, p in enumerate(shapes):
+         p.points[:, 0] = p.points[:, 0] * scale_canvas_w
+         p.points[:, 1] = p.points[:, 1] * scale_canvas_h
+         if scale_canvas_h > scale_canvas_w:
+             p.points[:, 0] = p.points[:, 0] - out_w_min * scale_canvas_w + w_min + shift_w
+             p.points[:, 1] = p.points[:, 1] - out_h_min * scale_canvas_h + h_min
+         else:
+             p.points[:, 0] = p.points[:, 0] - out_w_min * scale_canvas_w + w_min
+             p.points[:, 1] = p.points[:, 1] - out_h_min * scale_canvas_h + h_min + shift_h
+
+     for j, s in enumerate(letter_ids):
+         shapes_word[s] = shapes[j]
+
+     save_svg.save_svg(
+         f"{experiment_dir}/{font}_{word}_{letter}.svg", canvas_width, canvas_height, shapes_word,
+         shape_groups_word)
+
+     render = pydiffvg.RenderFunction.apply
+     scene_args = pydiffvg.RenderFunction.serialize_scene(canvas_width, canvas_height, shapes_word, shape_groups_word)
+     img = render(canvas_width, canvas_height, 2, 2, 0, None, *scene_args)
+     img = img[:, :, 3:4] * img[:, :, :3] + \
+         torch.ones(img.shape[0], img.shape[1], 3, device="cuda:0") * (1 - img[:, :, 3:4])
+     img = img[:, :, :3]
+     save_image(img, f"{experiment_dir}/{font}_{word}_{letter}.png")
+
+
+ def create_video(num_iter, experiment_dir, video_frame_freq):
+     img_array = []
+     for ii in range(0, num_iter):
+         if ii % video_frame_freq == 0 or ii == num_iter - 1:
+             filename = os.path.join(
+                 experiment_dir, "video-png", f"iter{ii:04d}.png")
+             img = cv2.imread(filename)
+             img_array.append(img)
+
+     video_name = os.path.join(experiment_dir, "video.mp4")
+     check_and_create_dir(video_name)
+     out = cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc(*'mp4v'), 30.0, (600, 600))
+     for iii in range(len(img_array)):
+         out.write(img_array[iii])
+     out.release()
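`update` above is a recursive merge, unlike `dict.update`, which would replace a nested mapping wholesale; this is what lets an experiment config override individual nested keys of the base YAML. A tiny illustration (hypothetical values, my own sketch):

```python
from utils import update  # assumes code/ is on sys.path

base = {"lr": {"lr_init": 0.002, "lr_final": 0.0008}, "seed": 0}
override = {"lr": {"lr_final": 0.001}}  # touch one nested key only

merged = update(base, override)
assert merged == {"lr": {"lr_init": 0.002, "lr_final": 0.001}, "seed": 0}
```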
coming_soon.png ADDED

Git LFS Details

  • SHA256: cccc22b97840a5007871339b4061934c6554f419f4366a60e22b009cc72b0bde
  • Pointer size: 131 Bytes
  • Size of remote file: 127 kB
images/HobeauxRococeaux-Sherman_NATURE_T.svg ADDED
images/KaushanScript-Regular_BUNNY_Y.svg ADDED
images/teaser.png ADDED

Git LFS Details

  • SHA256: 3e40d0508fb77a4c2a47cb8aa38c07b10eff75480af239a50eecba9479c1bcdc
  • Pointer size: 130 Bytes
  • Size of remote file: 69.7 kB
requirements.txt ADDED
@@ -0,0 +1,148 @@
+ accelerate==1.0.1
+ annotated-types==0.7.0
+ anyio==4.5.2
+ Brotli @ file:///croot/brotli-split_1714483155106/work
+ certifi==2025.6.15
+ charset-normalizer==3.4.2
+ click @ file:///croot/click_1698129812380/work
+ cloudpickle @ file:///croot/cloudpickle_1721657346512/work
+ coloredlogs==15.0.1
+ contourpy==1.1.1
+ cssutils==2.11.1
+ cycler==0.12.1
+ cytoolz @ file:///croot/cytoolz_1701723583781/work
+ dask @ file:///croot/dask-core_1683065217061/work
+ diffusers==0.29.2
+ diffvg==0.0.1
+ dnspython==2.6.1
+ easydict==1.13
+ email_validator==2.2.0
+ eval_type_backport==0.2.2
+ exceptiongroup==1.3.0
+ fastapi==0.115.13
+ fastapi-cli==0.0.7
+ filelock==3.16.1
+ fonttools==4.57.0
+ freetype-py==2.5.1
+ fsspec==2025.3.0
+ ftfy==6.2.3
+ git-lfs==1.6
+ gitdb==4.0.12
+ GitPython==3.1.44
+ greenlet==3.1.1
+ h11==0.16.0
+ hf-xet==1.1.5
+ httpcore==1.0.9
+ httptools==0.6.4
+ httpx==0.28.1
+ huggingface_hub @ file:///home/conda/feedstock_root/build_artifacts/huggingface_hub_1733636895686/work
+ humanfriendly==10.0
+ idna==3.10
+ imagecodecs @ file:///croot/imagecodecs_1695064943445/work
+ imageio @ file:///croot/imageio_1707247282708/work
+ imageio-ffmpeg==0.5.1
+ importlib_metadata==8.5.0
+ importlib_resources==6.4.5
+ itsdangerous==2.2.0
+ Jinja2==3.1.6
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ kiwisolver==1.4.7
+ kornia==0.6.8
+ llvmlite==0.41.1
+ locket @ file:///opt/conda/conda-bld/locket_1652903118915/work
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.7.5
+ mdurl==0.1.2
+ mkl-fft @ file:///croot/mkl_fft_1695058164594/work
+ mkl-random @ file:///croot/mkl_random_1695059800811/work
+ mkl-service==2.4.0
+ more-itertools==10.5.0
+ mpmath==1.3.0
+ networkx==3.1
+ numba==0.58.1
+ numpy==1.24.4
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.20.5
+ nvidia-nvjitlink-cu12==12.9.86
+ nvidia-nvtx-cu12==12.1.105
+ opencv-python==4.5.4.60
+ orjson==3.10.15
+ packaging==25.0
+ pandas==2.0.3
+ partd @ file:///croot/partd_1698702562572/work
+ pillow==10.4.0
+ platformdirs @ file:///croot/platformdirs_1692205439124/work
+ pooch @ file:///croot/pooch_1695850093751/work
+ protobuf==5.29.5
+ psutil==7.0.0
+ pyaml==25.5.0
+ pydantic==2.10.6
+ pydantic-extra-types==2.10.5
+ pydantic-settings==2.8.1
+ pydantic_core==2.27.2
+ Pygments==2.19.2
+ pyparsing==3.1.4
+ PySocks @ file:///tmp/build/80754af9/pysocks_1605305779399/work
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.0.1
+ python-multipart==0.0.20
+ pytz==2025.2
+ PyWavelets @ file:///croot/pywavelets_1670425177960/work
+ PyYAML==6.0.2
+ regex==2024.11.6
+ requests==2.32.4
+ rich==14.0.0
+ rich-toolkit==0.14.7
+ safetensors==0.5.3
+ scikit-fmm==2024.5.29
+ scikit-image @ file:///croot/scikit-image_1669241743693/work
+ scipy==1.10.1
+ seaborn==0.13.2
+ sentry-sdk==2.31.0
+ setproctitle==1.3.6
+ shapely==2.0.7
+ shellingham==1.5.4
+ six==1.17.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ SQLAlchemy==2.0.41
+ starlette==0.44.0
+ svgpathtools==1.7.1
+ svgwrite==1.4.3
+ sympy==1.13.3
+ tifffile @ file:///croot/tifffile_1695107451082/work
+ tokenizers==0.20.3
+ toolz @ file:///croot/toolz_1667464077321/work
+ torch==2.4.1
+ torch-tools==0.1.5
+ torchaudio==2.4.1+cu121
+ torchvision==0.19.1+cu121
+ tornado==6.4.2
+ tqdm==4.67.1
+ transformers==4.46.3
+ triton==3.0.0
+ typer==0.16.0
+ typing_extensions==4.13.2
+ tzdata==2025.2
+ ujson==5.10.0
+ urllib3==2.2.3
+ uvicorn==0.33.0
+ uvloop==0.21.0
+ visdom==0.2.4
+ wandb==0.20.1
+ watchfiles==0.24.0
+ wcwidth==0.2.13
+ websocket-client==1.8.0
+ websockets==13.1
+ xformers==0.0.28.post1
+ zipp==3.20.2
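Note that the `+cu121` builds of torch/torchvision/torchaudio are not on PyPI; installing this file as-is generally needs PyTorch's CUDA 12.1 wheel index (e.g. `pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu121`), and `diffvg==0.0.1` appears to reflect a local source build of diffvg rather than a PyPI package.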
rest_api.py ADDED
@@ -0,0 +1,22 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel, ConfigDict
+ import wai_service
+
+ app = FastAPI()
+
+
+ class InferenceRequest(BaseModel):
+     # pydantic v2 style (pydantic==2.10.6 is pinned in requirements.txt);
+     # extra="allow" lets clients pass arbitrary config overrides through
+     model_config = ConfigDict(extra="allow")
+
+     word: str
+     optimized_letter: str
+     font: str = "KaushanScript-Regular"
+     seed: int = 0
+
+
+ @app.post("/generate")
+ def generate(req: InferenceRequest):
+     try:
+         # model_dump() replaces the deprecated .dict() under pydantic v2
+         img_b64 = wai_service.handler(req.model_dump())
+         return {"image_base64": img_b64}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
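A hedged client sketch (my own, assuming the API is served locally, e.g. via `uvicorn rest_api:app --host 0.0.0.0 --port 8000`):

```python
import base64
import requests

resp = requests.post(
    "http://localhost:8000/generate",
    json={"word": "BUNNY", "optimized_letter": "Y", "seed": 0},
    timeout=3600,  # the optimisation can take many minutes, even on GPU
)
resp.raise_for_status()
with open("bunny.png", "wb") as f:
    f.write(base64.b64decode(resp.json()["image_base64"]))
```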
run_word_as_image.sh ADDED
@@ -0,0 +1,25 @@
+ #!/bin/bash
+
+ set -e
+
+ USE_WANDB=0 # CHANGE IF YOU WANT WANDB
+ WANDB_USER="none"
+
+ EXPERIMENT=conformal_0.5_dist_pixel_100_kernel201
+
+ CONCEPT=BUNNY
+ WORD=BUNNY
+ fonts=(KaushanScript-Regular)
+ for j in "${fonts[@]}"
+ do
+     letter_=("Y")
+     SEED=0
+     for i in "${letter_[@]}"
+     do
+         echo "$i"
+         font_name=$j
+         ARGS="--experiment $EXPERIMENT --optimized_letter ${i} --seed $SEED --font ${font_name} --use_wandb ${USE_WANDB} --wandb_user ${WANDB_USER}"
+         CUDA_VISIBLE_DEVICES=0 python code/main.py $ARGS --semantic_concept "${CONCEPT}" --word "${WORD}"
+     done
+ done
+
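The script is meant to be invoked from the repository root, e.g. `bash run_word_as_image.sh`; it loops over the `fonts` and `letter_` arrays, so adding entries there fans the optimisation out over more font/letter combinations.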
wai_service.py ADDED
@@ -0,0 +1,68 @@
+ # --- wai_service.py (final handler) ---------------------------------
+ import base64, sys, os, torch
+ sys.path.append(os.path.join(os.path.dirname(__file__), "code"))
+
+ from code.config import set_config
+ from code.main import generate_word_image
+ from easydict import EasyDict
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # flags that *are* recognised by code/config.parse_args()
+ KNOWN_CLI_KEYS = {
+     "word",
+     "optimized_letter",
+     "font",
+     "seed",
+     "experiment",
+     "use_wandb",
+     "wandb_user",
+ }
+
+
+ def _sanitize(cfg):
+     """
+     Recursively walk an EasyDict / dict and replace every Ellipsis (`...`)
+     with None. Returns the same object (in-place).
+     """
+     if isinstance(cfg, dict):
+         for k, v in cfg.items():
+             if v is Ellipsis:
+                 cfg[k] = None
+             else:
+                 _sanitize(v)
+     return cfg
+
+
+ def handler(payload: dict) -> str:
+     # 1️⃣ Build fake argv *only* from recognised keys
+     cli_argv = [sys.argv[0]]
+     for k in KNOWN_CLI_KEYS & payload.keys():
+         cli_argv += [f"--{k}", str(payload[k])]
+
+     orig_argv = sys.argv[:]
+     try:
+         sys.argv = cli_argv
+         cfg = set_config()  # EasyDict with YAML + CLI-compatible fields
+     finally:
+         sys.argv = orig_argv
+
+     _sanitize(cfg)
+
+     # 2️⃣ Overlay ALL payload keys (new ones like render_size stick)
+     for k, v in payload.items():
+         setattr(cfg, k, v)
+
+     # sensible defaults
+     cfg.render_size = getattr(cfg, "render_size", 384)
+     cfg.word = cfg.word.upper()
+     cfg.optimized_letter = getattr(cfg, "optimized_letter", cfg.word[-1])
+     if getattr(cfg.diffusion, "model", ...) is Ellipsis:
+         cfg.diffusion.model = "runwayml/stable-diffusion-v1-5"
+
+     # 3️⃣ Run optimisation
+     out_path = generate_word_image(cfg, device)
+
+     # 4️⃣ Return base-64
+     with open(out_path, "rb") as f:
+         return base64.b64encode(f.read()).decode()
+ # --------------------------------------------------------------------
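A hedged usage sketch for calling the handler directly, without the HTTP layer (my own; assumes a CUDA-capable machine with the repo's dependencies installed and the default KaushanScript-Regular font under code/data/fonts):

```python
import base64
import wai_service

if __name__ == "__main__":
    img_b64 = wai_service.handler({"word": "BUNNY", "optimized_letter": "Y", "seed": 0})
    with open("bunny.png", "wb") as f:
        f.write(base64.b64decode(img_b64))
```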